Coverage for tests/test_parquet.py: 27%
109 statements
« prev ^ index » next coverage.py v6.4.4, created at 2022-08-31 10:07 +0000
« prev ^ index » next coverage.py v6.4.4, created at 2022-08-31 10:07 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""
import os
import unittest

try:
    import numpy as np
    import pandas as pd
except ImportError:
    # pandas (and numpy) are optional test dependencies; the test classes
    # below are skipped when pd is None.
    pd = None

try:
    import pyarrow.parquet
except ImportError:
    # pyarrow is optional; ParquetFormatter tests are skipped without it.
    pyarrow = None

from lsst.daf.butler import Butler, Config, DatasetType, StorageClassConfig, StorageClassFactory
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir
# Absolute path of the directory holding this test file; used to locate
# the butler config files under config/basic/.
TESTDIR = os.path.abspath(os.path.dirname(__file__))
48def _makeSingleIndexDataFrame():
49 """Make a single index data frame for testing.
51 Returns
52 -------
53 dataFrame : `~pandas.DataFrame`
54 The test dataframe.
55 allColumns : `list` [`str`]
56 List of all the columns (including index columns).
57 """
58 nrow = 5
59 data = np.zeros(nrow, dtype=[("index", "i4"), ("a", "f8"), ("b", "f8"), ("c", "f8"), ("ddd", "f8")])
60 data["index"][:] = np.arange(nrow)
61 data["a"] = np.random.randn(nrow)
62 data["b"] = np.random.randn(nrow)
63 data["c"] = np.random.randn(nrow)
64 data["ddd"] = np.random.randn(nrow)
65 df = pd.DataFrame(data)
66 df = df.set_index("index")
67 allColumns = df.columns.append(pd.Index(df.index.names))
69 return df, allColumns
72def _makeMultiIndexDataFrame():
73 """Make a multi-index data frame for testing.
75 Returns
76 -------
77 dataFrame : `~pandas.DataFrame`
78 The test dataframe.
79 """
80 columns = pd.MultiIndex.from_tuples(
81 [
82 ("g", "a"),
83 ("g", "b"),
84 ("g", "c"),
85 ("r", "a"),
86 ("r", "b"),
87 ("r", "c"),
88 ],
89 names=["filter", "column"],
90 )
91 df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)
93 return df
@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatter without pandas.")
@unittest.skipUnless(pyarrow is not None, "Cannot test ParquetFormatter without pyarrow.")
class ParquetFormatterTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        butlerConfig = Config(self.configFile)
        repoConfig = Butler.makeRepo(self.root, config=butlerConfig)
        self.butler = Butler(repoConfig, writeable=True, run="test_run")
        # A dimensionless dataset type means no dimension records or
        # non-trivial data IDs are needed.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        """Remove the per-test butler root."""
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        """Round-trip a single-index DataFrame and read column subsets."""
        frame, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(frame, self.datasetType, dataId={})
        # Read the whole DataFrame back and compare.
        roundTripped = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(frame.equals(roundTripped))
        # The "columns" component reports every column, index included.
        readColumns = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(readColumns))
        # Column subsets can be requested in several equivalent ways:
        # a list, a bare string, a list naming the index, a single column.
        for requested, expected in (
            (["a", "c"], ["a", "c"]),
            ("a", ["a"]),
            (["index", "a"], ["a"]),
            ("ddd", ["ddd"]),
        ):
            subset = self.butler.get(self.datasetType, dataId={}, parameters={"columns": requested})
            self.assertTrue(frame.loc[:, expected].equals(subset))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testMultiIndexDataFrame(self):
        """Round-trip a multi-index DataFrame and read column subsets."""
        frame = _makeMultiIndexDataFrame()

        self.butler.put(frame, self.datasetType, dataId={})
        # Read the whole DataFrame back and compare.
        roundTripped = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(frame.equals(roundTripped))
        # The "columns" component should match the MultiIndex exactly.
        readColumns = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(frame.columns.equals(readColumns))
        # Select by the filter level only.
        byFilter = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(frame.loc[:, ["g"]].equals(byFilter))
        # Select by both levels at once.
        byBoth = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(frame.loc[:, [("r", "a")]].equals(byBoth))
        # Select by explicit (filter, column) tuples.
        wanted = [("g", "a"), ("r", "c")]
        byTuples = self.butler.get(self.datasetType, dataId={}, parameters={"columns": wanted})
        self.assertTrue(frame.loc[:, wanted].equals(byTuples))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})
@unittest.skipUnless(pd is not None, "Cannot test parquet InMemoryDatastore without pandas.")
class InMemoryParquetFormatterTestCase(ParquetFormatterTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate"""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testMultiIndexDataFrame(self):
        """Exercise the delegate directly with a multi-index DataFrame."""
        frame = _makeMultiIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        # With no parameters the delegate hands back the full DataFrame.
        handled = delegate.handleParameters(inMemoryDataset=frame)
        self.assertTrue(frame.equals(handled))
        # The "columns" component mirrors the DataFrame's columns.
        component = delegate.getComponent(composite=frame, componentName="columns")
        self.assertTrue(frame.columns.equals(component))

        # Multi-level column selections are not supported in memory;
        # both dict forms should raise with an explanatory message.
        for params in (
            {"columns": {"filter": "g"}},
            {"columns": {"filter": ["r"], "column": "a"}},
        ):
            with self.assertRaises(NotImplementedError) as cm:
                delegate.handleParameters(inMemoryDataset=frame, parameters=params)
            self.assertIn("only supports string column names", str(cm.exception))

    def testBadInput(self):
        """A non-DataFrame in-memory dataset should be rejected."""
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

    def testStorageClass(self):
        """Storage-class lookup by python type resolves to DataFrame."""
        frame, _ = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        # Lookup should succeed whether or not type comparison is used.
        for compareTypes in (False, True):
            storageClass = factory.findStorageClass(type(frame), compare_types=compareTypes)
            # Force the name lookup to do name matching.
            storageClass._pytype = None
            self.assertEqual(storageClass.name, "DataFrame")
# Allow the tests to be run directly with ``python test_parquet.py``.
if __name__ == "__main__":
    unittest.main()