# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Tests for ParquetFormatter.
24Tests in this module are disabled unless pandas and pyarrow are importable.
25"""

import os
import unittest
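
# Optional dependencies: each import is guarded so that the tests that
# need the package can be skipped cleanly when it is not installed.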
try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable():
    """Make a simple numpy table with random data.

    Returns
    -------
    numpyTable : `numpy.ndarray`
    """
    nrow = 5
    data = np.zeros(
        nrow,
        dtype=[
            ("index", "i4"),
            ("a", "f8"),
            ("b", "f8"),
            ("c", "f8"),
            ("ddd", "f8"),
            ("strcol", "U10"),
            ("bytecol", "a10"),
        ],
    )
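    # Fill the table: a sequential index, random floats, and fixed strings
    # (stored as both unicode and bytes).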
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    return data


def _makeSingleIndexDataFrame():
    """Make a single index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")
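    # Record every column, including the index column, for later comparison
    # with what butler reads back.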
    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df


def _makeSimpleAstropyTable():
    """Make an astropy table for testing.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable()
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter
    return table


def _makeSimpleArrowTable():
    """Make an arrow table for testing.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable()
    return numpy_to_arrow(data)


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
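        # Asking for the same column twice should return it only once.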
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
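        # A list of tuples selects individual (filter, column) pairs.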
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
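        # Write with pandas directly so that the file carries none of the
        # extra metadata that ParquetFormatter normally adds.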
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)
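        # Ingest the externally written file so butler serves it like any
        # other dataset.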
        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
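
        # Convert back to pandas, restoring the index column, so the result
        # can be compared with the original dataframe.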
        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but this usage is
        # definitely not recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
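
        # Rebuild the dataframe from the numpy records, restoring the index
        # column, before comparing against the original.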
        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but this usage is
        # definitely not recommended.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        delegate = DataFrameDelegate("DataFrame")

        # Read the whole DataFrame.
        df2 = delegate.handleParameters(inMemoryDataset=df1)
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = delegate.getComponent(composite=df1, componentName="columns")
        self.assertTrue(df1.columns.equals(columns2))

        # Read just some columns a few different ways.
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(inMemoryDataset=df1, parameters={"columns": {"filter": "g"}})
        self.assertIn("only supports string column names", str(cm.exception))
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(
                inMemoryDataset=df1, parameters={"columns": {"filter": ["r"], "column": "a"}}
            )
        self.assertIn("only supports string column names", str(cm.exception))

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        """Test writing an astropy table to parquet directly (without
        additional metadata) and ensure that we can read it back with all
        the new functionality.
        """
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            If set, do not compare units, descriptions, or formats.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable()
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable()

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable()

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
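        # Derive an arrow schema from the simple table's numpy dtypes, then
        # build a zero-row table that uses it.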
        type_list = [(name, pa.from_numpy_dtype(data.dtype[name].type)) for name in data.dtype.names]

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")


if __name__ == "__main__":
    unittest.main()