# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Tests for ParquetFormatter.
24Tests in this module are disabled unless pandas and pyarrow are importable.
25"""

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None
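
# Each optional dependency collapses to a module-level sentinel (None when
# the import fails), which the @unittest.skipUnless decorators below use to
# gate entire test classes on availability.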

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable():
    """Make a simple numpy table with random data.

    Returns
    -------
    numpyTable : `numpy.ndarray`
    """
    nrow = 5
    data = np.zeros(
        nrow,
        dtype=[
            ("index", "i4"),
            ("a", "f8"),
            ("b", "f8"),
            ("c", "f8"),
            ("ddd", "f8"),
            ("strcol", "U10"),
            ("bytecol", "a10"),
        ],
    )
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    return data
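
# Note: the fixed-width string ("U10") and bytes ("a10") columns are the
# interesting ones; several tests below check how they survive (or get
# mangled by) arrow and pandas round trips.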


def _makeSingleIndexDataFrame():
    """Make a single index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")
    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns
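
# For the table above, allColumns works out to the data columns plus the
# index name, i.e. Index(['a', 'b', 'c', 'ddd', 'strcol', 'bytecol', 'index']).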


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df


def _makeSimpleAstropyTable():
    """Make an astropy table for testing.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable()
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter
    return table


def _makeSimpleArrowTable():
    """Make an arrow table for testing.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable()
    return numpy_to_arrow(data)
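
# Together the helpers above provide the same logical table in four in-memory
# forms (numpy, pandas, astropy, arrow); the test cases below write each form
# through ParquetFormatter and read it back as every other form.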


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck, it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck, it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.
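
# The in-memory variants below inherit the formatter test cases wholesale,
# swapping in butler-inmemory.yaml so the same tests run against an
# InMemoryDatastore; tests that only make sense for files on disk are
# overridden as no-ops.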


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        delegate = DataFrameDelegate("DataFrame")

        # Read the whole DataFrame.
        df2 = delegate.handleParameters(inMemoryDataset=df1)
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = delegate.getComponent(composite=df1, componentName="columns")
        self.assertTrue(df1.columns.equals(columns2))

        # Read just some columns a few different ways.
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(inMemoryDataset=df1, parameters={"columns": {"filter": "g"}})
        self.assertIn("only supports string column names", str(cm.exception))
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(
                inMemoryDataset=df1, parameters={"columns": {"filter": ["r"], "column": "a"}}
            )
        self.assertIn("only supports string column names", str(cm.exception))

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        """Test writing an astropy table to parquet via astropy (without
        additional metadata) and ensure that we can read it back with all
        the new functionality.
        """
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
        table2 : `astropy.table.Table`
        skip_units : `bool`
        """
        self.assertEqual(table1.dtype, table2.dtype)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable()
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable()

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)
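
        # For reference, the chain exercised above is:
        #   numpy structured array -> pa.Table -> dict of numpy arrays -> pa.Table
        # Equality of the two pa.Table objects confirms that both the data and
        # the arrow schema survive the intermediate dict form.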

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values

        Parameters
        ----------
        table1 : `numpy.ndarray`
        table2 : `numpy.ndarray`
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable()

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = [(name, pa.from_numpy_dtype(data.dtype[name].type)) for name in data.dtype.names]

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)
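
        # Note: pa.Table.from_arrays needs the explicit schema above; the
        # zero-length python lists carry no type information of their own,
        # so the columns would otherwise come out null-typed.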

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check if two astropy tables have the same columns/values

        Parameters
        ----------
        table1 : `astropy.table.Table`
        table2 : `astropy.table.Table`
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values

        Parameters
        ----------
        table1 : `numpy.ndarray`
        table2 : `numpy.ndarray`
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")


if __name__ == "__main__":
    unittest.main()