# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""
import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    _append_numpy_multidim_metadata,
    _numpy_dtype_to_arrow_types,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data
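
# A minimal usage sketch of the helper above (illustrative only; the values
# are random, so only the structure is predictable):
#
#     data = _makeSimpleNumpyTable(include_multidim=True)
#     assert len(data) == 5
#     assert data["d2"].shape == (5, 5, 10)
#     assert data["strcol"][0] == "teststring"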


def _makeSingleIndexDataFrame(include_masked=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns
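
# Illustrative sketch of the helper above: the "index" column becomes the
# dataframe index, and allColumns carries the index name as well so that it
# can be compared against the "columns" component read back from the butler.
#
#     df, allColumns = _makeSingleIndexDataFrame()
#     assert df.index.name == "index"
#     assert "index" in allColumns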


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df
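
# Illustrative sketch: the two-level ("filter", "column") MultiIndex supports
# level-based selection, which the tests below exercise through the "columns"
# read parameter.
#
#     df = _makeMultiIndexDataFrame()
#     assert df.columns.names == ["filter", "column"]
#     assert df.loc[:, ["g"]].shape == (5, 3)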


def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table
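
# Illustrative sketch: with include_masked=True the table gains MaskedColumn
# entries whose second row is masked.
#
#     table = _makeSimpleAstropyTable(include_masked=True)
#     assert isinstance(table["m1"], atable.MaskedColumn)
#     assert bool(table["m1"].mask[1])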


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))
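
    # Note on the value-by-value check above: depending on which
    # representation pandas picks for a masked column (nullable integer,
    # float with NaN, or object), Series.equals can report False even when
    # the unmasked values agree, so the test compares the null masks first
    # and then the individual unmasked elements.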

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck, it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)
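
    # Note on the loop above: Arrow stores strings and bytes as
    # variable-length types, so the fixed-width string information from the
    # original dataframe is not preserved in the schema component that is
    # read back, and the string/binary fields are skipped in the comparison.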

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck, it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        delegate = DataFrameDelegate("DataFrame")

        # Read the whole DataFrame.
        df2 = delegate.handleParameters(inMemoryDataset=df1)
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = delegate.getComponent(composite=df1, componentName="columns")
        self.assertTrue(df1.columns.equals(columns2))

        # Read just some columns a few different ways.
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(inMemoryDataset=df1, parameters={"columns": {"filter": "g"}})
        self.assertIn("only supports string column names", str(cm.exception))
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(
                inMemoryDataset=df1, parameters={"columns": {"filter": ["r"], "column": "a"}}
            )
        self.assertIn("only supports string column names", str(cm.exception))

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # This astropy <-> arrow works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip the comparison of units/descriptions/formats.
        has_bigendian : `bool`
            One of the tables includes big-endian columns.
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the types match, normalizing both
                # to big-endian byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            One of the tables includes big-endian columns.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the types match, normalizing both
                # to big-endian byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
1184 """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""
1186 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1188 def testBadInput(self):
1189 tab1 = _makeSimpleNumpyTable()
1190 delegate = ArrowNumpyDelegate("ArrowNumpy")
1192 with self.assertRaises(ValueError):
1193 delegate.handleParameters(inMemoryDataset="not_a_numpy_table")
1195 with self.assertRaises(NotImplementedError):
1196 delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})
1198 with self.assertRaises(AttributeError):
1199 delegate.getComponent(composite=tab1, componentName="nothing")
1201 def testStorageClass(self):
1202 tab1 = _makeSimpleNumpyTable()
1204 factory = StorageClassFactory()
1205 factory.addFromConfig(StorageClassConfig())
1207 storageClass = factory.findStorageClass(type(tab1), compare_types=False)
1208 # Force the name lookup to do name matching.
1209 storageClass._pytype = None
1210 self.assertEqual(storageClass.name, "ArrowNumpy")
1212 storageClass = factory.findStorageClass(type(tab1), compare_types=True)
1213 # Force the name lookup to do name matching.
1214 storageClass._pytype = None
1215 self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)
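
    # Note on the selective comparison above: with zero rows pandas has no
    # values from which the original fixed-width string and byte types could
    # be recovered, so "strcol" and "bytecol" are deliberately left out of
    # the empty-table pandas round-trip check.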

    def testEmptyArrowTableMultidim(self):
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")


if __name__ == "__main__":
    unittest.main()