# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Tests for ParquetFormatter.
24Tests in this module are disabled unless pandas and pyarrow are importable.
25"""

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    _append_numpy_multidim_metadata,
    _numpy_dtype_to_arrow_types,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    return data
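
# Illustrative note: the multi-dimensional columns above carry their shapes
# inside the structured dtype itself, e.g.
# _makeSimpleNumpyTable(include_multidim=True).dtype["d2"] is the subarray
# dtype ("i8", (5, 10)); preserving those shapes through parquet is what the
# multidim round-trip tests below exercise.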


def _makeSingleIndexDataFrame(include_masked=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns
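
# For orientation: allColumns is the data columns plus the index name, e.g.
# Index(["a", "b", "c", "ddd", "f", "strcol", "bytecol", "index"]) in the
# unmasked case; the tests compare the "columns" component against it.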


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df
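
# With a MultiIndex like the one above, a read parameter of
# parameters={"columns": {"filter": "g"}} selects every ("g", *) column,
# mirroring df.loc[:, ["g"]]; see testMultiIndexDataFrame below.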


def _makeSimpleAstropyTable(include_multidim=False, include_masked=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)
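
# Because this goes through astropy_to_arrow, the units and masks from
# _makeSimpleAstropyTable travel with the arrow table (in the schema metadata
# and as arrow nulls, respectively, as we understand the conversion), which is
# what allows the full-equality comparisons in the ArrowTable tests below.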


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
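        # (Illustration: the masked integer column "m1" from the helper is a
        # nullable pd.Int64Dtype() holding pd.NA, the masked float "m2" just
        # holds NaN, and "mstrcol" becomes an object column, so no single
        # dtype-level comparison works; hence the value-by-value check below.)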
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        delegate = DataFrameDelegate("DataFrame")

        # Read the whole DataFrame.
        df2 = delegate.handleParameters(inMemoryDataset=df1)
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = delegate.getComponent(composite=df1, componentName="columns")
        self.assertTrue(df1.columns.equals(columns2))

        # Read just some columns a few different ways.
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(inMemoryDataset=df1, parameters={"columns": {"filter": "g"}})
        self.assertIn("only supports string column names", str(cm.exception))
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(
                inMemoryDataset=df1, parameters={"columns": {"filter": ["r"], "column": "a"}}
            )
        self.assertIn("only supports string column names", str(cm.exception))

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow conversion works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
        table2 : `astropy.table.Table`
        skip_units : `bool`
        """
        self.assertEqual(table1.dtype, table2.dtype)
        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
        table2 : `numpy.ndarray`
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)
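        # (_numpy_dtype_to_arrow_types is assumed here to map the structured
        # numpy dtype to the per-column (name, type) entries that pa.schema
        # accepts, which lets us build a schema-compatible zero-row table.)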

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )
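        # (Our reading of the mangling: with zero rows pandas has no values
        # from which to re-infer string vs. byte types, so only columns with
        # unambiguous numeric types are compared above.)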

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    def testEmptyArrowTableMultidim(self):
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
        table2 : `astropy.table.Table`
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
        table2 : `numpy.ndarray`
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")


if __name__ == "__main__":
    unittest.main()