Coverage for tests/test_parquet.py: 17%
727 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-01-25 02:36 -0800
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Tests for ParquetFormatter.
24Tests in this module are disabled unless pandas and pyarrow are importable.
25"""
27import os
28import unittest
30try:
31 import pyarrow as pa
32except ImportError:
33 pa = None
34try:
35 import astropy.table as atable
36 from astropy import units
37except ImportError:
38 atable = None
39try:
40 import numpy as np
41except ImportError:
42 np = None
43try:
44 import pandas as pd
45except ImportError:
46 np = None
48from lsst.daf.butler import (
49 Butler,
50 Config,
51 DatasetRef,
52 DatasetType,
53 FileDataset,
54 StorageClassConfig,
55 StorageClassFactory,
56)
57from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
58from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
59from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
60from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
61from lsst.daf.butler.formatters.parquet import (
62 ArrowAstropySchema,
63 ArrowNumpySchema,
64 DataFrameSchema,
65 ParquetFormatter,
66 _append_numpy_multidim_metadata,
67 _numpy_dtype_to_arrow_types,
68 arrow_to_astropy,
69 arrow_to_numpy,
70 arrow_to_numpy_dict,
71 arrow_to_pandas,
72 astropy_to_arrow,
73 numpy_dict_to_arrow,
74 numpy_to_arrow,
75 pandas_to_arrow,
76)
77from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir
79TESTDIR = os.path.abspath(os.path.dirname(__file__))
82def _makeSimpleNumpyTable(include_multidim=False):
83 """Make a simple numpy table with random data.
85 Parameters
86 ----------
87 include_multidim : `bool`
88 Include multi-dimensional columns.
90 Returns
91 -------
92 numpyTable : `numpy.ndarray`
93 """
94 nrow = 5
96 dtype = [
97 ("index", "i4"),
98 ("a", "f8"),
99 ("b", "f8"),
100 ("c", "f8"),
101 ("ddd", "f8"),
102 ("strcol", "U10"),
103 ("bytecol", "a10"),
104 ]
106 if include_multidim:
107 dtype.extend(
108 [
109 ("d1", "f4", (5,)),
110 ("d2", "i8", (5, 10)),
111 ("d3", "f8", (5, 10)),
112 ]
113 )
115 data = np.zeros(nrow, dtype=dtype)
116 data["index"][:] = np.arange(nrow)
117 data["a"] = np.random.randn(nrow)
118 data["b"] = np.random.randn(nrow)
119 data["c"] = np.random.randn(nrow)
120 data["ddd"] = np.random.randn(nrow)
121 data["strcol"][:] = "teststring"
122 data["bytecol"][:] = "teststring"
124 if include_multidim:
125 data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
126 data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
127 data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))
129 return data
def _makeSingleIndexDataFrame():
    """Make a single index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    frame = pd.DataFrame(_makeSimpleNumpyTable()).set_index("index")
    # Record every column, including the one consumed by the index.
    all_columns = frame.columns.append(pd.Index(frame.index.names))
    return frame, all_columns
150def _makeMultiIndexDataFrame():
151 """Make a multi-index data frame for testing.
153 Returns
154 -------
155 dataFrame : `~pandas.DataFrame`
156 The test dataframe.
157 """
158 columns = pd.MultiIndex.from_tuples(
159 [
160 ("g", "a"),
161 ("g", "b"),
162 ("g", "c"),
163 ("r", "a"),
164 ("r", "b"),
165 ("r", "c"),
166 ],
167 names=["filter", "column"],
168 )
169 df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)
171 return df
def _makeSimpleAstropyTable(include_multidim=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    table = atable.Table(_makeSimpleNumpyTable(include_multidim=include_multidim))
    # Attach units to a couple of columns so unit round-tripping is exercised.
    table["a"].unit = units.degree
    table["b"].unit = units.meter
    return table
def _makeSimpleArrowTable(include_multidim=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    # Build the numpy test table and convert it to arrow in one step.
    return numpy_to_arrow(_makeSimpleNumpyTable(include_multidim=include_multidim))
@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    # Butler configuration used to create the per-test repository.
    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        """Remove the temporary butler root created in setUp."""
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        """Round-trip a single-index DataFrame and all of its components."""
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        # Asking for an index column only returns the data columns.
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        # Duplicate column requests are de-duplicated.
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testMultiIndexDataFrame(self):
        """Round-trip a multi-index DataFrame and all of its components."""
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        # Ingest the pandas-written file so it is read via ParquetFormatter.
        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        # The legacy file and the butler-written dataset must read the same.
        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        """Test DataFrameSchema equality, inequality, and repr."""
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        """Write a DataFrame and read it back as an astropy table."""
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        """Write a multi-index DataFrame and read it back as astropy."""
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck, it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        """Write a DataFrame and read it back as an arrow table."""
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        """Write a multi-index DataFrame and read it back as arrow."""
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        """Write a DataFrame and read it back as a numpy table."""
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        """Write a multi-index DataFrame and read it back as numpy."""
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck, it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.
@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testMultiIndexDataFrame(self):
        """Exercise the delegate directly with a multi-index DataFrame."""
        frame = _makeMultiIndexDataFrame()
        handler = DataFrameDelegate("DataFrame")

        # With no parameters the full DataFrame comes back unchanged.
        returned = handler.handleParameters(inMemoryDataset=frame)
        self.assertTrue(frame.equals(returned))
        # The "columns" component reports the multi-index columns.
        component = handler.getComponent(composite=frame, componentName="columns")
        self.assertTrue(frame.columns.equals(component))

        # Dict-style (multi-level) column selections are not supported by
        # the in-memory delegate.
        bad_params = (
            {"columns": {"filter": "g"}},
            {"columns": {"filter": ["r"], "column": "a"}},
        )
        for params in bad_params:
            with self.assertRaisesRegex(NotImplementedError, "only supports string column names"):
                handler.handleParameters(inMemoryDataset=frame, parameters=params)

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        """A multi-index DataFrame cannot be converted to astropy in memory."""
        frame = _makeMultiIndexDataFrame()
        self.butler.put(frame, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        """The delegate rejects non-DataFrame datasets and bad components."""
        frame, _ = _makeSingleIndexDataFrame()
        handler = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            handler.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            handler.getComponent(composite=frame, componentName="nothing")

    def testStorageClass(self):
        """A DataFrame maps to the DataFrame storage class by type or name."""
        frame, _ = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        for compare_types in (False, True):
            storage_class = factory.findStorageClass(type(frame), compare_types=compare_types)
            # Force the name lookup to do name matching.
            storage_class._pytype = None
            self.assertEqual(storage_class.name, "DataFrame")
@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    # Butler configuration used to create the per-test repository.
    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        """Remove the temporary butler root created in setUp."""
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        """Round-trip an astropy table and all of its components."""
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        # Duplicate column requests are de-duplicated.
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableWithMetadata(self):
        """Round-trip an astropy table carrying table-level metadata."""
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        # Mix of scalar, list, bool, and string metadata values.
        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        """Test ArrowAstropySchema equality, inequality, and repr."""
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities: a renamed column, a changed unit,
        # a changed description, and a changed format each break equality.
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        """Test writing an astropy table to parquet directly (without
        additional butler metadata) and ensure that we can read it back
        with all the new functionality.
        """
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        # Ingest the astropy-written file so it is read via ParquetFormatter.
        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        # The ingested file and the butler-written dataset must read the same.
        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        """Write an astropy table and read it back as an arrow table."""
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        """Write an astropy table and read it back as a DataFrame."""
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.
        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        """Write an astropy table and read it back as a numpy table."""
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip comparing per-column unit/description/format metadata
            (used when the round-trip is known to drop units).
        """
        self.assertEqual(table1.dtype, table2.dtype)
        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))
@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        """The delegate rejects bad datasets, parameters, and components."""
        table = _makeSimpleAstropyTable()
        handler = ArrowAstropyDelegate("ArrowAstropy")

        # A non-table dataset is rejected outright.
        with self.assertRaises(ValueError):
            handler.handleParameters(inMemoryDataset="not_an_astropy_table")

        # Tuple column selections are unsupported for astropy tables.
        with self.assertRaises(NotImplementedError):
            handler.handleParameters(inMemoryDataset=table, parameters={"columns": [("a", "b")]})

        # Unknown component names raise AttributeError.
        with self.assertRaises(AttributeError):
            handler.getComponent(composite=table, componentName="nothing")
@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    # Butler configuration used for each test repo; subclasses override
    # this to switch to other datastores.
    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        """Remove the temporary butler root created in setUp."""
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        """Round-trip a numpy table through the butler: full table, the
        columns/rowcount/schema components, and column-subset parameters.
        """
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        # A bare string selects a single column.
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(
            tab1[
                [
                    "a",
                ]
            ],
            tab4,
        )
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(
            tab1[
                [
                    "ddd",
                ]
            ],
            tab6,
        )
        # Requesting the same column twice yields it only once.
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(
            tab1[
                [
                    "a",
                ]
            ],
            tab7,
        )
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testArrowNumpySchema(self):
        """Check ArrowNumpySchema construction from an arrow schema and
        its equality/repr behavior.
        """
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality: renaming one field must make schemas unequal.
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        """Check that numpy -> arrow -> numpy-dict -> arrow round-trips."""
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        """Write a numpy table, read it back as an ArrowTable via
        storage-class conversion, including columns/schema components.
        """
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        """Write a numpy table, read it back as a pandas DataFrame via
        storage-class conversion, including columns/schema components.
        """
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        """Write a numpy table, read it back as an astropy Table via
        storage-class conversion, including columns/schema components.
        """
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values

        Parameters
        ----------
        table1 : `numpy.ndarray`
        table2 : `numpy.ndarray`
        """
        # Field names, per-field dtypes, and all values must match.
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))
@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        """Check that the delegate rejects invalid datasets, parameters,
        and component names.
        """
        delegate = ArrowNumpyDelegate("ArrowNumpy")
        table = _makeSimpleNumpyTable()

        # A dataset that is not a numpy table is rejected outright.
        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        # Tuple-valued column selections are not supported.
        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=table, parameters={"columns": [("a", "b")]})

        # Unknown component names raise AttributeError.
        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=table, componentName="nothing")

    def testStorageClass(self):
        """Check that the factory maps a numpy table type to ArrowNumpy
        under both lookup modes.
        """
        table = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        for compare in (False, True):
            storageClass = factory.findStorageClass(type(table), compare_types=compare)
            # Force the name lookup to do name matching.
            storageClass._pytype = None
            self.assertEqual(storageClass.name, "ArrowNumpy")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    # Butler configuration used for each test repo; subclasses override
    # this to switch to other datastores.
    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        """Remove the temporary butler root created in setUp."""
        removeTestTempDir(self.root)

    def testArrowTable(self):
        """Round-trip an arrow table through the butler: full table, the
        columns/rowcount/schema components, and column-subset parameters.
        """
        tab1 = _makeSimpleArrowTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        # A bare string selects a single column.
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        # Requesting the same column twice yields it only once.
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        """Round-trip a zero-row arrow table through the butler and
        through the numpy/pandas/astropy converters.
        """
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        # One empty array per schema field gives a zero-row table.
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    def testEmptyArrowTableMultidim(self):
        """Round-trip a zero-row arrow table that carries multidimensional
        column metadata.
        """
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        # Preserve the multidim shape information in the schema metadata.
        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        """Write a single-index DataFrame into an ArrowTable dataset and
        read it back as DataFrame and as arrow.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        """Write a multi-index DataFrame into an ArrowTable dataset and
        read it back as DataFrame and as arrow.
        """
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        """Write an astropy table into an ArrowTable dataset and read it
        back as astropy and as arrow.
        """
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        """Write a numpy table into an ArrowTable dataset and read it
        back as numpy and as arrow.
        """
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check if two astropy tables have the same columns/values

        Parameters
        ----------
        table1 : `astropy.table.Table`
        table2 : `astropy.table.Table`
        """
        # Dtype, per-column metadata (unit/description/format), and all
        # values must match.
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values

        Parameters
        ----------
        table1 : `numpy.ndarray`
        table2 : `numpy.ndarray`
        """
        # Field names, per-field dtypes, and all values must match.
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))
@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        """Check that the delegate rejects invalid datasets, parameters,
        and component names.
        """
        delegate = ArrowTableDelegate("ArrowTable")
        table = _makeSimpleArrowTable()

        # A dataset that is not an arrow table is rejected outright.
        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        # Tuple-valued column selections are not supported.
        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=table, parameters={"columns": [("a", "b")]})

        # Unknown component names raise AttributeError.
        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=table, componentName="nothing")

    def testStorageClass(self):
        """Check that the factory maps an arrow table type to ArrowTable
        under both lookup modes.
        """
        table = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        for compare in (False, True):
            storageClass = factory.findStorageClass(type(table), compare_types=compare)
            # Force the name lookup to do name matching.
            storageClass._pytype = None
            self.assertEqual(storageClass.name, "ArrowTable")
# Run the module's test suite when executed directly.
if __name__ == "__main__":
    unittest.main()