# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""
import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    _append_numpy_multidim_metadata,
    _astropy_to_numpy_dict,
    _numpy_dict_to_numpy,
    _numpy_dtype_to_arrow_types,
    _numpy_to_numpy_dict,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data
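
# Illustrative sketch (not part of the test suite): the helper above returns a
# plain numpy structured array, so columns are addressed through the dtype,
# and the multidim fields carry their per-row shapes, e.g.
#
#     tbl = _makeSimpleNumpyTable(include_multidim=True)
#     assert tbl.dtype.names[:3] == ("index", "a", "b")
#     assert tbl["d2"].shape == (5, 5, 10)  # nrow=5 rows of 5x10 blocks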


def _makeSingleIndexDataFrame(include_masked=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns
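
# Illustrative sketch: the frame uses "index" as its pandas index, so the
# returned allColumns contains it even though df.columns does not, e.g.
#
#     df, allColumns = _makeSingleIndexDataFrame()
#     assert df.index.name == "index"
#     assert "index" in allColumns and "index" not in df.columns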


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df
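
# Illustrative sketch: the columns form a two-level MultiIndex, so single
# columns are addressed by (filter, column) tuples, e.g.
#
#     df = _makeMultiIndexDataFrame()
#     assert df.columns.names == ["filter", "column"]
#     g_a = df[("g", "a")]  # one Series out of the six columns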


def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table
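
# Illustrative sketch: the units attached above survive the astropy round
# trip, which several tests below rely on, e.g.
#
#     tab = _makeSimpleAstropyTable()
#     assert tab["a"].unit == units.degree
#     assert tab["b"].unit == units.meter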


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)
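
# Illustrative sketch: astropy_to_arrow yields a pyarrow.Table, so column
# names come from the arrow schema rather than a dtype, e.g.
#
#     tab = _makeSimpleArrowTable()
#     assert isinstance(tab, pa.Table)
#     assert "strcol" in tab.schema.names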


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None.
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None.
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck: a multi-index dataframe doesn't really
        # round-trip through astropy. This test simply checks that it is
        # readable; relying on the conversion is definitely not recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck: a multi-index dataframe doesn't really
        # round-trip through numpy. This test simply checks that it is
        # readable; relying on the conversion is definitely not recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck: a multi-index dataframe doesn't really
        # round-trip through a numpy dict. This test simply checks that it is
        # readable; relying on the conversion is definitely not recommended.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        delegate = DataFrameDelegate("DataFrame")

        # Read the whole DataFrame.
        df2 = delegate.handleParameters(inMemoryDataset=df1)
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = delegate.getComponent(composite=df1, componentName="columns")
        self.assertTrue(df1.columns.equals(columns2))

        # Read just some columns a few different ways.
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(inMemoryDataset=df1, parameters={"columns": {"filter": "g"}})
        self.assertIn("only supports string column names", str(cm.exception))
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(
                inMemoryDataset=df1, parameters={"columns": {"filter": ["r"], "column": "a"}}
            )
        self.assertIn("only supports string column names", str(cm.exception))

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities.
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow conversion works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.
        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip the per-column unit/description/format comparison.
        has_bigendian : `bool`
            One of the tables contains big-endian columns.
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the types match, forcing both sides to the
                # same (big-endian) byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality.
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.
        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)
        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            One of the tables contains big-endian columns.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the types match, forcing both sides to the
                # same (big-endian) byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    def testEmptyArrowTableMultidim(self):
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)
        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

    def _checkNumpyTableEquality(self, table1, table2):
        """Check that two numpy tables have the same columns and values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))
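

# Hedged module-level sketch, not part of the original suite: each of the
# schema wrappers compared in the "schema" component reads above can also be
# constructed directly from an in-memory table. The helper name is
# hypothetical and exists only for illustration.
def _schemaConstructionSketch():
    tab = _makeSimpleNumpyTable()
    numpy_schema = ArrowNumpySchema(tab.dtype)
    astropy_schema = ArrowAstropySchema(atable.Table(tab))
    dataframe_schema = DataFrameSchema(pd.DataFrame(tab))
    return numpy_schema, astropy_schema, dataframe_schema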


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")
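
    # Hedged sketch, not part of the original suite: the well-formed
    # counterpart to the failing calls above. Assumes _makeSimpleArrowTable
    # produces a column named "a", as the numpy table maker does.
    def testHandleParametersSketch(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")
        # A flat list of existing column names is the supported form.
        delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": ["a"]})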

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.")
class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpyDict, using local file store."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})
        # Read the whole table.
        dict2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyDictEquality(dict1, dict2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(dict1.keys()))
        for name in dict1.keys():
            self.assertIn(name, columns2)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(dict1["a"]))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        subdict = {key: dict1[key] for key in ["a", "c"]}
        self._checkNumpyDictEquality(subdict, tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        subdict = {key: dict1[key] for key in ["index", "a"]}
        self._checkNumpyDictEquality(subdict, tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        subdict = {key: dict1[key] for key in ["ddd"]}
        self._checkNumpyDictEquality(subdict, tab6)
        # A repeated column name should be read back only once.
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})
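
    # Hedged sketch, not part of the original suite: equivalent column
    # subsetting done purely in memory by converting the dict to arrow and
    # using pyarrow's Table.select.
    def testNumpyDictColumnSelectSketch(self):
        tab1 = _makeSimpleNumpyTable()
        dict1 = _numpy_to_numpy_dict(tab1)
        atab = numpy_dict_to_arrow(dict1)
        subset = arrow_to_numpy_dict(atab.select(["a", "c"]))
        self._checkNumpyDictEquality({key: dict1[key] for key in ["a", "c"]}, subset)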

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyDictReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_dict = arrow_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyDictReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # The order of the dict may get mixed up, so we need to check column
        # by column. We also need to do this in dataframe form because pandas
        # changes the datatype of the string column.
        tab1_df = pd.DataFrame(tab1)

        self.assertEqual(set(tab1_df.columns), set(tab2.columns))
        for col in tab1_df.columns:
            self.assertTrue(np.all(tab1_df[col].values == tab2[col].values))
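
    # Hedged sketch of the dtype change noted above, not part of the
    # original suite: a fixed-width numpy unicode column ("U10") becomes a
    # python-object column once it passes through pandas.
    @unittest.skipUnless(pd is not None, "Cannot test dtype sketch without pandas.")
    def testStringColumnDtypeSketch(self):
        tab1 = _makeSimpleNumpyTable()
        tab1_df = pd.DataFrame(tab1)
        self.assertEqual(tab1["strcol"].dtype.kind, "U")
        self.assertEqual(tab1_df["strcol"].dtype, object)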

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyDictReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_dict = _astropy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        tab2_dict = _numpy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)
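
    # Hedged sketch, not part of the original suite: _numpy_to_numpy_dict
    # and _numpy_dict_to_numpy should invert each other without any butler
    # I/O at all.
    def testNumpyDictConversionRoundTripSketch(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)
        dict2 = _numpy_to_numpy_dict(_numpy_dict_to_numpy(dict1))
        self._checkNumpyDictEquality(dict1, dict2)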

    def testWriteNumpyDictBad(self):
        # A scalar value instead of an ndarray is rejected.
        dict1 = {"a": 4, "b": np.ndarray([1])}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict1, self.datasetType, dataId={})

        # Arrays of mismatched length are rejected.
        dict2 = {"a": np.zeros(4), "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict2, self.datasetType, dataId={})

        # A plain python list instead of an ndarray is rejected.
        dict3 = {"a": [0] * 5, "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict3, self.datasetType, dataId={})
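
    # Hedged counterpart sketch, not part of the original suite: equal-length
    # ndarray values are the accepted form and should round-trip cleanly.
    def testWriteNumpyDictGoodSketch(self):
        good = {"a": np.zeros(5), "b": np.ones(5)}
        self.butler.put(good, self.datasetType, dataId={})
        self._checkNumpyDictEquality(good, self.butler.get(self.datasetType, dataId={}))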

    def _checkNumpyDictEquality(self, dict1, dict2):
        """Check that two numpy dicts have the same columns and values.

        Parameters
        ----------
        dict1 : `dict` [`str`, `np.ndarray`]
            First dict to compare.
        dict2 : `dict` [`str`, `np.ndarray`]
            Second dict to compare.
        """
        self.assertEqual(set(dict1.keys()), set(dict2.keys()))
        for name in dict1.keys():
            self.assertEqual(dict1[name].dtype, dict2[name].dtype)
            self.assertTrue(np.all(dict1[name] == dict2[name]))


@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.")
class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteNumpyDictBad(self):
        # The sub-type checking is not done on the in-memory datastore.
        pass


if __name__ == "__main__":
    unittest.main()