# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None
from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    _append_numpy_multidim_metadata,
    _astropy_to_numpy_dict,
    _numpy_dict_to_numpy,
    _numpy_dtype_to_arrow_types,
    _numpy_style_arrays_to_arrow_arrays,
    _numpy_to_numpy_dict,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    compute_row_group_size,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data
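

# A minimal usage sketch (illustrative only; the random values differ per
# call, but the dtype and row count follow from the code above):
#
#     data = _makeSimpleNumpyTable(include_multidim=True)
#     data.dtype.names
#     # -> ('index', 'a', 'b', 'c', 'ddd', 'f', 'strcol', 'bytecol',
#     #     'd1', 'd2', 'd3')
#     len(data)
#     # -> 5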


def _makeSingleIndexDataFrame(include_masked=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df
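

# Column selection on this frame uses the MultiIndex level names; the tests
# below exercise the following parameter forms (shown here as a sketch):
#
#     {"columns": {"filter": "g"}}                  -> all ("g", *) columns
#     {"columns": {"filter": ["r"], "column": "a"}} -> the ("r", "a") column
#     {"columns": [("g", "a"), ("r", "c")]}         -> explicit tuples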


def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)
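

# The arrow conversion helpers imported above are designed to round-trip;
# a minimal sketch using the table builders defined here (the test cases
# below verify this in detail):
#
#     tab = _makeSimpleAstropyTable(include_masked=True)
#     arrow_tab = astropy_to_arrow(tab)
#     tab2 = arrow_to_astropy(arrow_tab)  # units, masks, metadata survive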


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None.
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None.
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, run="testLegacyDataFrame")
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but it is definitely
        # not recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but it is definitely
        # not recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but it is definitely
        # not recommended.
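

# The read-as-another-type tests above all rely on the storageClass override
# accepted by Butler.get; a minimal sketch of the pattern (names as in setUp):
#
#     self.butler.put(df1, self.datasetType, dataId={})
#     tab = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")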


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities.
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, run="testAstropyParquet")
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow conversion works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip checking the column units/descriptions/formats.
        has_bigendian : `bool`
            One of the tables contains big-endian columns.
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the type matches, forcing both dtypes to a
                # common byte order for the comparison.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))
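

# The "columns", "rowcount", and "schema" reads used throughout these tests
# address dataset components via componentTypeName; an illustrative sketch
# of the pattern (grounded in the calls above):
#
#     rowcount = self.butler.get(
#         self.datasetType.componentTypeName("rowcount"), dataId={}
#     )
#     schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})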


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality.
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            One of the tables contains big-endian columns.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the type matches, forcing both dtypes to a
                # common byte order for the comparison.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")
1253@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
1254class ParquetFormatterArrowTableTestCase(unittest.TestCase):
1255 """Tests for ParquetFormatter, ArrowTable, using local file datastore."""
1257 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1259 def setUp(self):
1260 """Create a new butler root for each test."""
1261 self.root = makeTestTempDir(TESTDIR)
1262 config = Config(self.configFile)
1263 self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
1264 # No dimensions in dataset type so we don't have to worry about
1265 # inserting dimension data or defining data IDs.
1266 self.datasetType = DatasetType(
1267 "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
1268 )
1269 self.butler.registry.registerDatasetType(self.datasetType)
1271 def tearDown(self):
1272 removeTestTempDir(self.root)
1274 def testArrowTable(self):
1275 tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)
1277 self.butler.put(tab1, self.datasetType, dataId={})
1278 # Read the whole Table.
1279 tab2 = self.butler.get(self.datasetType, dataId={})
1280 self.assertEqual(tab2, tab1)
1281 # Read the columns.
1282 columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
1283 self.assertEqual(len(columns2), len(tab1.schema.names))
1284 for i, name in enumerate(tab1.schema.names):
1285 self.assertEqual(columns2[i], name)
1286 # Read the rowcount.
1287 rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
1288 self.assertEqual(rowcount, len(tab1))
1289 # Read the schema.
1290 schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
1291 self.assertEqual(schema, tab1.schema)
1292 # Read just some columns a few different ways.
1293 tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
1294 self.assertEqual(tab3, tab1.select(("a", "c")))
1295 tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
1296 self.assertEqual(tab4, tab1.select(("a",)))
1297 tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
1298 self.assertEqual(tab5, tab1.select(("index", "a")))
1299 tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
1300 self.assertEqual(tab6, tab1.select(("ddd",)))
1301 tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
1302 self.assertEqual(tab7, tab1.select(("a",)))
1303 # Passing an unrecognized column should be a ValueError.
1304 with self.assertRaises(ValueError):
1305 self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})
1307 def testEmptyArrowTable(self):
1308 data = _makeSimpleNumpyTable()
1309 type_list = _numpy_dtype_to_arrow_types(data.dtype)
1311 schema = pa.schema(type_list)
1312 arrays = [[]] * len(schema.names)
1314 tab1 = pa.Table.from_arrays(arrays, schema=schema)
1316 self.butler.put(tab1, self.datasetType, dataId={})
1317 tab2 = self.butler.get(self.datasetType, dataId={})
1318 self.assertEqual(tab2, tab1)
1320 tab1_numpy = arrow_to_numpy(tab1)
1321 self.assertEqual(len(tab1_numpy), 0)
1322 tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
1323 self.assertEqual(tab1_numpy_arrow, tab1)
1325 tab1_pandas = arrow_to_pandas(tab1)
1326 self.assertEqual(len(tab1_pandas), 0)
1327 tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
1328 # Unfortunately, string/byte columns get mangled when translated
1329 # through empty pandas dataframes.
1330 self.assertEqual(
1331 tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
1332 tab1.select(("index", "a", "b", "c", "ddd")),
1333 )
1335 tab1_astropy = arrow_to_astropy(tab1)
1336 self.assertEqual(len(tab1_astropy), 0)
1337 tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
1338 self.assertEqual(tab1_astropy_arrow, tab1)
1340 def testEmptyArrowTableMultidim(self):
1341 data = _makeSimpleNumpyTable(include_multidim=True)
1342 type_list = _numpy_dtype_to_arrow_types(data.dtype)
1344 md = {}
1345 for name in data.dtype.names:
1346 _append_numpy_multidim_metadata(md, name, data.dtype[name])
1348 schema = pa.schema(type_list, metadata=md)
1349 arrays = [[]] * len(schema.names)
1351 tab1 = pa.Table.from_arrays(arrays, schema=schema)
1353 self.butler.put(tab1, self.datasetType, dataId={})
1354 tab2 = self.butler.get(self.datasetType, dataId={})
1355 self.assertEqual(tab2, tab1)
1357 tab1_numpy = arrow_to_numpy(tab1)
1358 self.assertEqual(len(tab1_numpy), 0)
1359 tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
1360 self.assertEqual(tab1_numpy_arrow, tab1)
1362 tab1_astropy = arrow_to_astropy(tab1)
1363 self.assertEqual(len(tab1_astropy), 0)
1364 tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
1365 self.assertEqual(tab1_astropy_arrow, tab1)
1367 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
1368 def testWriteArrowTableReadAsSingleIndexDataFrame(self):
1369 df1, allColumns = _makeSingleIndexDataFrame()
1371 self.butler.put(df1, self.datasetType, dataId={})
1373 # Read back out as a dataframe.
1374 df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
1375 self.assertTrue(df1.equals(df2))
1377 # Read back out as an arrow table, convert to dataframe.
1378 tab3 = self.butler.get(self.datasetType, dataId={})
1379 df3 = arrow_to_pandas(tab3)
1380 self.assertTrue(df1.equals(df3))
1382 # Check reading the columns.
1383 columns = df2.reset_index().columns
1384 columns2 = self.butler.get(
1385 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
1386 )
1387 # We check the set because pandas reorders the columns.
1388 self.assertEqual(set(columns2.to_list()), set(columns.to_list()))
1390 # Check reading the schema.
1391 schema = DataFrameSchema(df1)
1392 schema2 = self.butler.get(
1393 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
1394 )
1395 self.assertEqual(schema2, schema)
1397 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
1398 def testWriteArrowTableReadAsMultiIndexDataFrame(self):
1399 df1 = _makeMultiIndexDataFrame()
1401 self.butler.put(df1, self.datasetType, dataId={})
1403 # Read back out as a dataframe.
1404 df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
1405 self.assertTrue(df1.equals(df2))
1407 # Read back out as an arrow table, convert to dataframe.
1408 atab3 = self.butler.get(self.datasetType, dataId={})
1409 df3 = arrow_to_pandas(atab3)
1410 self.assertTrue(df1.equals(df3))
1412 # Check reading the columns.
1413 columns = df2.columns
1414 columns2 = self.butler.get(
1415 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
1416 )
1417 self.assertTrue(columns2.equals(columns))
1419 # Check reading the schema.
1420 schema = DataFrameSchema(df1)
1421 schema2 = self.butler.get(
1422 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
1423 )
1424 self.assertEqual(schema2, schema)
1426 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
1427 def testWriteArrowTableReadAsAstropyTable(self):
1428 tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)
1430 self.butler.put(tab1, self.datasetType, dataId={})
1432 # Read back out as an astropy table.
1433 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
1434 self._checkAstropyTableEquality(tab1, tab2)
1436 # Read back out as an arrow table, convert to astropy table.
1437 atab3 = self.butler.get(self.datasetType, dataId={})
1438 tab3 = arrow_to_astropy(atab3)
1439 self._checkAstropyTableEquality(tab1, tab3)
1441 # Check reading the columns.
1442 columns = list(tab2.columns.keys())
1443 columns2 = self.butler.get(
1444 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
1445 )
1446 self.assertEqual(columns2, columns)
1448 # Check reading the schema.
1449 schema = ArrowAstropySchema(tab1)
1450 schema2 = self.butler.get(
1451 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
1452 )
1453 self.assertEqual(schema2, schema)
1455 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
1456 def testWriteArrowTableReadAsNumpyTable(self):
1457 tab1 = _makeSimpleNumpyTable(include_multidim=True)
1459 self.butler.put(tab1, self.datasetType, dataId={})
1461 # Read back out as a numpy table.
1462 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
1463 self._checkNumpyTableEquality(tab1, tab2)
1465 # Read back out as an arrow table, convert to numpy table.
1466 atab3 = self.butler.get(self.datasetType, dataId={})
1467 tab3 = arrow_to_numpy(atab3)
1468 self._checkNumpyTableEquality(tab1, tab3)
1470 # Check reading the columns.
1471 columns = list(tab2.dtype.names)
1472 columns2 = self.butler.get(
1473 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
1474 )
1475 self.assertEqual(columns2, columns)
1477 # Check reading the schema.
1478 schema = ArrowNumpySchema(tab1.dtype)
1479 schema2 = self.butler.get(
1480 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
1481 )
1482 self.assertEqual(schema2, schema)
1484 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
1485 def testWriteArrowTableReadAsNumpyDict(self):
1486 tab1 = _makeSimpleNumpyTable(include_multidim=True)
1488 self.butler.put(tab1, self.datasetType, dataId={})
1490 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
1491 tab2_numpy = _numpy_dict_to_numpy(tab2)
1492 self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check that two astropy tables have the same columns and values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

    def _checkNumpyTableEquality(self, table1, table2):
        """Check that two numpy tables have the same columns and values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")
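

def _example_storage_class_lookup():
    # Illustrative sketch only, not part of the test suite: the storage class
    # registered for pyarrow tables can be looked up directly from the python
    # type, using the same factory calls exercised by testStorageClass above.
    factory = StorageClassFactory()
    factory.addFromConfig(StorageClassConfig())
    storage_class = factory.findStorageClass(pa.Table, compare_types=False)
    return storage_class.name  # Expected to be "ArrowTable".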


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.")
class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpyDict, using local file store."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})
        # Read the whole table.
        dict2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyDictEquality(dict1, dict2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(dict1))
        for name in dict1.keys():
            self.assertIn(name, columns2)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(dict1["a"]))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        subdict = {key: dict1[key] for key in ["a", "c"]}
        self._checkNumpyDictEquality(subdict, tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        subdict = {key: dict1[key] for key in ["index", "a"]}
        self._checkNumpyDictEquality(subdict, tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        subdict = {key: dict1[key] for key in ["ddd"]}
        self._checkNumpyDictEquality(subdict, tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})
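
    # Note: as exercised above, the "columns" read parameter accepts either a
    # single column name or a list of names; repeated names are returned only
    # once, and unknown names raise ValueError.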

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyDictReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_dict = arrow_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyDictReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # The order of the dict may get mixed up, so we need to check column
        # by column. We also need to do this in dataframe form because pandas
        # changes the datatype of the string column.
        tab1_df = pd.DataFrame(tab1)

        self.assertEqual(set(tab1_df.columns), set(tab2.columns))
        for col in tab1_df.columns:
            self.assertTrue(np.all(tab1_df[col].values == tab2[col].values))
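
    # Note (illustrative, not exercised by these tests): pandas stores numpy
    # fixed-width unicode columns as object arrays, so for example
    #
    #     pd.DataFrame(np.zeros(2, dtype=[("strcol", "U10")]))["strcol"].dtype
    #
    # is dtype("O") rather than "<U10"; hence the column-by-column comparison
    # in dataframe form above.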

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyDictReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_dict = _astropy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        tab2_dict = _numpy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictBad(self):
        # A scalar value instead of a column array should fail.
        dict1 = {"a": 4, "b": np.ndarray([1])}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict1, self.datasetType, dataId={})

        # Columns of mismatched length should fail.
        dict2 = {"a": np.zeros(4), "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict2, self.datasetType, dataId={})

        # A plain list instead of a numpy array should fail.
        dict3 = {"a": [0] * 5, "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict3, self.datasetType, dataId={})

    def _checkNumpyDictEquality(self, dict1, dict2):
        """Check that two numpy dicts have the same columns and values.

        Parameters
        ----------
        dict1 : `dict` [`str`, `np.ndarray`]
            First dict to compare.
        dict2 : `dict` [`str`, `np.ndarray`]
            Second dict to compare.
        """
        self.assertEqual(set(dict1.keys()), set(dict2.keys()))
        for name in dict1.keys():
            self.assertEqual(dict1[name].dtype, dict2[name].dtype)
            self.assertTrue(np.all(dict1[name] == dict2[name]))
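

def _example_numpy_dict_round_trip():
    # Illustrative sketch only, not part of the test suite: the in-memory
    # conversion path the tests above exercise, without going through a
    # butler. All helpers used here are imported at the top of this module.
    table = _makeSimpleNumpyTable()
    as_dict = _numpy_to_numpy_dict(table)
    arrow = numpy_dict_to_arrow(as_dict)
    round_tripped = arrow_to_numpy_dict(arrow)
    assert set(round_tripped.keys()) == set(as_dict.keys())
    return round_tripped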


@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.")
class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteNumpyDictBad(self):
        # Sub-type checking is not done by the in-memory datastore, so this
        # test does not apply.
        pass


@unittest.skipUnless(np is not None, "Cannot test compute_row_group_size without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test compute_row_group_size without pyarrow.")
class ComputeRowGroupSizeTestCase(unittest.TestCase):
    """Tests for compute_row_group_size."""

    def testRowGroupSizeNoMetadata(self):
        numpyTable = _makeSimpleNumpyTable(include_multidim=True)

        # We can't use the numpy_to_arrow convenience function because
        # that adds metadata.
        type_list = _numpy_dtype_to_arrow_types(numpyTable.dtype)
        schema = pa.schema(type_list)
        arrays = _numpy_style_arrays_to_arrow_arrays(
            numpyTable.dtype,
            len(numpyTable),
            numpyTable,
            schema,
        )
        arrowTable = pa.Table.from_arrays(arrays, schema=schema)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)
        self.assertLess(row_group_size, 2_000_000)

    def testRowGroupSizeWithMetadata(self):
        numpyTable = _makeSimpleNumpyTable(include_multidim=True)

        arrowTable = numpy_to_arrow(numpyTable)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)
        self.assertLess(row_group_size, 2_000_000)

    def testRowGroupSizeTinyTable(self):
        numpyTable = np.zeros(1, dtype=[("a", np.bool_)])

        arrowTable = numpy_to_arrow(numpyTable)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)
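

def _example_row_group_size_usage():
    # Illustrative sketch only, not part of the test suite: one plausible way
    # to combine compute_row_group_size with pyarrow's parquet writer.
    # ``pyarrow.parquet.write_table`` and its ``row_group_size`` argument are
    # standard pyarrow API; the output path is hypothetical.
    import pyarrow.parquet as pq

    arrow_table = numpy_to_arrow(_makeSimpleNumpyTable())
    size = compute_row_group_size(arrow_table.schema)
    pq.write_table(arrow_table, "example.parquet", row_group_size=size)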


if __name__ == "__main__":
    unittest.main()