# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    _append_numpy_multidim_metadata,
    _astropy_to_numpy_dict,
    _numpy_dict_to_numpy,
    _numpy_dtype_to_arrow_types,
    _numpy_style_arrays_to_arrow_arrays,
    _numpy_to_numpy_dict,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    compute_row_group_size,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
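        # d3 is stored Fortran-ordered (column-major) to check that
        # non-contiguous arrays survive the conversion to arrow and back.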
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data


def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.
    include_lists : `bool`
        Include list columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
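        # Null out one row so every masked column carries a missing value.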
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    if include_lists:
        nrow = len(df)

        df["l1"] = [[0, 0]] * nrow
        df["l2"] = [[0.0, 0.0]] * nrow
        df["l3"] = [[]] * nrow
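
    # Collect the data columns plus the index column names so callers can
    # check the full set of persisted columns.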
    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df


def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
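        # Every masked column hides the same single row.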
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run=self.run)
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
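        # Duplicate column requests are deduplicated on read.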
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testSingleIndexDataFrameWithLists(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_lists=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})

        # We need to check the list columns specially because they go
        # from lists to arrays.
        for col in ["l1", "l2", "l3"]:
            for i in range(len(df1)):
                self.assertTrue(np.all(df2[col].values[i] == df1[col].values[i]))

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()
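
        # Write the file directly with pandas, bypassing the butler, to
        # emulate a legacy file that lacks the formatter's extra metadata.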
        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # Pandas converts string columns to object dtype, and the column
        # order changes because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck: it doesn't really round-trip. This test
        # simply checks that the data are readable; the conversion is
        # definitely not recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # Pandas converts string columns to object dtype, and the column
        # order changes because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck: it doesn't really round-trip. This test
        # simply checks that the data are readable; the conversion is
        # definitely not recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck: it doesn't really round-trip. This test
        # simply checks that the data are readable; the conversion is
        # definitely not recommended.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run=self.run)
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow conversion works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip comparison of the column units/descriptions/formats.
        has_bigendian : `bool`
            One of the tables contains big-endian columns.
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the types match, normalizing both to
                # big-endian byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            One of the tables contains big-endian columns.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the types match, normalizing both to
                # big-endian byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)
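
        # Build a zero-row arrow table that carries the same schema as the
        # simple test table.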
        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    def testEmptyArrowTableMultidim(self):
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)
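
        # Collect the per-column shape metadata used to reconstruct
        # fixed-size multidimensional columns on read.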
        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)
        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))
1533 def _checkNumpyTableEquality(self, table1, table2):
1534 """Check if two numpy tables have the same columns/values
1536 Parameters
1537 ----------
1538 table1 : `numpy.ndarray`
1539 table2 : `numpy.ndarray`
1540 """
1541 self.assertEqual(table1.dtype.names, table2.dtype.names)
1542 for name in table1.dtype.names:
1543 self.assertEqual(table1.dtype[name], table2.dtype[name])
1544 self.assertTrue(np.all(table1 == table2))
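

# Illustrative sketch, not part of the original suite: the conversion helpers
# exercised above are symmetric, so a butler-free round trip through arrow can
# be checked directly using this file's imports and test-table helper.
def _exampleArrowRoundTrip():
    """Round-trip a numpy table through arrow and back (illustrative only)."""
    tab = _makeSimpleNumpyTable(include_multidim=True)
    arrow_tab = numpy_to_arrow(tab)
    round_tripped = arrow_to_numpy(arrow_tab)
    # The reconstructed structured array should match the original exactly.
    assert round_tripped.dtype.names == tab.dtype.names
    assert np.all(round_tripped == tab)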


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")
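

# Illustrative sketch, not part of the original suite: testBadInput above only
# exercises the failure mode of getComponent. Assuming the "rowcount" component
# supported by the ArrowTable storage class (the component name here is an
# assumption based on the butler component reads elsewhere in this file), a
# successful lookup would go through the same call.
def _exampleGetRowCountComponent():
    """Fetch the rowcount component of an arrow table via the delegate."""
    tab = _makeSimpleArrowTable()
    delegate = ArrowTableDelegate("ArrowTable")
    return delegate.getComponent(composite=tab, componentName="rowcount")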


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.")
class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpyDict, using local file store."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})
        # Read the whole table.
        dict2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyDictEquality(dict1, dict2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(dict1))
        for name in dict1.keys():
            self.assertIn(name, columns2)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(dict1["a"]))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        subdict = {key: dict1[key] for key in ["a", "c"]}
        self._checkNumpyDictEquality(subdict, tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        subdict = {key: dict1[key] for key in ["index", "a"]}
        self._checkNumpyDictEquality(subdict, tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        subdict = {key: dict1[key] for key in ["ddd"]}
        self._checkNumpyDictEquality(subdict, tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab7)
        # Passing an unrecognized column should raise a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyDictReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_dict = arrow_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyDictReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # The order of the dict may get mixed up, so we need to check column
        # by column. We also need to do this in dataframe form because pandas
        # changes the datatype of the string column.
        tab1_df = pd.DataFrame(tab1)

        self.assertEqual(set(tab1_df.columns), set(tab2.columns))
        for col in tab1_df.columns:
            self.assertTrue(np.all(tab1_df[col].values == tab2[col].values))
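
    # Illustrative sketch, not part of the original suite: the comment above
    # notes that pandas changes the datatype of the string column. Assuming
    # standard pandas behaviour, a fixed-width numpy unicode column comes back
    # as object dtype once it is in dataframe form.
    @unittest.skipUnless(pd is not None, "Cannot illustrate pandas dtypes without pandas.")
    def testStringColumnDtypeSketch(self):
        tab1 = _makeSimpleNumpyTable()
        # The numpy table stores fixed-width unicode ("U" kind) strings.
        self.assertEqual(tab1["strcol"].dtype.kind, "U")
        # In dataframe form the strings are held as python objects instead.
        tab1_df = pd.DataFrame(tab1)
        self.assertEqual(tab1_df["strcol"].dtype, np.dtype("O"))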

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyDictReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_dict = _astropy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        tab2_dict = _numpy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictBad(self):
        dict1 = {"a": 4, "b": np.ndarray([1])}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict1, self.datasetType, dataId={})

        dict2 = {"a": np.zeros(4), "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict2, self.datasetType, dataId={})

        dict3 = {"a": [0] * 5, "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict3, self.datasetType, dataId={})

    def _checkNumpyDictEquality(self, dict1, dict2):
        """Check if two numpy dicts have the same columns/values.

        Parameters
        ----------
        dict1 : `dict` [`str`, `np.ndarray`]
        dict2 : `dict` [`str`, `np.ndarray`]
        """
        self.assertEqual(set(dict1.keys()), set(dict2.keys()))
        for name in dict1.keys():
            self.assertEqual(dict1[name].dtype, dict2[name].dtype)
            self.assertTrue(np.all(dict1[name] == dict2[name]))
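

# Illustrative sketch, not part of the original suite: the tests above rely on
# _numpy_to_numpy_dict and _numpy_dict_to_numpy being inverses of each other,
# which can be checked directly without a butler.
def _exampleNumpyDictRoundTrip():
    """Round-trip a structured array through the dict-of-arrays form."""
    tab = _makeSimpleNumpyTable()
    as_dict = _numpy_to_numpy_dict(tab)
    round_tripped = _numpy_dict_to_numpy(as_dict)
    # Both the reconstructed dtype and the values should survive the trip.
    assert round_tripped.dtype.names == tab.dtype.names
    assert np.all(round_tripped == tab)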


@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.")
class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteNumpyDictBad(self):
        # The sub-type checking is not done on the in-memory datastore, so
        # this inherited test is overridden with a no-op.
        pass


@unittest.skipUnless(np is not None, "Cannot test compute_row_group_size without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test compute_row_group_size without pyarrow.")
class ComputeRowGroupSizeTestCase(unittest.TestCase):
    """Tests for compute_row_group_size."""

    def testRowGroupSizeNoMetadata(self):
        numpyTable = _makeSimpleNumpyTable(include_multidim=True)

        # We can't use the numpy_to_arrow convenience function because
        # that adds metadata.
        type_list = _numpy_dtype_to_arrow_types(numpyTable.dtype)
        schema = pa.schema(type_list)
        arrays = _numpy_style_arrays_to_arrow_arrays(
            numpyTable.dtype,
            len(numpyTable),
            numpyTable,
            schema,
        )
        arrowTable = pa.Table.from_arrays(arrays, schema=schema)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)
        self.assertLess(row_group_size, 2_000_000)

    def testRowGroupSizeWithMetadata(self):
        numpyTable = _makeSimpleNumpyTable(include_multidim=True)

        arrowTable = numpy_to_arrow(numpyTable)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)
        self.assertLess(row_group_size, 2_000_000)

    def testRowGroupSizeTinyTable(self):
        numpyTable = np.zeros(1, dtype=[("a", np.bool_)])

        arrowTable = numpy_to_arrow(numpyTable)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)

    @unittest.skipUnless(pd is not None, "Cannot run testRowGroupSizeDataFrameWithLists without pandas.")
    def testRowGroupSizeDataFrameWithLists(self):
        df = pd.DataFrame({"a": np.zeros(10), "b": [[0, 0]] * 10, "c": [[0.0, 0.0]] * 10, "d": [[]] * 10})
        arrowTable = pandas_to_arrow(df)
        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)
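

# Illustrative sketch, not part of the original suite: in typical usage the
# value returned by compute_row_group_size would be handed to pyarrow's parquet
# writer, which accepts a row_group_size argument. The path parameter here is
# a placeholder supplied by the caller.
def _exampleWriteWithComputedRowGroupSize(path):
    """Write an arrow table to parquet using a computed row group size."""
    import pyarrow.parquet as pq

    arrowTable = numpy_to_arrow(_makeSimpleNumpyTable())
    row_group_size = compute_row_group_size(arrowTable.schema)
    pq.write_table(arrowTable, path, row_group_size=row_group_size)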


if __name__ == "__main__":
    unittest.main()