# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None
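
# The try/except guards above let this module be imported even when the
# optional dependencies are missing; the test classes below are then
# skipped individually via unittest.skipUnless when a dependency is None.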

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    _append_numpy_multidim_metadata,
    _astropy_to_numpy_dict,
    _numpy_dict_to_numpy,
    _numpy_dtype_to_arrow_types,
    _numpy_style_arrays_to_arrow_arrays,
    _numpy_to_numpy_dict,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    compute_row_group_size,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))
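
# The helpers below build small test tables with the same columns in
# several in-memory forms (numpy, pandas, astropy, arrow) so that the
# round-trip tests can share one set of data.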


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data


def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.
    include_lists : `bool`
        Include list columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    if include_lists:
        nrow = len(df)

        df["l1"] = [[0, 0]] * nrow
        df["l2"] = [[0.0, 0.0]] * nrow
        df["l3"] = [[]] * nrow

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df


def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)
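
# Most of the ParquetFormatter*TestCase classes below are paired with an
# InMemory*DelegateTestCase subclass that reruns the same tests against an
# in-memory datastore, overriding only the tests that need real files on
# disk.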


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)
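
    # Each test below follows the same pattern: persist with butler.put()
    # and read back with butler.get(), either whole, as a component
    # ("columns", "rowcount", "schema"), or with a column-selection
    # parameter, e.g.
    #
    #     self.butler.put(df1, self.datasetType, dataId={})
    #     df2 = self.butler.get(self.datasetType, dataId={})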

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testSingleIndexDataFrameWithLists(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_lists=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})

        # We need to check the list columns specially because they go
        # from lists to arrays.
        for col in ["l1", "l2", "l3"]:
            for i in range(len(df1)):
                self.assertTrue(np.all(df2[col].values[i] == df1[col].values[i]))

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, run="testLegacyDataFrame")
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but using it this
        # way is definitely not recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but using it this
        # way is definitely not recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but using it this
        # way is definitely not recommended.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)
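
    # In the next test, astropy writes the parquet file directly via
    # Table.write(); ingesting that file checks that the formatter can
    # read tables that were not written by the butler itself.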

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, run="testAstropyParquet")
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow conversion works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
        table2 : `astropy.table.Table`
        skip_units : `bool`
        has_bigendian : `bool`
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the types match; force both to big-endian
                # so that the comparison ignores byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)
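
    # The round trip below goes numpy -> arrow -> dict of numpy arrays ->
    # arrow, checking that the arrow schema survives every conversion.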

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
        table2 : `numpy.ndarray`
        has_bigendian : `bool`
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the types match; force both to big-endian
                # so that the comparison ignores byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})
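
    # The two empty-table tests below exercise the zero-row code paths:
    # the conversions must preserve the schema even when there is no data.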

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    def testEmptyArrowTableMultidim(self):
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)
        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check that two astropy tables have the same columns and values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

    def _checkNumpyTableEquality(self, table1, table2):
        """Check that two numpy tables have the same columns and values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))
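

# A minimal sketch (illustrative only; never called by the tests) of the
# conversion chain the round-trip tests above exercise through the butler:
# a structured numpy array goes into Arrow and comes back out as an astropy
# table, using numpy_to_arrow and arrow_to_astropy imported at the top of
# this module and the _makeSimpleNumpyTable fixture defined above.
def _demoNumpyArrowAstropyRoundTrip():
    data = _makeSimpleNumpyTable()
    arrow_table = numpy_to_arrow(data)
    astropy_table = arrow_to_astropy(arrow_table)
    return astropy_table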


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")
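

# A minimal sketch (illustrative only; never called by the tests) of the
# dict-of-arrays form exercised by the ArrowNumpyDict tests below:
# _numpy_to_numpy_dict, imported above, unpacks a structured numpy array
# into a plain dict mapping column name to ndarray.
def _demoNumpyDictForm():
    data = _makeSimpleNumpyTable()
    as_dict = _numpy_to_numpy_dict(data)
    return {name: array.dtype for name, array in as_dict.items()}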


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.")
class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpyDict, using local file store."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        # Read the whole table.
        dict2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyDictEquality(dict1, dict2)

        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(dict1.keys()))
        for name in dict1.keys():
            self.assertIn(name, columns2)

        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(dict1["a"]))

        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))

        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        subdict = {key: dict1[key] for key in ["a", "c"]}
        self._checkNumpyDictEquality(subdict, tab3)

        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab4)

        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        subdict = {key: dict1[key] for key in ["index", "a"]}
        self._checkNumpyDictEquality(subdict, tab5)

        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        subdict = {key: dict1[key] for key in ["ddd"]}
        self._checkNumpyDictEquality(subdict, tab6)

        # Duplicate column requests should be de-duplicated.
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab7)

        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyDictReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_dict = arrow_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyDictReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # The dict columns may come back in a different order, so check
        # column by column. We also compare in dataframe form because
        # pandas changes the datatype of the string column.
        tab1_df = pd.DataFrame(tab1)

        self.assertEqual(set(tab1_df.columns), set(tab2.columns))
        for col in tab1_df.columns:
            self.assertTrue(np.all(tab1_df[col].values == tab2[col].values))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyDictReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_dict = _astropy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        tab2_dict = _numpy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictBad(self):
        # A scalar mixed in with arrays should be rejected.
        dict1 = {"a": 4, "b": np.ndarray([1])}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict1, self.datasetType, dataId={})

        # Mismatched array lengths should be rejected.
        dict2 = {"a": np.zeros(4), "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict2, self.datasetType, dataId={})

        # A plain list mixed in with arrays should be rejected.
        dict3 = {"a": [0] * 5, "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict3, self.datasetType, dataId={})

    def _checkNumpyDictEquality(self, dict1, dict2):
        """Check that two numpy dicts have the same columns and values.

        Parameters
        ----------
        dict1 : `dict` [`str`, `np.ndarray`]
            First dict to compare.
        dict2 : `dict` [`str`, `np.ndarray`]
            Second dict to compare.
        """
        self.assertEqual(set(dict1.keys()), set(dict2.keys()))
        for name in dict1.keys():
            self.assertEqual(dict1[name].dtype, dict2[name].dtype)
            self.assertTrue(np.all(dict1[name] == dict2[name]))
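

# A minimal sketch (illustrative only; never called by the tests) of the
# dict round trip through Arrow that the tests above drive via the butler:
# numpy_dict_to_arrow and arrow_to_numpy_dict are imported at the top of
# this module.
def _demoNumpyDictArrowRoundTrip():
    dict1 = _numpy_to_numpy_dict(_makeSimpleNumpyTable())
    arrow_table = numpy_dict_to_arrow(dict1)
    dict2 = arrow_to_numpy_dict(arrow_table)
    return set(dict1.keys()) == set(dict2.keys())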


@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.")
class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteNumpyDictBad(self):
        # Sub-type checking is not done by the in-memory datastore, so the
        # inherited bad-input test does not apply here.
        pass


@unittest.skipUnless(np is not None, "Cannot test compute_row_group_size without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test compute_row_group_size without pyarrow.")
class ComputeRowGroupSizeTestCase(unittest.TestCase):
    """Tests for compute_row_group_size."""

    def testRowGroupSizeNoMetadata(self):
        numpyTable = _makeSimpleNumpyTable(include_multidim=True)

        # We can't use the numpy_to_arrow convenience function because
        # that adds metadata.
        type_list = _numpy_dtype_to_arrow_types(numpyTable.dtype)
        schema = pa.schema(type_list)
        arrays = _numpy_style_arrays_to_arrow_arrays(
            numpyTable.dtype,
            len(numpyTable),
            numpyTable,
            schema,
        )
        arrowTable = pa.Table.from_arrays(arrays, schema=schema)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)
        self.assertLess(row_group_size, 2_000_000)

    def testRowGroupSizeWithMetadata(self):
        numpyTable = _makeSimpleNumpyTable(include_multidim=True)

        arrowTable = numpy_to_arrow(numpyTable)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)
        self.assertLess(row_group_size, 2_000_000)

    def testRowGroupSizeTinyTable(self):
        # A single boolean column implies a very small per-row size, so
        # the computed row group should allow many rows.
        numpyTable = np.zeros(1, dtype=[("a", np.bool_)])

        arrowTable = numpy_to_arrow(numpyTable)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)

    @unittest.skipUnless(pd is not None, "Cannot run testRowGroupSizeDataFrameWithLists without pandas.")
    def testRowGroupSizeDataFrameWithLists(self):
        df = pd.DataFrame({"a": np.zeros(10), "b": [[0, 0]] * 10, "c": [[0.0, 0.0]] * 10, "d": [[]] * 10})
        arrowTable = pandas_to_arrow(df)
        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)
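

# A minimal sketch (illustrative only; never called by the tests) of how a
# computed row group size could be applied when writing parquet directly
# with pyarrow; assumes pyarrow.parquet is available wherever pyarrow is.
def _demoWriteWithRowGroupSize(arrow_table, path):
    import pyarrow.parquet as pq

    row_group_size = compute_row_group_size(arrow_table.schema)
    pq.write_table(arrow_table, path, row_group_size=row_group_size)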


if __name__ == "__main__":
    unittest.main()