# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)

try:
    from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
except ImportError:
    atable = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
except ImportError:
    np = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
except ImportError:
    pa = None
try:
    from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
except ImportError:
    pd = None
try:
    from lsst.daf.butler.formatters.parquet import (
        ArrowAstropySchema,
        ArrowNumpySchema,
        DataFrameSchema,
        ParquetFormatter,
        _append_numpy_multidim_metadata,
        _astropy_to_numpy_dict,
        _numpy_dict_to_numpy,
        _numpy_dtype_to_arrow_types,
        _numpy_style_arrays_to_arrow_arrays,
        _numpy_to_numpy_dict,
        arrow_to_astropy,
        arrow_to_numpy,
        arrow_to_numpy_dict,
        arrow_to_pandas,
        astropy_to_arrow,
        compute_row_group_size,
        numpy_dict_to_arrow,
        numpy_to_arrow,
        pandas_to_arrow,
    )
except ImportError:
    pa = None
    pd = None
    atable = None
    np = None
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))
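

# Minimal sketch (an illustration, not a test) of the conversion helpers
# exercised throughout this module, assuming numpy and pyarrow imported
# successfully: a structured numpy array round-trips losslessly through an
# arrow table.
def _exampleRoundTripSketch():
    data = np.zeros(3, dtype=[("a", "f8"), ("b", "i4")])
    arrow_table = numpy_to_arrow(data)
    assert np.all(arrow_to_numpy(arrow_table) == data)
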
def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data
def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.
    include_lists : `bool`
        Include list columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    if include_lists:
        nrow = len(df)

        df["l1"] = [[0, 0]] * nrow
        df["l2"] = [[0.0, 0.0]] * nrow
        df["l3"] = [[]] * nrow

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns
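

# Usage sketch (an illustration, not a test), assuming pandas imported
# successfully: the helper's second return value covers the data columns plus
# the index column, so it can be compared against butler "columns" components.
def _exampleSingleIndexColumnsSketch():
    df, allColumns = _makeSingleIndexDataFrame()
    assert "index" in allColumns
    assert "a" in allColumns
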
def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df
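

# Sketch (an illustration, not a test), assuming pandas imported successfully:
# selecting on the first MultiIndex level mirrors the {"filter": ...} column
# parameters used in the multi-index tests below.
def _exampleMultiIndexSelectionSketch():
    df = _makeMultiIndexDataFrame()
    sub = df.loc[:, ["g"]]
    assert list(sub.columns.get_level_values("column")) == ["a", "b", "c"]
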
def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["a"].description = "Description of column a"
    table["b"].unit = units.meter
    table["b"].description = "Description of column b"

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table
def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)
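

# Sketch (an illustration, not a test), assuming astropy and pyarrow imported
# successfully: units and descriptions set in _makeSimpleAstropyTable survive
# conversion to arrow and back, as the round-trip tests below verify in full.
def _exampleUnitMetadataSketch():
    table = _makeSimpleAstropyTable()
    round_tripped = arrow_to_astropy(astropy_to_arrow(table))
    assert round_tripped["a"].unit == units.degree
    assert round_tripped["b"].description == "Description of column b"
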
@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testSingleIndexDataFrameWithLists(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_lists=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})

        # We need to check the list columns specially because they go
        # from lists to arrays.
        for col in ["l1", "l2", "l3"]:
            for i in range(len(df1)):
                self.assertTrue(np.all(df2[col].values[i] == df1[col].values[i]))

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None.
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None.
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.
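

# Sketch (an illustration, not a test), assuming pandas and numpy imported
# successfully: pandas represents missing values differently per dtype
# (nullable extension arrays vs. object columns holding None), which is why
# the masked-column tests above compare value-by-value rather than relying on
# Series.equals.
def _exampleMaskedColumnSketch():
    m1 = pd.array([1, None], dtype=pd.Int64Dtype())  # nullable integer column
    assert m1[1] is pd.NA
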
@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")
@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities.
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow conversion works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
        table2 : `astropy.table.Table`
        skip_units : `bool`
        has_bigendian : `bool`
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the type matches, forcing both to big-endian.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))
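

# Sketch (an illustration, not a test), assuming astropy and pyarrow imported
# successfully: an ArrowAstropySchema rebuilt from a serialized arrow schema
# should compare equal to one built directly from the table, which is what
# keeps the butler schema-component reads above consistent.
def _exampleAstropySchemaSketch():
    table = _makeSimpleAstropyTable()
    arrow_schema = astropy_to_arrow(table).schema
    assert ArrowAstropySchema.from_arrow(arrow_schema) == ArrowAstropySchema(table)
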
@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality.
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
        table2 : `numpy.ndarray`
        has_bigendian : `bool`
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the type matches, forcing both to big-endian.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")
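

# Sketch (an illustration, not a test), assuming numpy and pyarrow imported
# successfully: a structured array also round-trips through the plain
# column-dict representation used by the ArrowNumpyDict storage class.
def _exampleNumpyDictSketch():
    data = _makeSimpleNumpyTable()
    data_dict = arrow_to_numpy_dict(numpy_to_arrow(data))
    assert np.all(_numpy_dict_to_numpy(data_dict) == data)
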
1312@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
1313class ParquetFormatterArrowTableTestCase(unittest.TestCase):
1314 """Tests for ParquetFormatter, ArrowTable, using local file datastore."""
1316 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1318 def setUp(self):
1319 """Create a new butler root for each test."""
1320 self.root = makeTestTempDir(TESTDIR)
1321 config = Config(self.configFile)
1322 self.butler = Butler.from_config(
1323 Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
1324 )
1325 # No dimensions in dataset type so we don't have to worry about
1326 # inserting dimension data or defining data IDs.
1327 self.datasetType = DatasetType(
1328 "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.dimensions
1329 )
1330 self.butler.registry.registerDatasetType(self.datasetType)
1332 def tearDown(self):
1333 removeTestTempDir(self.root)
1335 def testArrowTable(self):
1336 tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)
1338 self.butler.put(tab1, self.datasetType, dataId={})
1339 # Read the whole Table.
1340 tab2 = self.butler.get(self.datasetType, dataId={})
1341 self.assertEqual(tab2, tab1)
1342 # Read the columns.
1343 columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
1344 self.assertEqual(len(columns2), len(tab1.schema.names))
1345 for i, name in enumerate(tab1.schema.names):
1346 self.assertEqual(columns2[i], name)
1347 # Read the rowcount.
1348 rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
1349 self.assertEqual(rowcount, len(tab1))
1350 # Read the schema.
1351 schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
1352 self.assertEqual(schema, tab1.schema)
1353 # Read just some columns a few different ways.
1354 tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
1355 self.assertEqual(tab3, tab1.select(("a", "c")))
1356 tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
1357 self.assertEqual(tab4, tab1.select(("a",)))
1358 tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
1359 self.assertEqual(tab5, tab1.select(("index", "a")))
1360 tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
1361 self.assertEqual(tab6, tab1.select(("ddd",)))
1362 tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
1363 self.assertEqual(tab7, tab1.select(("a",)))
1364 # Passing an unrecognized column should be a ValueError.
1365 with self.assertRaises(ValueError):
1366 self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})
1368 def testEmptyArrowTable(self):
1369 data = _makeSimpleNumpyTable()
1370 type_list = _numpy_dtype_to_arrow_types(data.dtype)
1372 schema = pa.schema(type_list)
1373 arrays = [[]] * len(schema.names)
1375 tab1 = pa.Table.from_arrays(arrays, schema=schema)
1377 self.butler.put(tab1, self.datasetType, dataId={})
1378 tab2 = self.butler.get(self.datasetType, dataId={})
1379 self.assertEqual(tab2, tab1)
1381 tab1_numpy = arrow_to_numpy(tab1)
1382 self.assertEqual(len(tab1_numpy), 0)
1383 tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
1384 self.assertEqual(tab1_numpy_arrow, tab1)
1386 tab1_pandas = arrow_to_pandas(tab1)
1387 self.assertEqual(len(tab1_pandas), 0)
1388 tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
1389 # Unfortunately, string/byte columns get mangled when translated
1390 # through empty pandas dataframes.
1391 self.assertEqual(
1392 tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
1393 tab1.select(("index", "a", "b", "c", "ddd")),
1394 )
1396 tab1_astropy = arrow_to_astropy(tab1)
1397 self.assertEqual(len(tab1_astropy), 0)
1398 tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
1399 self.assertEqual(tab1_astropy_arrow, tab1)
1401 def testEmptyArrowTableMultidim(self):
1402 data = _makeSimpleNumpyTable(include_multidim=True)
1403 type_list = _numpy_dtype_to_arrow_types(data.dtype)
1405 md = {}
1406 for name in data.dtype.names:
1407 _append_numpy_multidim_metadata(md, name, data.dtype[name])
1409 schema = pa.schema(type_list, metadata=md)
1410 arrays = [[]] * len(schema.names)
1412 tab1 = pa.Table.from_arrays(arrays, schema=schema)
1414 self.butler.put(tab1, self.datasetType, dataId={})
1415 tab2 = self.butler.get(self.datasetType, dataId={})
1416 self.assertEqual(tab2, tab1)
1418 tab1_numpy = arrow_to_numpy(tab1)
1419 self.assertEqual(len(tab1_numpy), 0)
1420 tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
1421 self.assertEqual(tab1_numpy_arrow, tab1)
1423 tab1_astropy = arrow_to_astropy(tab1)
1424 self.assertEqual(len(tab1_astropy), 0)
1425 tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
1426 self.assertEqual(tab1_astropy_arrow, tab1)
1428 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
1429 def testWriteArrowTableReadAsSingleIndexDataFrame(self):
1430 df1, allColumns = _makeSingleIndexDataFrame()
1432 self.butler.put(df1, self.datasetType, dataId={})
1434 # Read back out as a dataframe.
1435 df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
1436 self.assertTrue(df1.equals(df2))
1438 # Read back out as an arrow table, convert to dataframe.
1439 tab3 = self.butler.get(self.datasetType, dataId={})
1440 df3 = arrow_to_pandas(tab3)
1441 self.assertTrue(df1.equals(df3))
1443 # Check reading the columns.
1444 columns = df2.reset_index().columns
1445 columns2 = self.butler.get(
1446 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
1447 )
1448 # We check the set because pandas reorders the columns.
1449 self.assertEqual(set(columns2.to_list()), set(columns.to_list()))
1451 # Check reading the schema.
1452 schema = DataFrameSchema(df1)
1453 schema2 = self.butler.get(
1454 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
1455 )
1456 self.assertEqual(schema2, schema)
1458 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
1459 def testWriteArrowTableReadAsMultiIndexDataFrame(self):
1460 df1 = _makeMultiIndexDataFrame()
1462 self.butler.put(df1, self.datasetType, dataId={})
1464 # Read back out as a dataframe.
1465 df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
1466 self.assertTrue(df1.equals(df2))
1468 # Read back out as an arrow table, convert to dataframe.
1469 atab3 = self.butler.get(self.datasetType, dataId={})
1470 df3 = arrow_to_pandas(atab3)
1471 self.assertTrue(df1.equals(df3))
1473 # Check reading the columns.
1474 columns = df2.columns
1475 columns2 = self.butler.get(
1476 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
1477 )
1478 self.assertTrue(columns2.equals(columns))
1480 # Check reading the schema.
1481 schema = DataFrameSchema(df1)
1482 schema2 = self.butler.get(
1483 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
1484 )
1485 self.assertEqual(schema2, schema)
1487 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
1488 def testWriteArrowTableReadAsAstropyTable(self):
1489 tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)
1491 self.butler.put(tab1, self.datasetType, dataId={})
1493 # Read back out as an astropy table.
1494 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
1495 self._checkAstropyTableEquality(tab1, tab2)
1497 # Read back out as an arrow table, convert to astropy table.
1498 atab3 = self.butler.get(self.datasetType, dataId={})
1499 tab3 = arrow_to_astropy(atab3)
1500 self._checkAstropyTableEquality(tab1, tab3)
1502 # Check reading the columns.
1503 columns = list(tab2.columns.keys())
1504 columns2 = self.butler.get(
1505 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
1506 )
1507 self.assertEqual(columns2, columns)
1509 # Check reading the schema.
1510 schema = ArrowAstropySchema(tab1)
1511 schema2 = self.butler.get(
1512 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
1513 )
1514 self.assertEqual(schema2, schema)
1516 # Check the schema conversions and units.
1517 arrow_schema = schema.to_arrow_schema()
        for name in arrow_schema.names:
            field_metadata = arrow_schema.field(name).metadata
            if (
                b"description" in field_metadata
                and (description := field_metadata[b"description"].decode("UTF-8")) != ""
            ):
                self.assertEqual(schema2.schema[name].description, description)
            else:
                self.assertIsNone(schema2.schema[name].description)
            if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "":
                self.assertEqual(schema2.schema[name].unit, units.Unit(unit))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)
        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check that two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

    def _checkNumpyTableEquality(self, table1, table2):
        """Check that two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.")
class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpyDict, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})
        # Read the whole table.
        dict2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyDictEquality(dict1, dict2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(dict1.keys()))
        for name in dict1:
            self.assertIn(name, columns2)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(dict1["a"]))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        subdict = {key: dict1[key] for key in ["a", "c"]}
        self._checkNumpyDictEquality(subdict, tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        subdict = {key: dict1[key] for key in ["index", "a"]}
        self._checkNumpyDictEquality(subdict, tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        subdict = {key: dict1[key] for key in ["ddd"]}
        self._checkNumpyDictEquality(subdict, tab6)
        # A column requested more than once should only be read once.
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyDictReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_dict = arrow_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyDictReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # The order of the dict may get mixed up, so we need to check column
        # by column. We also need to do this in dataframe form because pandas
        # changes the datatype of the string column.
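        # (For instance, a fixed-width numpy unicode column such as "U10"
        # generally comes back from pandas as an object-dtype column, so a
        # direct dtype comparison would fail even though the values agree.)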
        tab1_df = pd.DataFrame(tab1)

        self.assertEqual(set(tab1_df.columns), set(tab2.columns))
        for col in tab1_df.columns:
            self.assertTrue(np.all(tab1_df[col].values == tab2[col].values))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyDictReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_dict = _astropy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        tab2_dict = _numpy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictBad(self):
        # A scalar mixed in with arrays is not a valid dict table.
        dict1 = {"a": 4, "b": np.ndarray([1])}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict1, self.datasetType, dataId={})

        # Columns of mismatched length are not valid either.
        dict2 = {"a": np.zeros(4), "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict2, self.datasetType, dataId={})

        # Every column must be a numpy array, not a plain list.
        dict3 = {"a": [0] * 5, "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict3, self.datasetType, dataId={})

    def _checkNumpyDictEquality(self, dict1, dict2):
        """Check that two numpy dicts have the same columns/values.

        Parameters
        ----------
        dict1 : `dict` [`str`, `np.ndarray`]
            First dict to compare.
        dict2 : `dict` [`str`, `np.ndarray`]
            Second dict to compare.
        """
        self.assertEqual(set(dict1.keys()), set(dict2.keys()))
        for name in dict1:
            self.assertEqual(dict1[name].dtype, dict2[name].dtype)
            self.assertTrue(np.all(dict1[name] == dict2[name]))


@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.")
class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteNumpyDictBad(self):
        # The sub-type checking is not done on the in-memory datastore, so
        # the invalid-input tests from the parent class are skipped here.
        pass


@unittest.skipUnless(pa is not None, "Cannot test ArrowSchema without pyarrow.")
class ParquetFormatterArrowSchemaTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowSchema, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowSchema", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def _makeTestSchema(self):
        schema = pa.schema(
            [
                pa.field(
                    "int32",
                    pa.int32(),
                    nullable=False,
                    metadata={
                        "description": "32-bit integer",
                        "unit": "",
                    },
                ),
                pa.field(
                    "int64",
                    pa.int64(),
                    nullable=False,
                    metadata={
                        "description": "64-bit integer",
                        "unit": "",
                    },
                ),
                pa.field(
                    "uint64",
                    pa.uint64(),
                    nullable=False,
                    metadata={
                        "description": "64-bit unsigned integer",
                        "unit": "",
                    },
                ),
                pa.field(
                    "float32",
                    pa.float32(),
                    nullable=False,
                    metadata={
                        "description": "32-bit float",
                        "unit": "count",
                    },
                ),
                pa.field(
                    "float64",
                    pa.float64(),
                    nullable=False,
                    metadata={
                        "description": "64-bit float",
                        "unit": "nJy",
                    },
                ),
                pa.field(
                    "fixed_size_list",
                    pa.list_(pa.float64(), list_size=10),
                    nullable=False,
                    metadata={
                        "description": "Fixed size list of 64-bit floats.",
                        "unit": "nJy",
                    },
                ),
                pa.field(
                    "variable_size_list",
                    pa.list_(pa.float64()),
                    nullable=False,
                    metadata={
                        "description": "Variable size list of 64-bit floats.",
                        "unit": "nJy",
                    },
                ),
                # This field has a unit but no description.
                pa.field(
                    "string",
                    pa.string(),
                    nullable=False,
                    metadata={
                        "unit": "",
                    },
                ),
                # This field has no metadata at all.
                pa.field(
                    "binary",
                    pa.binary(),
                    nullable=False,
                ),
            ]
        )

        return schema

    def testArrowSchema(self):
        schema1 = self._makeTestSchema()
        self.butler.put(schema1, self.datasetType, dataId={})

        schema2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(schema2, schema1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe schema without pandas.")
    def testWriteArrowSchemaReadAsDataFrameSchema(self):
        schema1 = self._makeTestSchema()
        self.butler.put(schema1, self.datasetType, dataId={})

        df_schema1 = DataFrameSchema.from_arrow(schema1)

        df_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrameSchema")
        self.assertEqual(df_schema2, df_schema1)

    @unittest.skipUnless(atable is not None, "Cannot test reading as an astropy schema without astropy.")
    def testWriteArrowSchemaReadAsArrowAstropySchema(self):
        schema1 = self._makeTestSchema()
        self.butler.put(schema1, self.datasetType, dataId={})

        ap_schema1 = ArrowAstropySchema.from_arrow(schema1)

        ap_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropySchema")
        self.assertEqual(ap_schema2, ap_schema1)

        # Confirm that ap_schema2 has the unit/description we expect.
        for name in schema1.names:
            field_metadata = schema1.field(name).metadata
            if field_metadata is None:
                continue
            if (
                b"description" in field_metadata
                and (description := field_metadata[b"description"].decode("UTF-8")) != ""
            ):
                self.assertEqual(ap_schema2.schema[name].description, description)
            else:
                self.assertIsNone(ap_schema2.schema[name].description)
            if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "":
                self.assertEqual(ap_schema2.schema[name].unit, units.Unit(unit))

    @unittest.skipUnless(np is not None, "Cannot test reading as a numpy schema without numpy.")
    def testWriteArrowSchemaReadAsArrowNumpySchema(self):
        schema1 = self._makeTestSchema()
        self.butler.put(schema1, self.datasetType, dataId={})

        np_schema1 = ArrowNumpySchema.from_arrow(schema1)

        np_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpySchema")
        self.assertEqual(np_schema2, np_schema1)


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowSchemaDelegate without pyarrow.")
class InMemoryArrowSchemaDelegateTestCase(ParquetFormatterArrowSchemaTestCase):
    """Tests for InMemoryDatastore and ArrowSchema."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
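
# For reference, a minimal sketch of how compute_row_group_size (exercised by
# the tests below) might be used when writing a Parquet file. The helper comes
# from lsst.daf.butler.formatters.parquet; ``pyarrow.parquet.write_table`` and
# its ``row_group_size`` argument are standard pyarrow API. This sketch is
# illustrative only and is not executed by these tests:
#
#     import pyarrow.parquet as pq
#
#     row_group_size = compute_row_group_size(arrow_table.schema)
#     pq.write_table(arrow_table, "data.parq", row_group_size=row_group_size)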


@unittest.skipUnless(np is not None, "Cannot test compute_row_group_size without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test compute_row_group_size without pyarrow.")
class ComputeRowGroupSizeTestCase(unittest.TestCase):
    """Tests for compute_row_group_size."""

    def testRowGroupSizeNoMetadata(self):
        numpyTable = _makeSimpleNumpyTable(include_multidim=True)

        # We can't use the numpy_to_arrow convenience function because
        # that adds metadata.
        type_list = _numpy_dtype_to_arrow_types(numpyTable.dtype)
        schema = pa.schema(type_list)
        arrays = _numpy_style_arrays_to_arrow_arrays(
            numpyTable.dtype,
            len(numpyTable),
            numpyTable,
            schema,
        )
        arrowTable = pa.Table.from_arrays(arrays, schema=schema)

        row_group_size = compute_row_group_size(arrowTable.schema)
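
        # The asserted 1-2 million row window is tied to the schema size:
        # compute_row_group_size presumably estimates the bytes per row from
        # the schema and targets a roughly fixed per-row-group footprint, so
        # this small table schema should land in that range.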

        self.assertGreater(row_group_size, 1_000_000)
        self.assertLess(row_group_size, 2_000_000)

    def testRowGroupSizeWithMetadata(self):
        numpyTable = _makeSimpleNumpyTable(include_multidim=True)

        arrowTable = numpy_to_arrow(numpyTable)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)
        self.assertLess(row_group_size, 2_000_000)

    def testRowGroupSizeTinyTable(self):
        numpyTable = np.zeros(1, dtype=[("a", np.bool_)])

        arrowTable = numpy_to_arrow(numpyTable)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)

    @unittest.skipUnless(pd is not None, "Cannot run testRowGroupSizeDataFrameWithLists without pandas.")
    def testRowGroupSizeDataFrameWithLists(self):
        # Variable-size list columns have no fixed per-row width, so this
        # checks that the size computation still returns something sensible.
        df = pd.DataFrame({"a": np.zeros(10), "b": [[0, 0]] * 10, "c": [[0.0, 0.0]] * 10, "d": [[]] * 10})
        arrowTable = pandas_to_arrow(df)
        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)


if __name__ == "__main__":
    unittest.main()