# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""
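
# This module can be run on its own (a sketch, assuming a daf_butler checkout
# with the tests/config/basic butler configurations in place):
#
#     python -m pytest tests/test_parquet.py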

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)

try:
    from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
except ImportError:
    atable = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
except ImportError:
    np = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
except ImportError:
    pa = None
try:
    from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
except ImportError:
    pd = None
try:
    from lsst.daf.butler.formatters.parquet import (
        ArrowAstropySchema,
        ArrowNumpySchema,
        DataFrameSchema,
        ParquetFormatter,
        _append_numpy_multidim_metadata,
        _astropy_to_numpy_dict,
        _numpy_dict_to_numpy,
        _numpy_dtype_to_arrow_types,
        _numpy_style_arrays_to_arrow_arrays,
        _numpy_to_numpy_dict,
        arrow_to_astropy,
        arrow_to_numpy,
        arrow_to_numpy_dict,
        arrow_to_pandas,
        astropy_to_arrow,
        compute_row_group_size,
        numpy_dict_to_arrow,
        numpy_to_arrow,
        pandas_to_arrow,
    )
except ImportError:
    pa = None
    pd = None
    atable = None
    np = None
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data
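
# For reference, the base table above has five rows and columns
# ("index", "a", "b", "c", "ddd", "f", "strcol", "bytecol"); the
# include_multidim and include_bigendian flags add the ("d1", "d2", "d3") and
# ("a_bigendian", "f_bigendian") columns that exercise the multidimensional
# and byte-order handling in the tests below.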


def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.
    include_lists : `bool`
        Include list columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    if include_lists:
        nrow = len(df)

        df["l1"] = [[0, 0]] * nrow
        df["l2"] = [[0.0, 0.0]] * nrow
        df["l3"] = [[]] * nrow

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns
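
# Note that allColumns is the data columns followed by the name of the index
# ("index"), which is what the DataFrame "columns" component is expected to
# return after a parquet round trip.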


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df
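
# Columns in this frame are addressed through the two-level
# ("filter", "column") MultiIndex, e.g. df.loc[:, ("g", "a")], or with the
# dict-style parameters={"columns": {"filter": "g"}} used in the tests below.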


def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["a"].description = "Description of column a"
    table["b"].unit = units.meter
    table["b"].description = "Description of column b"

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)
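
# The units and descriptions attached to the astropy columns are carried in
# the per-field metadata of the arrow schema (the "unit" and "description"
# keys checked in the ArrowTable tests below), which is how they survive the
# conversion to and from parquet.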


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run=self.run)
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testSingleIndexDataFrameWithLists(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_lists=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})

        # We need to check the list columns specially because they go
        # from lists to arrays.
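        # (A cell stored as the Python list [0, 0] is read back as
        # np.array([0, 0]), so we compare element-wise instead of using
        # DataFrame.equals.)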
        for col in ["l1", "l2", "l3"]:
            for i in range(len(df1)):
                self.assertTrue(np.all(df2[col].values[i] == df1[col].values[i]))

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
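        # (Concretely: after masking, the Int64 extension column holds pd.NA,
        # the float32 column holds NaN, and the object string column holds
        # None, so each needs its own comparison path below.)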
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run=self.run)
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow conversion works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.
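        # (For example, the units.degree attached to column "a" has no
        # DataFrame equivalent and is simply dropped in the conversion.)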

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip checking the units, descriptions, and formats.
        has_bigendian : `bool`
            One of the tables contains big-endian columns.
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the types match, forcing both sides to
                # big-endian for the comparison.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.
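        # (pandas stores strings as object arrays, so the original fixed-width
        # "U10" dtype cannot be recovered from the DataFrame; we therefore
        # compare in DataFrame space.)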

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            One of the tables contains big-endian columns.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the types match, forcing both sides to
                # big-endian for the comparison.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
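        # (With no rows to inspect, pandas cannot recover the original
        # fixed-width string/byte types, so only the columns whose types
        # survive the empty round trip are compared here.)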
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    def testEmptyArrowTableMultidim(self):
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

        # Check the schema conversions and units.
        arrow_schema = schema.to_arrow_schema()
        for name in arrow_schema.names:
            field_metadata = arrow_schema.field(name).metadata
            if (
                b"description" in field_metadata
                and (description := field_metadata[b"description"].decode("UTF-8")) != ""
            ):
                self.assertEqual(schema2.schema[name].description, description)
            else:
                self.assertIsNone(schema2.schema[name].description)
            if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "":
                self.assertEqual(schema2.schema[name].unit, units.Unit(unit))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)
        self._checkNumpyTableEquality(tab1, tab2_numpy)
1561 def _checkAstropyTableEquality(self, table1, table2):
1562 """Check that two astropy tables have the same columns/values.
1564 Parameters
1565 ----------
1566 table1 : `astropy.table.Table`
    First table to compare.
1567 table2 : `astropy.table.Table`
    Second table to compare.
1568 """
1569 self.assertEqual(table1.dtype, table2.dtype)
1570 for name in table1.columns:
1571 self.assertEqual(table1[name].unit, table2[name].unit)
1572 self.assertEqual(table1[name].description, table2[name].description)
1573 self.assertEqual(table1[name].format, table2[name].format)
1574 self.assertTrue(np.all(table1 == table2))
1576 def _checkNumpyTableEquality(self, table1, table2):
1577 """Check that two numpy tables have the same columns/values.
1579 Parameters
1580 ----------
1581 table1 : `numpy.ndarray`
    First table to compare.
1582 table2 : `numpy.ndarray`
    Second table to compare.
1583 """
1584 self.assertEqual(table1.dtype.names, table2.dtype.names)
1585 for name in table1.dtype.names:
1586 self.assertEqual(table1.dtype[name], table2.dtype[name])
1587 self.assertTrue(np.all(table1 == table2))
1590@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
1591class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
1592 """Tests for InMemoryDatastore, using ArrowTableDelegate."""
1594 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1596 def testBadInput(self):
1597 tab1 = _makeSimpleArrowTable()
1598 delegate = ArrowTableDelegate("ArrowTable")
1600 with self.assertRaises(ValueError):
1601 delegate.handleParameters(inMemoryDataset="not_an_arrow_table")
1603 with self.assertRaises(NotImplementedError):
1604 delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})
1606 with self.assertRaises(AttributeError):
1607 delegate.getComponent(composite=tab1, componentName="nothing")
1609 def testStorageClass(self):
1610 tab1 = _makeSimpleArrowTable()
1612 factory = StorageClassFactory()
1613 factory.addFromConfig(StorageClassConfig())
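# findStorageClass should resolve a pyarrow Table to "ArrowTable"
# whether it matches on the python type name alone (compare_types=False)
# or also considers compatible types (compare_types=True).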
1615 storageClass = factory.findStorageClass(type(tab1), compare_types=False)
1616 # Force the name lookup to do name matching.
1617 storageClass._pytype = None
1618 self.assertEqual(storageClass.name, "ArrowTable")
1620 storageClass = factory.findStorageClass(type(tab1), compare_types=True)
1621 # Force the name lookup to do name matching.
1622 storageClass._pytype = None
1623 self.assertEqual(storageClass.name, "ArrowTable")
1626@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
1627@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
1628class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase):
1629 """Tests for ParquetFormatter, ArrowNumpyDict, using local file store."""
1631 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1633 def setUp(self):
1634 """Create a new butler root for each test."""
1635 self.root = makeTestTempDir(TESTDIR)
1636 config = Config(self.configFile)
1637 self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
1638 # No dimensions in dataset type so we don't have to worry about
1639 # inserting dimension data or defining data IDs.
1640 self.datasetType = DatasetType(
1641 "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.dimensions
1642 )
1643 self.butler.registry.registerDatasetType(self.datasetType)
1645 def tearDown(self):
1646 removeTestTempDir(self.root)
1648 def testNumpyDict(self):
1649 tab1 = _makeSimpleNumpyTable(include_multidim=True)
1650 dict1 = _numpy_to_numpy_dict(tab1)
1652 self.butler.put(dict1, self.datasetType, dataId={})
1653 # Read the whole table.
1654 dict2 = self.butler.get(self.datasetType, dataId={})
1655 self._checkNumpyDictEquality(dict1, dict2)
1656 # Read the columns.
1657 columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
1658 self.assertEqual(len(columns2), len(dict1.keys()))
1659 for name in dict1:
1660 self.assertIn(name, columns2)
1661 # Read the rowcount.
1662 rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
1663 self.assertEqual(rowcount, len(dict1["a"]))
1664 # Read the schema.
1665 schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
1666 self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
1667 # Read just some columns a few different ways.
1668 tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
1669 subdict = {key: dict1[key] for key in ["a", "c"]}
1670 self._checkNumpyDictEquality(subdict, tab3)
1671 tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
1672 subdict = {key: dict1[key] for key in ["a"]}
1673 self._checkNumpyDictEquality(subdict, tab4)
1674 tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
1675 subdict = {key: dict1[key] for key in ["index", "a"]}
1676 self._checkNumpyDictEquality(subdict, tab5)
1677 tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
1678 subdict = {key: dict1[key] for key in ["ddd"]}
1679 self._checkNumpyDictEquality(subdict, tab6)
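# Duplicate names in the columns parameter should be de-duplicated,
# so requesting ["a", "a"] yields a single "a" column.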
1680 tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
1681 subdict = {key: dict1[key] for key in ["a"]}
1682 self._checkNumpyDictEquality(subdict, tab7)
1683 # Passing an unrecognized column should be a ValueError.
1684 with self.assertRaises(ValueError):
1685 self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})
1687 @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
1688 def testWriteNumpyDictReadAsArrowTable(self):
1689 tab1 = _makeSimpleNumpyTable(include_multidim=True)
1690 dict1 = _numpy_to_numpy_dict(tab1)
1692 self.butler.put(dict1, self.datasetType, dataId={})
1694 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")
1696 tab2_dict = arrow_to_numpy_dict(tab2)
1698 self._checkNumpyDictEquality(dict1, tab2_dict)
1700 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
1701 def testWriteNumpyDictReadAsDataFrame(self):
1702 tab1 = _makeSimpleNumpyTable()
1703 dict1 = _numpy_to_numpy_dict(tab1)
1705 self.butler.put(dict1, self.datasetType, dataId={})
1707 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
1709 # The order of the dict may get mixed up, so we need to check column
1710 # by column. We also need to do this in dataframe form because pandas
1711 # changes the datatype of the string column.
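# (For example, a numpy unicode column such as dtype "U10" typically
# becomes dtype "object" in pandas, so a direct dtype comparison would
# fail even though the values round-trip intact.)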
1712 tab1_df = pd.DataFrame(tab1)
1714 self.assertEqual(set(tab1_df.columns), set(tab2.columns))
1715 for col in tab1_df.columns:
1716 self.assertTrue(np.all(tab1_df[col].values == tab2[col].values))
1718 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
1719 def testWriteNumpyDictReadAsAstropyTable(self):
1720 tab1 = _makeSimpleNumpyTable(include_multidim=True)
1721 dict1 = _numpy_to_numpy_dict(tab1)
1723 self.butler.put(dict1, self.datasetType, dataId={})
1725 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
1726 tab2_dict = _astropy_to_numpy_dict(tab2)
1728 self._checkNumpyDictEquality(dict1, tab2_dict)
1730 def testWriteNumpyDictReadAsNumpyTable(self):
1731 tab1 = _makeSimpleNumpyTable(include_multidim=True)
1732 dict1 = _numpy_to_numpy_dict(tab1)
1734 self.butler.put(dict1, self.datasetType, dataId={})
1736 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
1737 tab2_dict = _numpy_to_numpy_dict(tab2)
1739 self._checkNumpyDictEquality(dict1, tab2_dict)
1741 def testWriteNumpyDictBad(self):
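# Each of these dicts should be rejected on put: a scalar value, then
# mismatched column lengths, then a plain list instead of an ndarray.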
1742 dict1 = {"a": 4, "b": np.ndarray([1])}
1743 with self.assertRaises(RuntimeError):
1744 self.butler.put(dict1, self.datasetType, dataId={})
1746 dict2 = {"a": np.zeros(4), "b": np.zeros(5)}
1747 with self.assertRaises(RuntimeError):
1748 self.butler.put(dict2, self.datasetType, dataId={})
1750 dict3 = {"a": [0] * 5, "b": np.zeros(5)}
1751 with self.assertRaises(RuntimeError):
1752 self.butler.put(dict3, self.datasetType, dataId={})
1754 def _checkNumpyDictEquality(self, dict1, dict2):
1755 """Check that two numpy dicts have the same columns/values.
1757 Parameters
1758 ----------
1759 dict1 : `dict` [`str`, `np.ndarray`]
    First dict to compare.
1760 dict2 : `dict` [`str`, `np.ndarray`]
    Second dict to compare.
1761 """
1762 self.assertEqual(set(dict1.keys()), set(dict2.keys()))
1763 for name in dict1:
1764 self.assertEqual(dict1[name].dtype, dict2[name].dtype)
1765 self.assertTrue(np.all(dict1[name] == dict2[name]))
1768@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
1769@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
1770class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase):
1771 """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate."""
1773 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1775 def testWriteNumpyDictBad(self):
1776 # The sub-type checking is not done by the in-memory datastore.
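# (The in-memory datastore keeps the python object as-is, so put()
# never converts to arrow and the formatter's per-column validation
# never runs.)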
1777 pass
1780@unittest.skipUnless(pa is not None, "Cannot test ArrowSchema without pyarrow.")
1781class ParquetFormatterArrowSchemaTestCase(unittest.TestCase):
1782 """Tests for ParquetFormatter, ArrowSchema, using local file datastore."""
1784 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1786 def setUp(self):
1787 """Create a new butler root for each test."""
1788 self.root = makeTestTempDir(TESTDIR)
1789 config = Config(self.configFile)
1790 self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
1791 # No dimensions in dataset type so we don't have to worry about
1792 # inserting dimension data or defining data IDs.
1793 self.datasetType = DatasetType(
1794 "data", dimensions=(), storageClass="ArrowSchema", universe=self.butler.dimensions
1795 )
1796 self.butler.registry.registerDatasetType(self.datasetType)
1798 def tearDown(self):
1799 removeTestTempDir(self.root)
1801 def _makeTestSchema(self):
1802 schema = pa.schema(
1803 [
1804 pa.field(
1805 "int32",
1806 pa.int32(),
1807 nullable=False,
1808 metadata={
1809 "description": "32-bit integer",
1810 "unit": "",
1811 },
1812 ),
1813 pa.field(
1814 "int64",
1815 pa.int64(),
1816 nullable=False,
1817 metadata={
1818 "description": "64-bit integer",
1819 "unit": "",
1820 },
1821 ),
1822 pa.field(
1823 "uint64",
1824 pa.uint64(),
1825 nullable=False,
1826 metadata={
1827 "description": "64-bit unsigned integer",
1828 "unit": "",
1829 },
1830 ),
1831 pa.field(
1832 "float32",
1833 pa.float32(),
1834 nullable=False,
1835 metadata={
1836 "description": "32-bit float",
1837 "unit": "count",
1838 },
1839 ),
1840 pa.field(
1841 "float64",
1842 pa.float64(),
1843 nullable=False,
1844 metadata={
1845 "description": "64-bit float",
1846 "unit": "nJy",
1847 },
1848 ),
1849 pa.field(
1850 "fixed_size_list",
1851 pa.list_(pa.float64(), list_size=10),
1852 nullable=False,
1853 metadata={
1854 "description": "Fixed size list of 64-bit floats.",
1855 "unit": "nJy",
1856 },
1857 ),
1858 pa.field(
1859 "variable_size_list",
1860 pa.list_(pa.float64()),
1861 nullable=False,
1862 metadata={
1863 "description": "Variable size list of 64-bit floats.",
1864 "unit": "nJy",
1865 },
1866 ),
1867 # This field deliberately has no description.
1868 pa.field(
1869 "string",
1870 pa.string(),
1871 nullable=False,
1872 metadata={
1873 "unit": "",
1874 },
1875 ),
1876 # This field deliberately has no metadata at all.
1877 pa.field(
1878 "binary",
1879 pa.binary(),
1880 nullable=False,
1881 ),
1882 ]
1883 )
1885 return schema
1887 def testArrowSchema(self):
1888 schema1 = self._makeTestSchema()
1889 self.butler.put(schema1, self.datasetType, dataId={})
1891 schema2 = self.butler.get(self.datasetType, dataId={})
1892 self.assertEqual(schema2, schema1)
1894 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe schema without pandas.")
1895 def testWriteArrowSchemaReadAsDataFrameSchema(self):
1896 schema1 = self._makeTestSchema()
1897 self.butler.put(schema1, self.datasetType, dataId={})
1899 df_schema1 = DataFrameSchema.from_arrow(schema1)
1901 df_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrameSchema")
1902 self.assertEqual(df_schema2, df_schema1)
1904 @unittest.skipUnless(atable is not None, "Cannot test reading as an astropy schema without astropy.")
1905 def testWriteArrowSchemaReadAsArrowAstropySchema(self):
1906 schema1 = self._makeTestSchema()
1907 self.butler.put(schema1, self.datasetType, dataId={})
1909 ap_schema1 = ArrowAstropySchema.from_arrow(schema1)
1911 ap_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropySchema")
1912 self.assertEqual(ap_schema2, ap_schema1)
1914 # Confirm that the ap_schema2 has the unit/description we expect.
1915 for name in schema1.names:
1916 field_metadata = schema1.field(name).metadata
1917 if field_metadata is None:
1918 continue
1919 if (
1920 b"description" in field_metadata
1921 and (description := field_metadata[b"description"].decode("UTF-8")) != ""
1922 ):
1923 self.assertEqual(ap_schema2.schema[name].description, description)
1924 else:
1925 self.assertIsNone(ap_schema2.schema[name].description)
1926 if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "":
1927 self.assertEqual(ap_schema2.schema[name].unit, units.Unit(unit))
1929 @unittest.skipUnless(np is not None, "Cannot test reading as a numpy schema without numpy.")
1930 def testWriteArrowSchemaReadAsArrowNumpySchema(self):
1931 schema1 = self._makeTestSchema()
1932 self.butler.put(schema1, self.datasetType, dataId={})
1934 np_schema1 = ArrowNumpySchema.from_arrow(schema1)
1936 np_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpySchema")
1937 self.assertEqual(np_schema2, np_schema1)
1940@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowSchemaDelegate without pyarrow.")
1941class InMemoryArrowSchemaDelegateTestCase(ParquetFormatterArrowSchemaTestCase):
1942 """Tests for InMemoryDatastore and ArrowSchema."""
1944 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1947@unittest.skipUnless(np is not None, "Cannot test compute_row_group_size without numpy.")
1948@unittest.skipUnless(pa is not None, "Cannot test compute_row_group_size without pyarrow.")
1949class ComputeRowGroupSizeTestCase(unittest.TestCase):
1950 """Tests for compute_row_group_size."""
1952 def testRowGroupSizeNoMetadata(self):
1953 numpyTable = _makeSimpleNumpyTable(include_multidim=True)
1955 # We can't use the numpy_to_arrow convenience function because
1956 # that adds metadata.
1957 type_list = _numpy_dtype_to_arrow_types(numpyTable.dtype)
1958 schema = pa.schema(type_list)
1959 arrays = _numpy_style_arrays_to_arrow_arrays(
1960 numpyTable.dtype,
1961 len(numpyTable),
1962 numpyTable,
1963 schema,
1964 )
1965 arrowTable = pa.Table.from_arrays(arrays, schema=schema)
1967 row_group_size = compute_row_group_size(arrowTable.schema)
1969 self.assertGreater(row_group_size, 1_000_000)
1970 self.assertLess(row_group_size, 2_000_000)
1972 def testRowGroupSizeWithMetadata(self):
1973 numpyTable = _makeSimpleNumpyTable(include_multidim=True)
1975 arrowTable = numpy_to_arrow(numpyTable)
1977 row_group_size = compute_row_group_size(arrowTable.schema)
1979 self.assertGreater(row_group_size, 1_000_000)
1980 self.assertLess(row_group_size, 2_000_000)
1982 def testRowGroupSizeTinyTable(self):
1983 numpyTable = np.zeros(1, dtype=[("a", np.bool_)])
1985 arrowTable = numpy_to_arrow(numpyTable)
1987 row_group_size = compute_row_group_size(arrowTable.schema)
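# With a single boolean column the per-row footprint is tiny, so the
# derived rows-per-group count should come out very large even though
# the table itself has only one row.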
1989 self.assertGreater(row_group_size, 1_000_000)
1991 @unittest.skipUnless(pd is not None, "Cannot run testRowGroupSizeDataFrameWithLists without pandas.")
1992 def testRowGroupSizeDataFrameWithLists(self):
1993 df = pd.DataFrame({"a": np.zeros(10), "b": [[0, 0]] * 10, "c": [[0.0, 0.0]] * 10, "d": [[]] * 10})
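# Variable-length list columns (including an all-empty one) have no
# fixed per-row size, so this mainly checks that the estimator still
# returns something sensible for them.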
1994 arrowTable = pandas_to_arrow(df)
1995 row_group_size = compute_row_group_size(arrowTable.schema)
1997 self.assertGreater(row_group_size, 1_000_000)
2000if __name__ == "__main__":
2001 unittest.main()