# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""
import os
import unittest
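
# Optional dependencies are imported inside try/except blocks; when an
# import fails, the corresponding module handle is set to None so that the
# @unittest.skipUnless decorators below can disable the affected tests.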
try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)

try:
    from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
except ImportError:
    atable = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
except ImportError:
    np = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
except ImportError:
    pa = None
try:
    from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
except ImportError:
    pd = None
try:
    from lsst.daf.butler.formatters.parquet import (
        ArrowAstropySchema,
        ArrowNumpySchema,
        DataFrameSchema,
        ParquetFormatter,
        _append_numpy_multidim_metadata,
        _astropy_to_numpy_dict,
        _numpy_dict_to_numpy,
        _numpy_dtype_to_arrow_types,
        _numpy_style_arrays_to_arrow_arrays,
        _numpy_to_numpy_dict,
        arrow_to_astropy,
        arrow_to_numpy,
        arrow_to_numpy_dict,
        arrow_to_pandas,
        astropy_to_arrow,
        compute_row_group_size,
        numpy_dict_to_arrow,
        numpy_to_arrow,
        pandas_to_arrow,
    )
except ImportError:
    pa = None
    pd = None
    atable = None
    np = None
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data
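
# A minimal usage sketch (illustrative comment only, not executed):
#
#     data = _makeSimpleNumpyTable(include_multidim=True)
#     data.dtype.names[:2]  # ("index", "a")
#     data["d2"].shape      # (5, 5, 10): five rows of 5x10 integer blocks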


def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.
    include_lists : `bool`
        Include list columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    if include_lists:
        nrow = len(df)

        df["l1"] = [[0, 0]] * nrow
        df["l2"] = [[0.0, 0.0]] * nrow
        df["l3"] = [[]] * nrow

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns
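
# Usage sketch (illustrative comment only): the second return value folds
# the index name back into the column list, e.g.
#
#     df, allColumns = _makeSingleIndexDataFrame(include_masked=True)
#     df.index.name          # "index"
#     "index" in allColumns  # True; "m1" uses the nullable pd.Int64Dtype()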


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df
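
# Usage sketch (illustrative comment only): the columns form a two-level
# ("filter", "column") MultiIndex, so a single column is addressed by a
# tuple, e.g.
#
#     df = _makeMultiIndexDataFrame()
#     df[("g", "a")]  # the "a" column for filter "g"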


def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["a"].description = "Description of column a"
    table["b"].unit = units.meter
    table["b"].description = "Description of column b"

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        # We set the masked columns with the underlying sentinel value
        # to be able to test after serialization.
        arr = np.arange(nrow, dtype="i8")
        arr[mask] = -1
        table["m1"] = np.ma.masked_array(data=arr, mask=mask, fill_value=-1)
        arr = np.arange(nrow, dtype="f4")
        arr[mask] = np.nan
        table["m2"] = np.ma.masked_array(data=arr, mask=mask, fill_value=np.nan)
        table["m3"] = np.arange(nrow, dtype="f4")
        table["m3"][mask] = np.nan
        arr = np.zeros(nrow, dtype=np.bool_)
        arr[mask] = True
        table["m4"] = np.ma.masked_array(data=arr, mask=mask, fill_value=True)
        arr = np.arange(nrow, dtype="u4")
        arr[mask] = 0
        table["m5"] = np.ma.masked_array(data=arr, mask=mask, fill_value=0)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask, fill_value="")
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask, fill_value=b"")

    return table
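
# Usage sketch (illustrative comment only): the units and sentinel fill
# values set above are what the round-trip tests check after serialization,
# e.g.
#
#     tab = _makeSimpleAstropyTable(include_masked=True)
#     tab["a"].unit         # units.degree
#     tab["m1"].fill_value  # -1, the sentinel checked after serialization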


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)
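
# Usage sketch (illustrative comment only): the arrow table produced here
# appears to retain the astropy column metadata (units, descriptions) in
# its schema, which the schema-component tests below rely on, e.g.
#
#     tab = _makeSimpleArrowTable()
#     tab.schema.names[0]  # "index"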


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testSingleIndexDataFrameWithLists(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_lists=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})

        # We need to check the list columns specially because they go
        # from lists to arrays.
        for col in ["l1", "l2", "l3"]:
            for i in range(len(df1)):
                self.assertTrue(np.all(df2[col].values[i] == df1[col].values[i]))

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck; it doesn't really round-trip. This test
        # simply checks that it's readable, but it is definitely not
        # recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck; it doesn't really round-trip. This test
        # simply checks that it's readable, but it is definitely not
        # recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck; it doesn't really round-trip. This test
        # simply checks that it's readable, but it is definitely not
        # recommended.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow conversion works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.
        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip comparison of the unit/description/format attributes.
        has_bigendian : `bool`
            One of the tables contains big-endian columns.
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the type matches, forcing both sides to
                # the same byte order for the comparison.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
                # We need to check masked/regular columns after filling.
                has_masked = False
                if isinstance(table1[name], atable.column.MaskedColumn):
                    c1 = table1[name].filled()
                    has_masked = True
                else:
                    c1 = np.array(table1[name])
                if isinstance(table2[name], atable.column.MaskedColumn):
                    c2 = table2[name].filled()
                    has_masked = True
                else:
                    c2 = np.array(table2[name])
                np.testing.assert_array_equal(c1, c2)
                # If we have a masked column then we test the underlying data.
                if has_masked:
                    np.testing.assert_array_equal(np.array(c1), np.array(c2))


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.
        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            One of the tables contains big-endian columns.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the type matches, forcing both sides to
                # the same byte order for the comparison.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")
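
# Note: the InMemory*DelegateTestCase classes rerun the inherited formatter
# tests against an InMemoryDatastore (config/basic/butler-inmemory.yaml), so
# each conversion path is exercised both through the ParquetFormatter and
# through the corresponding in-memory storage delegate.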


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # We convert to numpy and use its testing framework, which handles
        # nan comparisons.
        self.assertEqual(tab1.schema, tab2.schema)
        tab1_np = arrow_to_numpy(tab1)
        tab2_np = arrow_to_numpy(tab2)
        for col in tab1.column_names:
            np.testing.assert_array_equal(tab2_np[col], tab1_np[col])
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)
1436 def testEmptyArrowTableMultidim(self):
1437 data = _makeSimpleNumpyTable(include_multidim=True)
1438 type_list = _numpy_dtype_to_arrow_types(data.dtype)
1440 md = {}
1441 for name in data.dtype.names:
1442 _append_numpy_multidim_metadata(md, name, data.dtype[name])
1444 schema = pa.schema(type_list, metadata=md)
1445 arrays = [[]] * len(schema.names)
1447 tab1 = pa.Table.from_arrays(arrays, schema=schema)
1449 self.butler.put(tab1, self.datasetType, dataId={})
1450 tab2 = self.butler.get(self.datasetType, dataId={})
1451 self.assertEqual(tab2, tab1)
1453 tab1_numpy = arrow_to_numpy(tab1)
1454 self.assertEqual(len(tab1_numpy), 0)
1455 tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
1456 self.assertEqual(tab1_numpy_arrow, tab1)
1458 tab1_astropy = arrow_to_astropy(tab1)
1459 self.assertEqual(len(tab1_astropy), 0)
1460 tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
1461 self.assertEqual(tab1_astropy_arrow, tab1)
1463 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
1464 def testWriteArrowTableReadAsSingleIndexDataFrame(self):
1465 df1, allColumns = _makeSingleIndexDataFrame()
1467 self.butler.put(df1, self.datasetType, dataId={})
1469 # Read back out as a dataframe.
1470 df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
1471 self.assertTrue(df1.equals(df2))
1473 # Read back out as an arrow table, convert to dataframe.
1474 tab3 = self.butler.get(self.datasetType, dataId={})
1475 df3 = arrow_to_pandas(tab3)
1476 self.assertTrue(df1.equals(df3))
1478 # Check reading the columns.
1479 columns = df2.reset_index().columns
1480 columns2 = self.butler.get(
1481 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
1482 )
1483 # We check the set because pandas reorders the columns.
1484 self.assertEqual(set(columns2.to_list()), set(columns.to_list()))
1486 # Check reading the schema.
1487 schema = DataFrameSchema(df1)
1488 schema2 = self.butler.get(
1489 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
1490 )
1491 self.assertEqual(schema2, schema)
1493 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
1494 def testWriteArrowTableReadAsMultiIndexDataFrame(self):
1495 df1 = _makeMultiIndexDataFrame()
1497 self.butler.put(df1, self.datasetType, dataId={})
1499 # Read back out as a dataframe.
1500 df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
1501 self.assertTrue(df1.equals(df2))
1503 # Read back out as an arrow table, convert to dataframe.
1504 atab3 = self.butler.get(self.datasetType, dataId={})
1505 df3 = arrow_to_pandas(atab3)
1506 self.assertTrue(df1.equals(df3))
1508 # Check reading the columns.
1509 columns = df2.columns
1510 columns2 = self.butler.get(
1511 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
1512 )
1513 self.assertTrue(columns2.equals(columns))
1515 # Check reading the schema.
1516 schema = DataFrameSchema(df1)
1517 schema2 = self.butler.get(
1518 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
1519 )
1520 self.assertEqual(schema2, schema)
1522 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
1523 def testWriteArrowTableReadAsAstropyTable(self):
1524 tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)
1526 self.butler.put(tab1, self.datasetType, dataId={})
1528 # Read back out as an astropy table.
1529 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
1530 self._checkAstropyTableEquality(tab1, tab2)
1532 # Read back out as an arrow table, convert to astropy table.
1533 atab3 = self.butler.get(self.datasetType, dataId={})
1534 tab3 = arrow_to_astropy(atab3)
1535 self._checkAstropyTableEquality(tab1, tab3)
1537 # Check reading the columns.
1538 columns = list(tab2.columns.keys())
1539 columns2 = self.butler.get(
1540 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
1541 )
1542 self.assertEqual(columns2, columns)
1544 # Check reading the schema.
1545 schema = ArrowAstropySchema(tab1)
1546 schema2 = self.butler.get(
1547 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
1548 )
1549 self.assertEqual(schema2, schema)
1551 # Check the schema conversions and units.
1552 arrow_schema = schema.to_arrow_schema()
1553 for name in arrow_schema.names:
1554 field_metadata = arrow_schema.field(name).metadata
1555 if (
1556 b"description" in field_metadata
1557 and (description := field_metadata[b"description"].decode("UTF-8")) != ""
1558 ):
1559 self.assertEqual(schema2.schema[name].description, description)
1560 else:
1561 self.assertIsNone(schema2.schema[name].description)
1562 if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "":
1563 self.assertEqual(schema2.schema[name].unit, units.Unit(unit))
1565 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
1566 def testWriteArrowTableReadAsNumpyTable(self):
1567 tab1 = _makeSimpleNumpyTable(include_multidim=True)
1569 self.butler.put(tab1, self.datasetType, dataId={})
1571 # Read back out as a numpy table.
1572 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
1573 self._checkNumpyTableEquality(tab1, tab2)
1575 # Read back out as an arrow table, convert to numpy table.
1576 atab3 = self.butler.get(self.datasetType, dataId={})
1577 tab3 = arrow_to_numpy(atab3)
1578 self._checkNumpyTableEquality(tab1, tab3)
1580 # Check reading the columns.
1581 columns = list(tab2.dtype.names)
1582 columns2 = self.butler.get(
1583 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
1584 )
1585 self.assertEqual(columns2, columns)
1587 # Check reading the schema.
1588 schema = ArrowNumpySchema(tab1.dtype)
1589 schema2 = self.butler.get(
1590 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
1591 )
1592 self.assertEqual(schema2, schema)
1594 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
1595 def testWriteArrowTableReadAsNumpyDict(self):
1596 tab1 = _makeSimpleNumpyTable(include_multidim=True)
1598 self.butler.put(tab1, self.datasetType, dataId={})
1600 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
1601 tab2_numpy = _numpy_dict_to_numpy(tab2)
1602 self._checkNumpyTableEquality(tab1, tab2_numpy)
1604 def _checkAstropyTableEquality(self, table1, table2):
1605 """Check if two astropy tables have the same columns/values.
1607 Parameters
1608 ----------
1609 table1 : `astropy.table.Table`
1610 table2 : `astropy.table.Table`
1611 """
1612 self.assertEqual(table1.dtype, table2.dtype)
1613 for name in table1.columns:
1614 self.assertEqual(table1[name].unit, table2[name].unit)
1615 self.assertEqual(table1[name].description, table2[name].description)
1616 self.assertEqual(table1[name].format, table2[name].format)
1617 # We need to check masked/regular columns after filling (see the sketch after this test case).
1618 has_masked = False
1619 if isinstance(table1[name], atable.column.MaskedColumn):
1620 c1 = table1[name].filled()
1621 has_masked = True
1622 else:
1623 c1 = np.array(table1[name])
1624 if isinstance(table2[name], atable.column.MaskedColumn):
1625 c2 = table2[name].filled()
1626 has_masked = True
1627 else:
1628 c2 = np.array(table2[name])
1629 np.testing.assert_array_equal(c1, c2)
1630 # If we have a masked column then we test the underlying data.
1631 if has_masked:
1632 np.testing.assert_array_equal(np.array(c1), np.array(c2))
1634 def _checkNumpyTableEquality(self, table1, table2):
1635 """Check if two numpy tables have the same columns/values.
1637 Parameters
1638 ----------
1639 table1 : `numpy.ndarray`
1640 table2 : `numpy.ndarray`
1641 """
1642 self.assertEqual(table1.dtype.names, table2.dtype.names)
1643 for name in table1.dtype.names:
1644 self.assertEqual(table1.dtype[name], table2.dtype[name])
1645 self.assertTrue(np.all(table1 == table2))
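
# A minimal standalone sketch of the masked-column comparison technique used
# by _checkAstropyTableEquality above. This helper and its name are
# illustrative additions, not part of the test suite, and assume astropy and
# numpy are importable.
def _demoMaskedColumnFill():
    masked = atable.column.MaskedColumn([1, 2, 3], mask=[False, True, False], fill_value=0)
    regular = atable.column.Column([1, 0, 3])
    # filled() replaces each masked entry with the column's fill_value,
    # yielding a plain column that can be compared element-wise.
    np.testing.assert_array_equal(masked.filled(), np.array(regular))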
1648@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
1649class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
1650 """Tests for InMemoryDatastore, using ArrowTableDelegate."""
1652 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1654 def testBadInput(self):
1655 tab1 = _makeSimpleArrowTable()
1656 delegate = ArrowTableDelegate("ArrowTable")
1658 with self.assertRaises(ValueError):
1659 delegate.handleParameters(inMemoryDataset="not_an_arrow_table")
1661 with self.assertRaises(NotImplementedError):
1662 delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})
1664 with self.assertRaises(AttributeError):
1665 delegate.getComponent(composite=tab1, componentName="nothing")
1667 def testStorageClass(self):
1668 tab1 = _makeSimpleArrowTable()
1670 factory = StorageClassFactory()
1671 factory.addFromConfig(StorageClassConfig())
1673 storageClass = factory.findStorageClass(type(tab1), compare_types=False)
1674 # Force the name lookup to do name matching.
1675 storageClass._pytype = None
1676 self.assertEqual(storageClass.name, "ArrowTable")
1678 storageClass = factory.findStorageClass(type(tab1), compare_types=True)
1679 # Force the name lookup to do name matching.
1680 storageClass._pytype = None
1681 self.assertEqual(storageClass.name, "ArrowTable")
1684@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.")
1685@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.")
1686class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase):
1687 """Tests for ParquetFormatter, ArrowNumpyDict, using local file store."""
1689 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1691 def setUp(self):
1692 """Create a new butler root for each test."""
1693 self.root = makeTestTempDir(TESTDIR)
1694 config = Config(self.configFile)
1695 self.butler = Butler.from_config(
1696 Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
1697 )
1698 # No dimensions in dataset type so we don't have to worry about
1699 # inserting dimension data or defining data IDs.
1700 self.datasetType = DatasetType(
1701 "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.dimensions
1702 )
1703 self.butler.registry.registerDatasetType(self.datasetType)
1705 def tearDown(self):
1706 removeTestTempDir(self.root)
1708 def testNumpyDict(self):
1709 tab1 = _makeSimpleNumpyTable(include_multidim=True)
1710 dict1 = _numpy_to_numpy_dict(tab1)
1712 self.butler.put(dict1, self.datasetType, dataId={})
1713 # Read the whole table.
1714 dict2 = self.butler.get(self.datasetType, dataId={})
1715 self._checkNumpyDictEquality(dict1, dict2)
1716 # Read the columns.
1717 columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
1718 self.assertEqual(len(columns2), len(dict1.keys()))
1719 for name in dict1:
1720 self.assertIn(name, columns2)
1721 # Read the rowcount.
1722 rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
1723 self.assertEqual(rowcount, len(dict1["a"]))
1724 # Read the schema.
1725 schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
1726 self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
1727 # Read just some columns a few different ways.
1728 tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
1729 subdict = {key: dict1[key] for key in ["a", "c"]}
1730 self._checkNumpyDictEquality(subdict, tab3)
1731 tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
1732 subdict = {key: dict1[key] for key in ["a"]}
1733 self._checkNumpyDictEquality(subdict, tab4)
1734 tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
1735 subdict = {key: dict1[key] for key in ["index", "a"]}
1736 self._checkNumpyDictEquality(subdict, tab5)
1737 tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
1738 subdict = {key: dict1[key] for key in ["ddd"]}
1739 self._checkNumpyDictEquality(subdict, tab6)
1740 tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
1741 subdict = {key: dict1[key] for key in ["a"]}
1742 self._checkNumpyDictEquality(subdict, tab7)
1743 # Passing an unrecognized column should be a ValueError.
1744 with self.assertRaises(ValueError):
1745 self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})
1747 @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
1748 def testWriteNumpyDictReadAsArrowTable(self):
1749 tab1 = _makeSimpleNumpyTable(include_multidim=True)
1750 dict1 = _numpy_to_numpy_dict(tab1)
1752 self.butler.put(dict1, self.datasetType, dataId={})
1754 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")
1756 tab2_dict = arrow_to_numpy_dict(tab2)
1758 self._checkNumpyDictEquality(dict1, tab2_dict)
1760 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
1761 def testWriteNumpyDictReadAsDataFrame(self):
1762 tab1 = _makeSimpleNumpyTable()
1763 dict1 = _numpy_to_numpy_dict(tab1)
1765 self.butler.put(dict1, self.datasetType, dataId={})
1767 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
1769 # The order of the dict keys may get mixed up, so we need to check column
1770 # by column. We also compare in dataframe form because pandas changes the
1771 # datatype of the string column (see the sketch after this test case).
1772 tab1_df = pd.DataFrame(tab1)
1774 self.assertEqual(set(tab1_df.columns), set(tab2.columns))
1775 for col in tab1_df.columns:
1776 self.assertTrue(np.all(tab1_df[col].values == tab2[col].values))
1778 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
1779 def testWriteNumpyDictReadAsAstropyTable(self):
1780 tab1 = _makeSimpleNumpyTable(include_multidim=True)
1781 dict1 = _numpy_to_numpy_dict(tab1)
1783 self.butler.put(dict1, self.datasetType, dataId={})
1785 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
1786 tab2_dict = _astropy_to_numpy_dict(tab2)
1788 self._checkNumpyDictEquality(dict1, tab2_dict)
1790 def testWriteNumpyDictReadAsNumpyTable(self):
1791 tab1 = _makeSimpleNumpyTable(include_multidim=True)
1792 dict1 = _numpy_to_numpy_dict(tab1)
1794 self.butler.put(dict1, self.datasetType, dataId={})
1796 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
1797 tab2_dict = _numpy_to_numpy_dict(tab2)
1799 self._checkNumpyDictEquality(dict1, tab2_dict)
1801 def testWriteNumpyDictBad(self):
1802 dict1 = {"a": 4, "b": np.zeros(1)}  # Value "a" is not a numpy array.
1803 with self.assertRaises(RuntimeError):
1804 self.butler.put(dict1, self.datasetType, dataId={})
1806 dict2 = {"a": np.zeros(4), "b": np.zeros(5)}  # Mismatched array lengths.
1807 with self.assertRaises(RuntimeError):
1808 self.butler.put(dict2, self.datasetType, dataId={})
1810 dict3 = {"a": [0] * 5, "b": np.zeros(5)}  # Value "a" is a list, not an array.
1811 with self.assertRaises(RuntimeError):
1812 self.butler.put(dict3, self.datasetType, dataId={})
1814 def _checkNumpyDictEquality(self, dict1, dict2):
1815 """Check if two numpy dicts have the same columns/values.
1817 Parameters
1818 ----------
1819 dict1 : `dict` [`str`, `np.ndarray`]
1820 dict2 : `dict` [`str`, `np.ndarray`]
1821 """
1822 self.assertEqual(set(dict1.keys()), set(dict2.keys()))
1823 for name in dict1:
1824 self.assertEqual(dict1[name].dtype, dict2[name].dtype)
1825 self.assertTrue(np.all(dict1[name] == dict2[name]))
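
# A minimal sketch of the pandas string-column behavior referenced in
# testWriteNumpyDictReadAsDataFrame above. This helper is an illustrative
# addition, not part of the test suite, and assumes pandas and numpy are
# importable.
def _demoPandasStringDtype():
    arr = np.zeros(3, dtype=[("strcol", "U10")])
    df = pd.DataFrame(arr)
    # numpy keeps the fixed-width unicode dtype ...
    assert arr.dtype["strcol"].kind == "U"
    # ... while pandas stores the strings as generic Python objects, which is
    # why the test compares values column by column instead of dtypes.
    assert df["strcol"].dtype == object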
1828@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.")
1829@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.")
1830class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase):
1831 """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate."""
1833 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1835 def testWriteNumpyDictBad(self):
1836 # The sub-type checking is not done by the in-memory datastore, so this check does not apply.
1837 pass
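
# A minimal sketch of the dict-of-arrays contract enforced on put (and
# exercised by testWriteNumpyDictBad in the parent class): every value must
# be a numpy array and all arrays must share the same length. This helper is
# an illustrative addition, not part of the test suite.
def _demoNumpyDictContract():
    good = {"a": np.zeros(5), "b": np.arange(5.0)}
    table = numpy_dict_to_arrow(good)
    # Two equal-length columns convert cleanly to a five-row arrow table.
    assert table.num_rows == 5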
1840@unittest.skipUnless(pa is not None, "Cannot test ArrowSchema without pyarrow.")
1841class ParquetFormatterArrowSchemaTestCase(unittest.TestCase):
1842 """Tests for ParquetFormatter, ArrowSchema, using local file datastore."""
1844 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1846 def setUp(self):
1847 """Create a new butler root for each test."""
1848 self.root = makeTestTempDir(TESTDIR)
1849 config = Config(self.configFile)
1850 self.butler = Butler.from_config(
1851 Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
1852 )
1853 # No dimensions in dataset type so we don't have to worry about
1854 # inserting dimension data or defining data IDs.
1855 self.datasetType = DatasetType(
1856 "data", dimensions=(), storageClass="ArrowSchema", universe=self.butler.dimensions
1857 )
1858 self.butler.registry.registerDatasetType(self.datasetType)
1860 def tearDown(self):
1861 removeTestTempDir(self.root)
1863 def _makeTestSchema(self):
1864 schema = pa.schema(
1865 [
1866 pa.field(
1867 "int32",
1868 pa.int32(),
1869 nullable=False,
1870 metadata={
1871 "description": "32-bit integer",
1872 "unit": "",
1873 },
1874 ),
1875 pa.field(
1876 "int64",
1877 pa.int64(),
1878 nullable=False,
1879 metadata={
1880 "description": "64-bit integer",
1881 "unit": "",
1882 },
1883 ),
1884 pa.field(
1885 "uint64",
1886 pa.uint64(),
1887 nullable=False,
1888 metadata={
1889 "description": "64-bit unsigned integer",
1890 "unit": "",
1891 },
1892 ),
1893 pa.field(
1894 "float32",
1895 pa.float32(),
1896 nullable=False,
1897 metadata={
1898 "description": "32-bit float",
1899 "unit": "count",
1900 },
1901 ),
1902 pa.field(
1903 "float64",
1904 pa.float64(),
1905 nullable=False,
1906 metadata={
1907 "description": "64-bit float",
1908 "unit": "nJy",
1909 },
1910 ),
1911 pa.field(
1912 "fixed_size_list",
1913 pa.list_(pa.float64(), list_size=10),
1914 nullable=False,
1915 metadata={
1916 "description": "Fixed size list of 64-bit floats.",
1917 "unit": "nJy",
1918 },
1919 ),
1920 pa.field(
1921 "variable_size_list",
1922 pa.list_(pa.float64()),
1923 nullable=False,
1924 metadata={
1925 "description": "Variable size list of 64-bit floats.",
1926 "unit": "nJy",
1927 },
1928 ),
1929 # This field deliberately has no description.
1930 pa.field(
1931 "string",
1932 pa.string(),
1933 nullable=False,
1934 metadata={
1935 "unit": "",
1936 },
1937 ),
1938 # This field deliberately has no metadata at all.
1939 pa.field(
1940 "binary",
1941 pa.binary(),
1942 nullable=False,
1943 ),
1944 ]
1945 )
1947 return schema
1949 def testArrowSchema(self):
1950 schema1 = self._makeTestSchema()
1951 self.butler.put(schema1, self.datasetType, dataId={})
1953 schema2 = self.butler.get(self.datasetType, dataId={})
1954 self.assertEqual(schema2, schema1)
1956 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe schema without pandas.")
1957 def testWriteArrowSchemaReadAsDataFrameSchema(self):
1958 schema1 = self._makeTestSchema()
1959 self.butler.put(schema1, self.datasetType, dataId={})
1961 df_schema1 = DataFrameSchema.from_arrow(schema1)
1963 df_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrameSchema")
1964 self.assertEqual(df_schema2, df_schema1)
1966 @unittest.skipUnless(atable is not None, "Cannot test reading as an astropy schema without astropy.")
1967 def testWriteArrowSchemaReadAsArrowAstropySchema(self):
1968 schema1 = self._makeTestSchema()
1969 self.butler.put(schema1, self.datasetType, dataId={})
1971 ap_schema1 = ArrowAstropySchema.from_arrow(schema1)
1973 ap_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropySchema")
1974 self.assertEqual(ap_schema2, ap_schema1)
1976 # Confirm that the ap_schema2 has the unit/description we expect.
1977 for name in schema1.names:
1978 field_metadata = schema1.field(name).metadata
1979 if field_metadata is None:
1980 continue
1981 if (
1982 b"description" in field_metadata
1983 and (description := field_metadata[b"description"].decode("UTF-8")) != ""
1984 ):
1985 self.assertEqual(ap_schema2.schema[name].description, description)
1986 else:
1987 self.assertIsNone(ap_schema2.schema[name].description)
1988 if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "":
1989 self.assertEqual(ap_schema2.schema[name].unit, units.Unit(unit))
1991 @unittest.skipUnless(np is not None, "Cannot test reading as a numpy schema without numpy.")
1992 def testWriteArrowSchemaReadAsArrowNumpySchema(self):
1993 schema1 = self._makeTestSchema()
1994 self.butler.put(schema1, self.datasetType, dataId={})
1996 np_schema1 = ArrowNumpySchema.from_arrow(schema1)
1998 np_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpySchema")
1999 self.assertEqual(np_schema2, np_schema1)
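
# A minimal sketch of the pyarrow field-metadata convention relied on by the
# unit/description checks above: pyarrow hands field metadata back with bytes
# keys and values, hence the explicit UTF-8 decodes. This helper is an
# illustrative addition, not part of the test suite.
def _demoFieldMetadataBytes():
    field = pa.field("flux", pa.float64(), metadata={"unit": "nJy"})
    metadata = field.metadata
    # Metadata written as str comes back keyed and valued as bytes.
    assert b"unit" in metadata
    assert metadata[b"unit"].decode("UTF-8") == "nJy"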
2002@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowSchemaDelegate without pyarrow.")
2003class InMemoryArrowSchemaDelegateTestCase(ParquetFormatterArrowSchemaTestCase):
2004 """Tests for InMemoryDatastore and ArrowSchema."""
2006 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
2009@unittest.skipUnless(np is not None, "Cannot test compute_row_group_size without numpy.")
2010@unittest.skipUnless(pa is not None, "Cannot test compute_row_group_size without pyarrow.")
2011class ComputeRowGroupSizeTestCase(unittest.TestCase):
2012 """Tests for compute_row_group_size."""
2014 def testRowGroupSizeNoMetadata(self):
2015 numpyTable = _makeSimpleNumpyTable(include_multidim=True)
2017 # We can't use the numpy_to_arrow convenience function because
2018 # that adds metadata.
2019 type_list = _numpy_dtype_to_arrow_types(numpyTable.dtype)
2020 schema = pa.schema(type_list)
2021 arrays = _numpy_style_arrays_to_arrow_arrays(
2022 numpyTable.dtype,
2023 len(numpyTable),
2024 numpyTable,
2025 schema,
2026 )
2027 arrowTable = pa.Table.from_arrays(arrays, schema=schema)
2029 row_group_size = compute_row_group_size(arrowTable.schema)
2031 self.assertGreater(row_group_size, 1_000_000)
2032 self.assertLess(row_group_size, 2_000_000)
2034 def testRowGroupSizeWithMetadata(self):
2035 numpyTable = _makeSimpleNumpyTable(include_multidim=True)
2037 arrowTable = numpy_to_arrow(numpyTable)
2039 row_group_size = compute_row_group_size(arrowTable.schema)
2041 self.assertGreater(row_group_size, 1_000_000)
2042 self.assertLess(row_group_size, 2_000_000)
2044 def testRowGroupSizeTinyTable(self):
2045 numpyTable = np.zeros(1, dtype=[("a", np.bool_)])
2047 arrowTable = numpy_to_arrow(numpyTable)
2049 row_group_size = compute_row_group_size(arrowTable.schema)
2051 self.assertGreater(row_group_size, 1_000_000)
2053 @unittest.skipUnless(pd is not None, "Cannot run testRowGroupSizeDataFrameWithLists without pandas.")
2054 def testRowGroupSizeDataFrameWithLists(self):
2055 df = pd.DataFrame({"a": np.zeros(10), "b": [[0, 0]] * 10, "c": [[0.0, 0.0]] * 10, "d": [[]] * 10})
2056 arrowTable = pandas_to_arrow(df)
2057 row_group_size = compute_row_group_size(arrowTable.schema)
2059 self.assertGreater(row_group_size, 1_000_000)
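
# A minimal usage sketch for compute_row_group_size. The helper name and the
# path argument are illustrative additions, not part of the test suite; the
# computed value is intended to be passed to a parquet writer's
# row_group_size parameter.
def _demoWriteWithRowGroupSize(path):
    import pyarrow.parquet as pq

    table = numpy_to_arrow(_makeSimpleNumpyTable())
    pq.write_table(table, path, row_group_size=compute_row_group_size(table.schema))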
2062if __name__ == "__main__":
2063 unittest.main()