# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Tests for ParquetFormatter.
30Tests in this module are disabled unless pandas and pyarrow are importable.
31"""
33import os
34import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)

try:
    from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
except ImportError:
    atable = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
except ImportError:
    np = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
except ImportError:
    pa = None
try:
    from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
except ImportError:
    pd = None
try:
    from lsst.daf.butler.formatters.parquet import (
        ArrowAstropySchema,
        ArrowNumpySchema,
        DataFrameSchema,
        ParquetFormatter,
        _append_numpy_multidim_metadata,
        _astropy_to_numpy_dict,
        _numpy_dict_to_numpy,
        _numpy_dtype_to_arrow_types,
        _numpy_style_arrays_to_arrow_arrays,
        _numpy_to_numpy_dict,
        arrow_to_astropy,
        arrow_to_numpy,
        arrow_to_numpy_dict,
        arrow_to_pandas,
        astropy_to_arrow,
        compute_row_group_size,
        numpy_dict_to_arrow,
        numpy_to_arrow,
        pandas_to_arrow,
    )
except ImportError:
    pa = None
    pd = None
    atable = None
    np = None
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
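
    Examples
    --------
    A minimal usage sketch (illustrative only)::

        data = _makeSimpleNumpyTable(include_multidim=True)
        assert data.dtype.names[:3] == ("index", "a", "b")
        assert data["d2"].shape == (5, 5, 10)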
127 """
128 nrow = 5
130 dtype = [
131 ("index", "i4"),
132 ("a", "f8"),
133 ("b", "f8"),
134 ("c", "f8"),
135 ("ddd", "f8"),
136 ("f", "i8"),
137 ("strcol", "U10"),
138 ("bytecol", "a10"),
139 ]
141 if include_multidim:
142 dtype.extend(
143 [
144 ("d1", "f4", (5,)),
145 ("d2", "i8", (5, 10)),
146 ("d3", "f8", (5, 10)),
147 ]
148 )
150 if include_bigendian:
151 dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])
153 data = np.zeros(nrow, dtype=dtype)
154 data["index"][:] = np.arange(nrow)
155 data["a"] = np.random.randn(nrow)
156 data["b"] = np.random.randn(nrow)
157 data["c"] = np.random.randn(nrow)
158 data["ddd"] = np.random.randn(nrow)
159 data["f"] = np.arange(nrow) * 10
160 data["strcol"][:] = "teststring"
161 data["bytecol"][:] = "teststring"
163 if include_multidim:
164 data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
165 data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
166 data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))
168 if include_bigendian:
169 data["a_bigendian"][:] = data["a"]
170 data["f_bigendian"][:] = data["f"]
172 return data


def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.
    include_lists : `bool`
        Include list columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
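
    Examples
    --------
    A minimal usage sketch (illustrative only)::

        df, allColumns = _makeSingleIndexDataFrame()
        assert df.index.name == "index"
        assert "a" in allColumns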
191 """
192 data = _makeSimpleNumpyTable()
193 df = pd.DataFrame(data)
194 df = df.set_index("index")
196 if include_masked:
197 nrow = len(df)
199 df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
200 df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
201 df["mstrcol"] = pd.array(np.array(["text"] * nrow))
202 df.loc[1, ["m1", "m2", "mstrcol"]] = None
204 if include_lists:
205 nrow = len(df)
207 df["l1"] = [[0, 0]] * nrow
208 df["l2"] = [[0.0, 0.0]] * nrow
209 df["l3"] = [[]] * nrow
211 allColumns = df.columns.append(pd.Index(df.index.names))
213 return df, allColumns


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
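
    Examples
    --------
    A minimal usage sketch (illustrative only)::

        df = _makeMultiIndexDataFrame()
        assert df.columns.nlevels == 2
        assert df.columns.names == ["filter", "column"]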
223 """
224 columns = pd.MultiIndex.from_tuples(
225 [
226 ("g", "a"),
227 ("g", "b"),
228 ("g", "c"),
229 ("r", "a"),
230 ("r", "b"),
231 ("r", "c"),
232 ],
233 names=["filter", "column"],
234 )
235 df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)
237 return df


def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
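
    Examples
    --------
    A minimal usage sketch (illustrative only)::

        table = _makeSimpleAstropyTable()
        assert table["a"].unit == units.degree
        assert table["a"].description == "Description of column a"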
256 """
257 data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
258 # Add a couple of units.
259 table = atable.Table(data)
260 table["a"].unit = units.degree
261 table["a"].description = "Description of column a"
262 table["b"].unit = units.meter
263 table["b"].description = "Description of column b"
265 # Add some masked columns.
266 if include_masked:
267 nrow = len(table)
268 mask = np.zeros(nrow, dtype=bool)
269 mask[1] = True
        # We set the masked columns with the underlying sentinel value
        # to be able to test after serialization.

        # Masked 64-bit integer.
        arr = np.arange(nrow, dtype="i8")
        arr[mask] = -1
        table["m_i8"] = np.ma.masked_array(data=arr, mask=mask, fill_value=-1)
        # Masked 32-bit float.
        arr = np.arange(nrow, dtype="f4")
        arr[mask] = np.nan
        table["m_f4"] = np.ma.masked_array(data=arr, mask=mask, fill_value=np.nan)
        # Unmasked 32-bit float with NaNs.
        table["um_f4"] = arr
        # Masked 64-bit float.
        arr = np.arange(nrow, dtype="f8")
        arr[mask] = np.nan
        table["m_f8"] = np.ma.masked_array(data=arr, mask=mask, fill_value=np.nan)
        # Unmasked 64-bit float with NaNs.
        table["um_f8"] = arr
        # Masked boolean.
        arr = np.zeros(nrow, dtype=np.bool_)
        arr[mask] = True
        table["m_bool"] = np.ma.masked_array(data=arr, mask=mask, fill_value=True)
        # Masked 32-bit unsigned int.
        arr = np.arange(nrow, dtype="u4")
        arr[mask] = 0
        table["m_u4"] = np.ma.masked_array(data=arr, mask=mask, fill_value=0)
        # Masked string.
        table["m_str"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask, fill_value="")
        # Masked bytes.
        table["m_byte"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask, fill_value=b"")

    return table


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
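
    Examples
    --------
    A minimal usage sketch (illustrative only)::

        tab = _makeSimpleArrowTable()
        assert isinstance(tab, pa.Table)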
319 """
320 data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
321 return astropy_to_arrow(data)


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testSingleIndexDataFrameWithLists(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_lists=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})

        # We need to check the list columns specially because they go
        # from lists to arrays.
        for col in ["l1", "l2", "l3"]:
            for i in range(len(df1)):
                self.assertTrue(np.all(df2[col].values[i] == df1[col].values[i]))

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None.
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None.
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but it is definitely
        # not recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but it is definitely
        # not recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but it is definitely
        # not recommended.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities.
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow conversion works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip checking column units, descriptions, and formats.
        has_bigendian : `bool`
            One of the tables contains big-endian columns.
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the types match, normalizing both to
                # big-endian so that byte order is ignored.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
                # We need to check masked/regular columns after filling.
                has_masked = False
                if isinstance(table1[name], atable.column.MaskedColumn):
                    c1 = table1[name].filled()
                    has_masked = True
                else:
                    c1 = np.array(table1[name])
                if has_masked:
                    self.assertIsInstance(table2[name], atable.column.MaskedColumn)
                    c2 = table2[name].filled()
                else:
                    self.assertFalse(isinstance(table2[name], atable.column.MaskedColumn))
                    c2 = np.array(table2[name])
                np.testing.assert_array_equal(c1, c2)
                # If we have a masked column then we test the underlying data.
                if has_masked:
                    np.testing.assert_array_equal(np.array(c1), np.array(c2))
                    np.testing.assert_array_equal(table1[name].mask, table2[name].mask)


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality.
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            One of the tables contains big-endian columns.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the types match, normalizing both to
                # big-endian so that byte order is ignored.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # We convert to numpy so that its testing framework can handle
        # NaN comparisons.
        self.assertEqual(tab1.schema, tab2.schema)
        tab1_np = arrow_to_numpy(tab1)
        tab2_np = arrow_to_numpy(tab2)
        for col in tab1.column_names:
            np.testing.assert_array_equal(tab2_np[col], tab1_np[col])
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    def testEmptyArrowTableMultidim(self):
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)
1537 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
1538 def testWriteArrowTableReadAsAstropyTable(self):
1539 tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)
1541 self.butler.put(tab1, self.datasetType, dataId={})
1543 # Read back out as an astropy table.
1544 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
1545 self._checkAstropyTableEquality(tab1, tab2)
1547 # Read back out as an arrow table, convert to astropy table.
1548 atab3 = self.butler.get(self.datasetType, dataId={})
1549 tab3 = arrow_to_astropy(atab3)
1550 self._checkAstropyTableEquality(tab1, tab3)
1552 # Check reading the columns.
1553 columns = list(tab2.columns.keys())
1554 columns2 = self.butler.get(
1555 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
1556 )
1557 self.assertEqual(columns2, columns)
1559 # Check reading the schema.
1560 schema = ArrowAstropySchema(tab1)
1561 schema2 = self.butler.get(
1562 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
1563 )
1564 self.assertEqual(schema2, schema)
1566 # Check the schema conversions and units.
1567 arrow_schema = schema.to_arrow_schema()
1568 for name in arrow_schema.names:
1569 field_metadata = arrow_schema.field(name).metadata
1570 if (
1571 b"description" in field_metadata
1572 and (description := field_metadata[b"description"].decode("UTF-8")) != ""
1573 ):
1574 self.assertEqual(schema2.schema[name].description, description)
1575 else:
1576 self.assertIsNone(schema2.schema[name].description)
1577 if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "":
1578 self.assertEqual(schema2.schema[name].unit, units.Unit(unit))
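# Illustrative sketch (standalone, not part of the test class): pyarrow stores
# field metadata as a bytes -> bytes mapping, which is why the loop above
# looks up b"description" and b"unit" and decodes the values before comparing.
def _example_field_metadata():
    field = pa.field("flux", pa.float64(), metadata={"unit": "nJy"})
    assert field.metadata == {b"unit": b"nJy"}  # strings come back as bytes
    assert field.metadata[b"unit"].decode("UTF-8") == "nJy"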
1580 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
1581 def testWriteArrowTableReadAsNumpyTable(self):
1582 tab1 = _makeSimpleNumpyTable(include_multidim=True)
1584 self.butler.put(tab1, self.datasetType, dataId={})
1586 # Read back out as a numpy table.
1587 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
1588 self._checkNumpyTableEquality(tab1, tab2)
1590 # Read back out as an arrow table, convert to numpy table.
1591 atab3 = self.butler.get(self.datasetType, dataId={})
1592 tab3 = arrow_to_numpy(atab3)
1593 self._checkNumpyTableEquality(tab1, tab3)
1595 # Check reading the columns.
1596 columns = list(tab2.dtype.names)
1597 columns2 = self.butler.get(
1598 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
1599 )
1600 self.assertEqual(columns2, columns)
1602 # Check reading the schema.
1603 schema = ArrowNumpySchema(tab1.dtype)
1604 schema2 = self.butler.get(
1605 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
1606 )
1607 self.assertEqual(schema2, schema)
1609 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
1610 def testWriteArrowTableReadAsNumpyDict(self):
1611 tab1 = _makeSimpleNumpyTable(include_multidim=True)
1613 self.butler.put(tab1, self.datasetType, dataId={})
1615 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
1616 tab2_numpy = _numpy_dict_to_numpy(tab2)
1617 self._checkNumpyTableEquality(tab1, tab2_numpy)
1619 def _checkAstropyTableEquality(self, table1, table2):
1620 """Check if two astropy tables have the same columns/values
1622 Parameters
1623 ----------
1624 table1 : `astropy.table.Table`
1625 table2 : `astropy.table.Table`
1626 """
1627 self.assertEqual(table1.dtype, table2.dtype)
1628 for name in table1.columns:
1629 self.assertEqual(table1[name].unit, table2[name].unit)
1630 self.assertEqual(table1[name].description, table2[name].description)
1631 self.assertEqual(table1[name].format, table2[name].format)
1632 # We need to check masked/regular columns after filling.
1633 has_masked = False
1634 if isinstance(table1[name], atable.column.MaskedColumn):
1635 c1 = table1[name].filled()
1636 has_masked = True
1637 else:
1638 c1 = np.array(table1[name])
1639 if has_masked:
1640 self.assertIsInstance(table2[name], atable.column.MaskedColumn)
1641 c2 = table2[name].filled()
1642 else:
1643 self.assertNotIsInstance(table2[name], atable.column.MaskedColumn)
1644 c2 = np.array(table2[name])
1645 np.testing.assert_array_equal(c1, c2)
1646 # If the column is masked, also check the raw data arrays and the masks.
1647 if has_masked:
1648 np.testing.assert_array_equal(np.array(c1), np.array(c2))
1649 np.testing.assert_array_equal(table1[name].mask, table2[name].mask)
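# Illustrative sketch (standalone, not part of the test class): filled()
# replaces masked entries with the column's fill value and returns a plain
# Column, so two masked columns are only equal when both the filled data and
# the masks themselves agree.
def _example_masked_column():
    col = atable.MaskedColumn([1, 2, 3], mask=[False, True, False])
    filled = col.filled()  # plain Column with masked entries filled in
    assert not isinstance(filled, atable.column.MaskedColumn)
    np.testing.assert_array_equal(col.mask, [False, True, False])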
1651 def _checkNumpyTableEquality(self, table1, table2):
1652 """Check if two numpy tables have the same columns/values
1654 Parameters
1655 ----------
1656 table1 : `numpy.ndarray`
1657 table2 : `numpy.ndarray`
1658 """
1659 self.assertEqual(table1.dtype.names, table2.dtype.names)
1660 for name in table1.dtype.names:
1661 self.assertEqual(table1.dtype[name], table2.dtype[name])
1662 self.assertTrue(np.all(table1 == table2))
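# Illustrative sketch (standalone, not part of the test class): comparing
# structured arrays with == yields one boolean per row covering all fields,
# so np.all(table1 == table2) above checks every row and column at once.
def _example_structured_equality():
    a = np.array([(1, 2.0), (3, 4.0)], dtype=[("x", "i4"), ("y", "f8")])
    b = a.copy()
    assert (a == b).shape == (2,)  # one boolean per row
    assert np.all(a == b)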
1665@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
1666class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
1667 """Tests for InMemoryDatastore, using ArrowTableDelegate."""
1669 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1671 def testBadInput(self):
1672 tab1 = _makeSimpleArrowTable()
1673 delegate = ArrowTableDelegate("ArrowTable")
1675 with self.assertRaises(ValueError):
1676 delegate.handleParameters(inMemoryDataset="not_an_arrow_table")
1678 with self.assertRaises(NotImplementedError):
1679 delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})
1681 with self.assertRaises(AttributeError):
1682 delegate.getComponent(composite=tab1, componentName="nothing")
1684 def testStorageClass(self):
1685 tab1 = _makeSimpleArrowTable()
1687 factory = StorageClassFactory()
1688 factory.addFromConfig(StorageClassConfig())
1690 storageClass = factory.findStorageClass(type(tab1), compare_types=False)
1692 # Clear the cached pytype to force the lookup to match by name.
1692 storageClass._pytype = None
1693 self.assertEqual(storageClass.name, "ArrowTable")
1695 storageClass = factory.findStorageClass(type(tab1), compare_types=True)
1697 # Clear the cached pytype to force the lookup to match by name.
1697 storageClass._pytype = None
1698 self.assertEqual(storageClass.name, "ArrowTable")
1701@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.")
1702@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.")
1703class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase):
1704 """Tests for ParquetFormatter, ArrowNumpyDict, using local file store."""
1706 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1708 def setUp(self):
1709 """Create a new butler root for each test."""
1710 self.root = makeTestTempDir(TESTDIR)
1711 config = Config(self.configFile)
1712 self.butler = Butler.from_config(
1713 Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
1714 )
1715 # No dimensions in dataset type so we don't have to worry about
1716 # inserting dimension data or defining data IDs.
1717 self.datasetType = DatasetType(
1718 "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.dimensions
1719 )
1720 self.butler.registry.registerDatasetType(self.datasetType)
1722 def tearDown(self):
1723 removeTestTempDir(self.root)
1725 def testNumpyDict(self):
1726 tab1 = _makeSimpleNumpyTable(include_multidim=True)
1727 dict1 = _numpy_to_numpy_dict(tab1)
1729 self.butler.put(dict1, self.datasetType, dataId={})
1730 # Read the whole table.
1731 dict2 = self.butler.get(self.datasetType, dataId={})
1732 self._checkNumpyDictEquality(dict1, dict2)
1733 # Read the columns.
1734 columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
1735 self.assertEqual(len(columns2), len(dict1.keys()))
1736 for name in dict1:
1737 self.assertIn(name, columns2)
1738 # Read the rowcount.
1739 rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
1740 self.assertEqual(rowcount, len(dict1["a"]))
1741 # Read the schema.
1742 schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
1743 self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
1744 # Read just some columns a few different ways.
1745 tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
1746 subdict = {key: dict1[key] for key in ["a", "c"]}
1747 self._checkNumpyDictEquality(subdict, tab3)
1748 tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
1749 subdict = {key: dict1[key] for key in ["a"]}
1750 self._checkNumpyDictEquality(subdict, tab4)
1751 tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
1752 subdict = {key: dict1[key] for key in ["index", "a"]}
1753 self._checkNumpyDictEquality(subdict, tab5)
1754 tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
1755 subdict = {key: dict1[key] for key in ["ddd"]}
1756 self._checkNumpyDictEquality(subdict, tab6)
1757 tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
1758 subdict = {key: dict1[key] for key in ["a"]}
1759 self._checkNumpyDictEquality(subdict, tab7)
1760 # Requesting an unrecognized column should raise a ValueError.
1761 with self.assertRaises(ValueError):
1762 self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})
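# Illustrative sketch (standalone, not part of the test class): the reads
# above exercise a "columns" parameter that accepts a single name or a
# sequence and collapses duplicates. A hypothetical normalization helper
# capturing that behavior, not the formatter's actual implementation:
def _example_normalize_columns(columns, available):
    if isinstance(columns, str):
        columns = [columns]
    seen = []
    for name in columns:
        if name not in available:
            raise ValueError(f"Unrecognized column name {name!r}.")
        if name not in seen:
            seen.append(name)
    return seen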
1764 @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
1765 def testWriteNumpyDictReadAsArrowTable(self):
1766 tab1 = _makeSimpleNumpyTable(include_multidim=True)
1767 dict1 = _numpy_to_numpy_dict(tab1)
1769 self.butler.put(dict1, self.datasetType, dataId={})
1771 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")
1773 tab2_dict = arrow_to_numpy_dict(tab2)
1775 self._checkNumpyDictEquality(dict1, tab2_dict)
1777 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
1778 def testWriteNumpyDictReadAsDataFrame(self):
1779 tab1 = _makeSimpleNumpyTable()
1780 dict1 = _numpy_to_numpy_dict(tab1)
1782 self.butler.put(dict1, self.datasetType, dataId={})
1784 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
1786 # The column order of the dict may not be preserved, so we check
1787 # column by column. We also compare in dataframe form because pandas
1788 # converts the fixed-width string column to object dtype.
1789 tab1_df = pd.DataFrame(tab1)
1791 self.assertEqual(set(tab1_df.columns), set(tab2.columns))
1792 for col in tab1_df.columns:
1793 self.assertTrue(np.all(tab1_df[col].values == tab2[col].values))
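# Illustrative sketch (standalone, not part of the test class): pandas has no
# fixed-width string dtype, so a numpy "U" column does not keep its dtype on
# conversion, which is why the test above compares values in dataframe form
# rather than comparing dtypes directly.
def _example_string_dtype_change():
    arr = np.array([("abc",), ("de",)], dtype=[("s", "U3")])
    df = pd.DataFrame(arr)
    assert arr.dtype["s"].kind == "U"  # fixed-width unicode in numpy
    assert df["s"].dtype != arr.dtype["s"]  # pandas swaps in its own dtype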
1795 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
1796 def testWriteNumpyDictReadAsAstropyTable(self):
1797 tab1 = _makeSimpleNumpyTable(include_multidim=True)
1798 dict1 = _numpy_to_numpy_dict(tab1)
1800 self.butler.put(dict1, self.datasetType, dataId={})
1802 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
1803 tab2_dict = _astropy_to_numpy_dict(tab2)
1805 self._checkNumpyDictEquality(dict1, tab2_dict)
1807 def testWriteNumpyDictReadAsNumpyTable(self):
1808 tab1 = _makeSimpleNumpyTable(include_multidim=True)
1809 dict1 = _numpy_to_numpy_dict(tab1)
1811 self.butler.put(dict1, self.datasetType, dataId={})
1813 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
1814 tab2_dict = _numpy_to_numpy_dict(tab2)
1816 self._checkNumpyDictEquality(dict1, tab2_dict)
1818 def testWriteNumpyDictBad(self):
1819 dict1 = {"a": 4, "b": np.ndarray([1])}
1820 with self.assertRaises(RuntimeError):
1821 self.butler.put(dict1, self.datasetType, dataId={})
1823 dict2 = {"a": np.zeros(4), "b": np.zeros(5)}
1824 with self.assertRaises(RuntimeError):
1825 self.butler.put(dict2, self.datasetType, dataId={})
1827 dict3 = {"a": [0] * 5, "b": np.zeros(5)}
1828 with self.assertRaises(RuntimeError):
1829 self.butler.put(dict3, self.datasetType, dataId={})
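# Illustrative sketch (standalone, not part of the test class): the three
# failures above suggest that the write path requires every value to be a
# numpy array and all arrays to share one length. A hypothetical validator
# under those assumptions, not the actual delegate code:
def _example_validate_numpy_dict(d):
    lengths = set()
    for name, arr in d.items():
        if not isinstance(arr, np.ndarray):
            raise RuntimeError(f"Column {name!r} is not a numpy array.")
        lengths.add(len(arr))
    if len(lengths) > 1:
        raise RuntimeError("All columns must have the same length.")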
1831 def _checkNumpyDictEquality(self, dict1, dict2):
1832 """Check if two numpy dicts have the same columns/values.
1834 Parameters
1835 ----------
1836 dict1 : `dict` [`str`, `np.ndarray`]
1837 dict2 : `dict` [`str`, `np.ndarray`]
1838 """
1839 self.assertEqual(set(dict1.keys()), set(dict2.keys()))
1840 for name in dict1:
1841 self.assertEqual(dict1[name].dtype, dict2[name].dtype)
1842 self.assertTrue(np.all(dict1[name] == dict2[name]))
1845@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.")
1846@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.")
1847class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase):
1848 """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate."""
1850 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1852 def testWriteNumpyDictBad(self):
1853 # The sub-type checking is not done by the in-memory datastore.
1854 pass
1857@unittest.skipUnless(pa is not None, "Cannot test ArrowSchema without pyarrow.")
1858class ParquetFormatterArrowSchemaTestCase(unittest.TestCase):
1859 """Tests for ParquetFormatter, ArrowSchema, using local file datastore."""
1861 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1863 def setUp(self):
1864 """Create a new butler root for each test."""
1865 self.root = makeTestTempDir(TESTDIR)
1866 config = Config(self.configFile)
1867 self.butler = Butler.from_config(
1868 Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
1869 )
1870 # No dimensions in dataset type so we don't have to worry about
1871 # inserting dimension data or defining data IDs.
1872 self.datasetType = DatasetType(
1873 "data", dimensions=(), storageClass="ArrowSchema", universe=self.butler.dimensions
1874 )
1875 self.butler.registry.registerDatasetType(self.datasetType)
1877 def tearDown(self):
1878 removeTestTempDir(self.root)
1880 def _makeTestSchema(self):
1881 schema = pa.schema(
1882 [
1883 pa.field(
1884 "int32",
1885 pa.int32(),
1886 nullable=False,
1887 metadata={
1888 "description": "32-bit integer",
1889 "unit": "",
1890 },
1891 ),
1892 pa.field(
1893 "int64",
1894 pa.int64(),
1895 nullable=False,
1896 metadata={
1897 "description": "64-bit integer",
1898 "unit": "",
1899 },
1900 ),
1901 pa.field(
1902 "uint64",
1903 pa.uint64(),
1904 nullable=False,
1905 metadata={
1906 "description": "64-bit unsigned integer",
1907 "unit": "",
1908 },
1909 ),
1910 pa.field(
1911 "float32",
1912 pa.float32(),
1913 nullable=False,
1914 metadata={
1915 "description": "32-bit float",
1916 "unit": "count",
1917 },
1918 ),
1919 pa.field(
1920 "float64",
1921 pa.float64(),
1922 nullable=False,
1923 metadata={
1924 "description": "64-bit float",
1925 "unit": "nJy",
1926 },
1927 ),
1928 pa.field(
1929 "fixed_size_list",
1930 pa.list_(pa.float64(), list_size=10),
1931 nullable=False,
1932 metadata={
1933 "description": "Fixed size list of 64-bit floats.",
1934 "unit": "nJy",
1935 },
1936 ),
1937 pa.field(
1938 "variable_size_list",
1939 pa.list_(pa.float64()),
1940 nullable=False,
1941 metadata={
1942 "description": "Variable size list of 64-bit floats.",
1943 "unit": "nJy",
1944 },
1945 ),
1946 # The next field deliberately has no description.
1947 pa.field(
1948 "string",
1949 pa.string(),
1950 nullable=False,
1951 metadata={
1952 "unit": "",
1953 },
1954 ),
1955 # The next field deliberately has no metadata at all.
1956 pa.field(
1957 "binary",
1958 pa.binary(),
1959 nullable=False,
1960 ),
1961 ]
1962 )
1964 return schema
1966 def testArrowSchema(self):
1967 schema1 = self._makeTestSchema()
1968 self.butler.put(schema1, self.datasetType, dataId={})
1970 schema2 = self.butler.get(self.datasetType, dataId={})
1971 self.assertEqual(schema2, schema1)
1973 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe schema without pandas.")
1974 def testWriteArrowSchemaReadAsDataFrameSchema(self):
1975 schema1 = self._makeTestSchema()
1976 self.butler.put(schema1, self.datasetType, dataId={})
1978 df_schema1 = DataFrameSchema.from_arrow(schema1)
1980 df_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrameSchema")
1981 self.assertEqual(df_schema2, df_schema1)
1983 @unittest.skipUnless(atable is not None, "Cannot test reading as an astropy schema without astropy.")
1984 def testWriteArrowSchemaReadAsArrowAstropySchema(self):
1985 schema1 = self._makeTestSchema()
1986 self.butler.put(schema1, self.datasetType, dataId={})
1988 ap_schema1 = ArrowAstropySchema.from_arrow(schema1)
1990 ap_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropySchema")
1991 self.assertEqual(ap_schema2, ap_schema1)
1993 # Confirm that ap_schema2 has the expected units and descriptions.
1994 for name in schema1.names:
1995 field_metadata = schema1.field(name).metadata
1996 if field_metadata is None:
1997 continue
1998 if (
1999 b"description" in field_metadata
2000 and (description := field_metadata[b"description"].decode("UTF-8")) != ""
2001 ):
2002 self.assertEqual(ap_schema2.schema[name].description, description)
2003 else:
2004 self.assertIsNone(ap_schema2.schema[name].description)
2005 if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "":
2006 self.assertEqual(ap_schema2.schema[name].unit, units.Unit(unit))
2008 @unittest.skipUnless(np is not None, "Cannot test reading as a numpy schema without numpy.")
2009 def testWriteArrowSchemaReadAsArrowNumpySchema(self):
2010 schema1 = self._makeTestSchema()
2011 self.butler.put(schema1, self.datasetType, dataId={})
2013 np_schema1 = ArrowNumpySchema.from_arrow(schema1)
2015 np_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpySchema")
2016 self.assertEqual(np_schema2, np_schema1)
2019@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowSchemaDelegate without pyarrow.")
2020class InMemoryArrowSchemaDelegateTestCase(ParquetFormatterArrowSchemaTestCase):
2021 """Tests for InMemoryDatastore and ArrowSchema."""
2023 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
2026@unittest.skipUnless(np is not None, "Cannot test compute_row_group_size without numpy.")
2027@unittest.skipUnless(pa is not None, "Cannot test compute_row_group_size without pyarrow.")
2028class ComputeRowGroupSizeTestCase(unittest.TestCase):
2029 """Tests for compute_row_group_size."""
2031 def testRowGroupSizeNoMetadata(self):
2032 numpyTable = _makeSimpleNumpyTable(include_multidim=True)
2034 # We can't use the numpy_to_arrow convenience function because
2035 # that adds metadata.
2036 type_list = _numpy_dtype_to_arrow_types(numpyTable.dtype)
2037 schema = pa.schema(type_list)
2038 arrays = _numpy_style_arrays_to_arrow_arrays(
2039 numpyTable.dtype,
2040 len(numpyTable),
2041 numpyTable,
2042 schema,
2043 )
2044 arrowTable = pa.Table.from_arrays(arrays, schema=schema)
2046 row_group_size = compute_row_group_size(arrowTable.schema)
2048 self.assertGreater(row_group_size, 1_000_000)
2049 self.assertLess(row_group_size, 2_000_000)
2051 def testRowGroupSizeWithMetadata(self):
2052 numpyTable = _makeSimpleNumpyTable(include_multidim=True)
2054 arrowTable = numpy_to_arrow(numpyTable)
2056 row_group_size = compute_row_group_size(arrowTable.schema)
2058 self.assertGreater(row_group_size, 1_000_000)
2059 self.assertLess(row_group_size, 2_000_000)
2061 def testRowGroupSizeTinyTable(self):
2062 numpyTable = np.zeros(1, dtype=[("a", np.bool_)])
2064 arrowTable = numpy_to_arrow(numpyTable)
2066 row_group_size = compute_row_group_size(arrowTable.schema)
2068 self.assertGreater(row_group_size, 1_000_000)
2070 @unittest.skipUnless(pd is not None, "Cannot run testRowGroupSizeDataFrameWithLists without pandas.")
2071 def testRowGroupSizeDataFrameWithLists(self):
2072 df = pd.DataFrame({"a": np.zeros(10), "b": [[0, 0]] * 10, "c": [[0.0, 0.0]] * 10, "d": [[]] * 10})
2073 arrowTable = pandas_to_arrow(df)
2074 row_group_size = compute_row_group_size(arrowTable.schema)
2076 self.assertGreater(row_group_size, 1_000_000)
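# Illustrative sketch (standalone, not part of the test class):
# compute_row_group_size works from the schema alone, so the idea is to
# estimate a per-row byte width and divide it into a target row-group byte
# size. The constants and the fallback for variable-size types here are
# assumptions, not the formatter's actual implementation.
def _example_rows_per_group(schema, target_bytes=1_000_000_000):
    row_bytes = 0
    for field in schema:
        try:
            width = field.type.bit_width // 8  # fixed-width types only
        except ValueError:  # variable-size types have no fixed bit width
            width = 8  # assumed average payload per row
        row_bytes += max(width, 1)
    return max(target_bytes // row_bytes, 1)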
2079if __name__ == "__main__":
2080 unittest.main()