# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest
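
# Each optional dependency is imported inside a try/except block; a failed
# import leaves the placeholder set to None so the corresponding tests are
# skipped by the skipUnless decorators below.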
try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)

try:
    from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
except ImportError:
    atable = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
except ImportError:
    np = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
except ImportError:
    pa = None
try:
    from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
except ImportError:
    pd = None
try:
    from lsst.daf.butler.formatters.parquet import (
        ArrowAstropySchema,
        ArrowNumpySchema,
        DataFrameSchema,
        ParquetFormatter,
        _append_numpy_multidim_metadata,
        _astropy_to_numpy_dict,
        _numpy_dict_to_numpy,
        _numpy_dtype_to_arrow_types,
        _numpy_style_arrays_to_arrow_arrays,
        _numpy_to_numpy_dict,
        arrow_to_astropy,
        arrow_to_numpy,
        arrow_to_numpy_dict,
        arrow_to_pandas,
        astropy_to_arrow,
        astropy_to_pandas,
        compute_row_group_size,
        numpy_dict_to_arrow,
        numpy_to_arrow,
        pandas_to_arrow,
        pandas_to_astropy,
    )
except ImportError:
    pa = None
    pd = None
    atable = None
    np = None
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
168 data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))
170 if include_bigendian:
171 data["a_bigendian"][:] = data["a"]
172 data["f_bigendian"][:] = data["f"]
174 return data


def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.
    include_lists : `bool`
        Include list columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None
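        # This int64 value is too large to be represented exactly as a
        # float64, so it guards against lossy float conversion of masked
        # integer columns.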
        df.loc[0, "m1"] = 1649900760361600113

    if include_lists:
        nrow = len(df)

        df["l1"] = [[0, 0]] * nrow
        df["l2"] = [[0.0, 0.0]] * nrow
212 df["l3"] = [[]] * nrow
214 allColumns = df.columns.append(pd.Index(df.index.names))
216 return df, allColumns
219def _makeMultiIndexDataFrame():
220 """Make a multi-index data frame for testing.
222 Returns
223 -------
224 dataFrame : `~pandas.DataFrame`
225 The test dataframe.
226 """
227 columns = pd.MultiIndex.from_tuples(
228 [
229 ("g", "a"),
230 ("g", "b"),
231 ("g", "c"),
232 ("r", "a"),
233 ("r", "b"),
234 ("r", "c"),
235 ],
236 names=["filter", "column"],
237 )
238 df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)
240 return df


def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["a"].description = "Description of column a"
    table["b"].unit = units.meter
    table["b"].description = "Description of column b"

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        # We set the masked columns with the underlying sentinel value
        # to be able to test after serialization.

        # Masked 64-bit integer.
        arr = np.arange(nrow, dtype="i8")
        arr[mask] = -1
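        # This int64 value cannot be represented exactly as a float64,
        # guarding against lossy float conversion during serialization.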
        arr[0] = 1649900760361600113
        table["m_i8"] = np.ma.masked_array(data=arr, mask=mask, fill_value=-1)
        # Masked 32-bit float.
        arr = np.arange(nrow, dtype="f4")
        arr[mask] = np.nan
        table["m_f4"] = np.ma.masked_array(data=arr, mask=mask, fill_value=np.nan)
        # Unmasked 32-bit float with NaNs.
        table["um_f4"] = arr
        # Masked 64-bit float.
        arr = np.arange(nrow, dtype="f8")
        arr[mask] = np.nan
        table["m_f8"] = np.ma.masked_array(data=arr, mask=mask, fill_value=np.nan)
        # Unmasked 64-bit float with NaNs.
        table["um_f8"] = arr
        # Masked boolean.
        arr = np.zeros(nrow, dtype=np.bool_)
        arr[mask] = True
        table["m_bool"] = np.ma.masked_array(data=arr, mask=mask, fill_value=True)
        # Masked 32-bit unsigned int.
        arr = np.arange(nrow, dtype="u4")
        arr[mask] = 0
        table["m_u4"] = np.ma.masked_array(data=arr, mask=mask, fill_value=0)
        # Masked string.
        table["m_str"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask, fill_value="")
        # Masked bytes.
        table["m_byte"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask, fill_value=b"")

    return table


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testSingleIndexDataFrameWithLists(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_lists=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})

        # We need to check the list columns specially because they go
        # from lists to arrays.
        for col in ["l1", "l2", "l3"]:
            for i in range(len(df1)):
                self.assertTrue(np.all(df2[col].values[i] == df1[col].values[i]))

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = astropy_to_pandas(tab2, index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(atable is not None, "Cannot test writing as astropy without astropy.")
    def testWriteAstropyTableWithMaskedColsReadAsSingleIndexDataFrame(self):
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={})

        tab1_df = astropy_to_pandas(tab1)
        self.assertTrue(tab1_df.equals(tab2))

        tab2_astropy = pandas_to_astropy(tab2)
        for col in tab1.dtype.names:
            np.testing.assert_array_equal(tab2_astropy[col], tab1[col])
            if isinstance(tab1[col], atable.column.MaskedColumn):
                np.testing.assert_array_equal(tab2_astropy[col].mask, tab1[col].mask)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow conversion works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = astropy_to_pandas(tab1)

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(pd is not None, "Cannot test writing as a dataframe without pandas.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={})

        df1_tab = pandas_to_astropy(df1)

        self._checkAstropyTableEquality(df1_tab, tab2)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip the per-column checks of units, descriptions, and formats.
        has_bigendian : `bool`
            The tables contain big-endian columns, so only compare types
            after normalizing the byte order.
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the types match, normalizing both to
                # big-endian byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
                # We need to check masked/regular columns after filling.
                has_masked = False
                if isinstance(table1[name], atable.column.MaskedColumn):
                    c1 = table1[name].filled()
                    has_masked = True
                else:
                    c1 = np.array(table1[name])
                if has_masked:
                    self.assertIsInstance(table2[name], atable.column.MaskedColumn)
                    c2 = table2[name].filled()
                else:
                    self.assertFalse(isinstance(table2[name], atable.column.MaskedColumn))
                    c2 = np.array(table2[name])
                np.testing.assert_array_equal(c1, c2)
                # If we have a masked column then we test the underlying data.
                if has_masked:
                    np.testing.assert_array_equal(np.array(c1), np.array(c2))
                    np.testing.assert_array_equal(table1[name].mask, table2[name].mask)


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            The tables contain big-endian columns, so only compare types
            after normalizing the byte order.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the types match, normalizing both to
                # big-endian byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # We convert to use the numpy testing framework to handle nan
        # comparisons.
        self.assertEqual(tab1.schema, tab2.schema)
        tab1_np = arrow_to_numpy(tab1)
        tab2_np = arrow_to_numpy(tab2)
        for col in tab1.column_names:
            np.testing.assert_array_equal(tab2_np[col], tab1_np[col])
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
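        # One empty array per column yields a zero-row table that still
        # carries the full schema.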
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    def testEmptyArrowTableMultidim(self):
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

        # Check the schema conversions and units.
        arrow_schema = schema.to_arrow_schema()
        for name in arrow_schema.names:
            field_metadata = arrow_schema.field(name).metadata
            if (
                b"description" in field_metadata
                and (description := field_metadata[b"description"].decode("UTF-8")) != ""
            ):
                self.assertEqual(schema2.schema[name].description, description)
            else:
                self.assertIsNone(schema2.schema[name].description)
            if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "":
                self.assertEqual(schema2.schema[name].unit, units.Unit(unit))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)
        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check that two astropy tables have the same columns and values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
            # We need to check masked/regular columns after filling.
            has_masked = False
            if isinstance(table1[name], atable.column.MaskedColumn):
                c1 = table1[name].filled()
                has_masked = True
            else:
                c1 = np.array(table1[name])
            if has_masked:
                self.assertIsInstance(table2[name], atable.column.MaskedColumn)
                c2 = table2[name].filled()
            else:
                self.assertNotIsInstance(table2[name], atable.column.MaskedColumn)
                c2 = np.array(table2[name])
            np.testing.assert_array_equal(c1, c2)
            # If we have a masked column then we also test the underlying
            # data and the mask itself.
            if has_masked:
                np.testing.assert_array_equal(np.array(c1), np.array(c2))
                np.testing.assert_array_equal(table1[name].mask, table2[name].mask)
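
    def _sketchMaskedColumnFilled(self):
        # A minimal sketch of the masked-column handling above: filled()
        # replaces masked entries with the column's fill_value, yielding
        # a plain array that is safe to compare element by element; the
        # mask is then compared separately. The values here are purely
        # illustrative.
        col = atable.column.MaskedColumn([1, 2, 3], mask=[False, True, False], fill_value=0)
        filled = col.filled()  # -> array([1, 0, 3])
        return filled, col.mask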

    def _checkNumpyTableEquality(self, table1, table2):
        """Check that two numpy tables have the same columns and values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First structured array to compare.
        table2 : `numpy.ndarray`
            Second structured array to compare.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.")
class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpyDict, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})
        # Read the whole table.
        dict2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyDictEquality(dict1, dict2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(dict1.keys()))
        for name in dict1:
            self.assertIn(name, columns2)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(dict1["a"]))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        subdict = {key: dict1[key] for key in ["a", "c"]}
        self._checkNumpyDictEquality(subdict, tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        subdict = {key: dict1[key] for key in ["index", "a"]}
        self._checkNumpyDictEquality(subdict, tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        subdict = {key: dict1[key] for key in ["ddd"]}
        self._checkNumpyDictEquality(subdict, tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyDictReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_dict = arrow_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyDictReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # The column order of the dict may not be preserved, so we need to
        # check column by column. We also have to compare in dataframe form
        # because pandas changes the datatype of the string column.
        tab1_df = pd.DataFrame(tab1)

        self.assertEqual(set(tab1_df.columns), set(tab2.columns))
        for col in tab1_df.columns:
            self.assertTrue(np.all(tab1_df[col].values == tab2[col].values))
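
    def _sketchPandasStringDtype(self):
        # A minimal sketch of the dtype change mentioned above: pandas
        # stores fixed-width numpy strings (e.g. "<U10") as generic
        # object columns, so dtypes differ even when the values survive
        # the round trip. The column name "strcol" is illustrative only.
        arr = np.zeros(3, dtype=[("strcol", "U10")])
        df = pd.DataFrame(arr)
        return arr.dtype["strcol"], df["strcol"].dtype  # <U10 vs. object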

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyDictReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_dict = _astropy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        tab2_dict = _numpy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictBad(self):
        dict1 = {"a": 4, "b": np.ndarray([1])}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict1, self.datasetType, dataId={})

        dict2 = {"a": np.zeros(4), "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict2, self.datasetType, dataId={})

        dict3 = {"a": [0] * 5, "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict3, self.datasetType, dataId={})
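
    def _sketchValidNumpyDict(self):
        # A minimal sketch of a dictionary that avoids all three failure
        # modes above: every value is a numpy array (not a scalar or a
        # plain list) and all arrays share the same length. This assumes
        # numpy_dict_to_arrow accepts any such mapping; the actual
        # validation lives in the parquet formatter module.
        good = {"a": np.zeros(5), "b": np.arange(5.0)}
        return numpy_dict_to_arrow(good)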

    def _checkNumpyDictEquality(self, dict1, dict2):
        """Check that two numpy dicts have the same columns and values.

        Parameters
        ----------
        dict1 : `dict` [`str`, `np.ndarray`]
            First dict of arrays to compare.
        dict2 : `dict` [`str`, `np.ndarray`]
            Second dict of arrays to compare.
        """
        self.assertEqual(set(dict1.keys()), set(dict2.keys()))
        for name in dict1:
            self.assertEqual(dict1[name].dtype, dict2[name].dtype)
            self.assertTrue(np.all(dict1[name] == dict2[name]))


@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.")
class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteNumpyDictBad(self):
        # The sub-type checking is not done by the in-memory datastore.
        pass


@unittest.skipUnless(pa is not None, "Cannot test ArrowSchema without pyarrow.")
class ParquetFormatterArrowSchemaTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowSchema, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowSchema", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def _makeTestSchema(self):
        schema = pa.schema(
            [
                pa.field(
                    "int32",
                    pa.int32(),
                    nullable=False,
                    metadata={
                        "description": "32-bit integer",
                        "unit": "",
                    },
                ),
                pa.field(
                    "int64",
                    pa.int64(),
                    nullable=False,
                    metadata={
                        "description": "64-bit integer",
                        "unit": "",
                    },
                ),
                pa.field(
                    "uint64",
                    pa.uint64(),
                    nullable=False,
                    metadata={
                        "description": "64-bit unsigned integer",
                        "unit": "",
                    },
                ),
                pa.field(
                    "float32",
                    pa.float32(),
                    nullable=False,
                    metadata={
                        "description": "32-bit float",
                        "unit": "count",
                    },
                ),
                pa.field(
                    "float64",
                    pa.float64(),
                    nullable=False,
                    metadata={
                        "description": "64-bit float",
                        "unit": "nJy",
                    },
                ),
                pa.field(
                    "fixed_size_list",
                    pa.list_(pa.float64(), list_size=10),
                    nullable=False,
                    metadata={
                        "description": "Fixed size list of 64-bit floats.",
                        "unit": "nJy",
                    },
                ),
                pa.field(
                    "variable_size_list",
                    pa.list_(pa.float64()),
                    nullable=False,
                    metadata={
                        "description": "Variable size list of 64-bit floats.",
                        "unit": "nJy",
                    },
                ),
                # This field has no description.
                pa.field(
                    "string",
                    pa.string(),
                    nullable=False,
                    metadata={
                        "unit": "",
                    },
                ),
                # This field has no metadata at all.
                pa.field(
                    "binary",
                    pa.binary(),
                    nullable=False,
                ),
            ]
        )

        return schema
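
    def _sketchFieldMetadataAccess(self):
        # A minimal sketch of how the metadata attached above is read
        # back: pyarrow stores field metadata with bytes keys and values,
        # which is why the tests below index with b"description"/b"unit"
        # and decode with UTF-8.
        schema = self._makeTestSchema()
        field_metadata = schema.field("float64").metadata
        return field_metadata[b"unit"].decode("UTF-8")  # "nJy"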

    def testArrowSchema(self):
        schema1 = self._makeTestSchema()
        self.butler.put(schema1, self.datasetType, dataId={})

        schema2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(schema2, schema1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe schema without pandas.")
    def testWriteArrowSchemaReadAsDataFrameSchema(self):
        schema1 = self._makeTestSchema()
        self.butler.put(schema1, self.datasetType, dataId={})

        df_schema1 = DataFrameSchema.from_arrow(schema1)

        df_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrameSchema")
        self.assertEqual(df_schema2, df_schema1)

    @unittest.skipUnless(atable is not None, "Cannot test reading as an astropy schema without astropy.")
    def testWriteArrowSchemaReadAsArrowAstropySchema(self):
        schema1 = self._makeTestSchema()
        self.butler.put(schema1, self.datasetType, dataId={})

        ap_schema1 = ArrowAstropySchema.from_arrow(schema1)

        ap_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropySchema")
        self.assertEqual(ap_schema2, ap_schema1)

        # Confirm that ap_schema2 has the unit/description we expect.
        for name in schema1.names:
            field_metadata = schema1.field(name).metadata
            if field_metadata is None:
                continue
            if (
                b"description" in field_metadata
                and (description := field_metadata[b"description"].decode("UTF-8")) != ""
            ):
                self.assertEqual(ap_schema2.schema[name].description, description)
            else:
                self.assertIsNone(ap_schema2.schema[name].description)
            if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "":
                self.assertEqual(ap_schema2.schema[name].unit, units.Unit(unit))

    @unittest.skipUnless(np is not None, "Cannot test reading as a numpy schema without numpy.")
    def testWriteArrowSchemaReadAsArrowNumpySchema(self):
        schema1 = self._makeTestSchema()
        self.butler.put(schema1, self.datasetType, dataId={})

        np_schema1 = ArrowNumpySchema.from_arrow(schema1)

        np_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpySchema")
        self.assertEqual(np_schema2, np_schema1)


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowSchemaDelegate without pyarrow.")
class InMemoryArrowSchemaDelegateTestCase(ParquetFormatterArrowSchemaTestCase):
    """Tests for InMemoryDatastore and ArrowSchema."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")


@unittest.skipUnless(np is not None, "Cannot test compute_row_group_size without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test compute_row_group_size without pyarrow.")
class ComputeRowGroupSizeTestCase(unittest.TestCase):
    """Tests for compute_row_group_size."""

    def testRowGroupSizeNoMetadata(self):
        numpyTable = _makeSimpleNumpyTable(include_multidim=True)

        # We can't use the numpy_to_arrow convenience function because
        # that adds metadata.
        type_list = _numpy_dtype_to_arrow_types(numpyTable.dtype)
        schema = pa.schema(type_list)
        arrays = _numpy_style_arrays_to_arrow_arrays(
            numpyTable.dtype,
            len(numpyTable),
            numpyTable,
            schema,
        )
        arrowTable = pa.Table.from_arrays(arrays, schema=schema)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)
        self.assertLess(row_group_size, 2_000_000)

    def testRowGroupSizeWithMetadata(self):
        numpyTable = _makeSimpleNumpyTable(include_multidim=True)

        arrowTable = numpy_to_arrow(numpyTable)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)
        self.assertLess(row_group_size, 2_000_000)

    def testRowGroupSizeTinyTable(self):
        numpyTable = np.zeros(1, dtype=[("a", np.bool_)])

        arrowTable = numpy_to_arrow(numpyTable)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)

    @unittest.skipUnless(pd is not None, "Cannot run testRowGroupSizeDataFrameWithLists without pandas.")
    def testRowGroupSizeDataFrameWithLists(self):
        df = pd.DataFrame({"a": np.zeros(10), "b": [[0, 0]] * 10, "c": [[0.0, 0.0]] * 10, "d": [[]] * 10})
        arrowTable = pandas_to_arrow(df)
        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)


if __name__ == "__main__":
    unittest.main()