Coverage for tests/test_parquet.py: 22% (977 statements)
coverage.py v7.3.2, created at 2023-10-25 15:14 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are skipped unless the relevant optional dependencies
(pyarrow, pandas, astropy, and/or numpy) are importable.
"""

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)

try:
    from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
except ImportError:
    atable = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
except ImportError:
    np = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
except ImportError:
    pa = None
try:
    from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
except ImportError:
    pd = None
try:
    from lsst.daf.butler.formatters.parquet import (
        ArrowAstropySchema,
        ArrowNumpySchema,
        DataFrameSchema,
        ParquetFormatter,
        _append_numpy_multidim_metadata,
        _astropy_to_numpy_dict,
        _numpy_dict_to_numpy,
        _numpy_dtype_to_arrow_types,
        _numpy_style_arrays_to_arrow_arrays,
        _numpy_to_numpy_dict,
        arrow_to_astropy,
        arrow_to_numpy,
        arrow_to_numpy_dict,
        arrow_to_pandas,
        astropy_to_arrow,
        compute_row_group_size,
        numpy_dict_to_arrow,
        numpy_to_arrow,
        pandas_to_arrow,
    )
except ImportError:
    pa = None
    pd = None
    atable = None
    np = None

from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data
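
# Illustrative sketch (not an actual test): the helper above returns a
# structured array, so the column layout can be inspected directly, e.g.:
#
#     tbl = _makeSimpleNumpyTable()
#     tbl.dtype.names  # ('index', 'a', 'b', 'c', 'ddd', 'f', 'strcol', 'bytecol')
#     len(tbl)         # 5 rows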


def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.
    include_lists : `bool`
        Include list columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    if include_lists:
        nrow = len(df)

        df["l1"] = [[0, 0]] * nrow
        df["l2"] = [[0.0, 0.0]] * nrow
        df["l3"] = [[]] * nrow

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns
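
# Illustrative sketch (standard pandas behavior, not an actual test):
# ``allColumns`` appends the index names to the data columns, so it matches
# the full column set recovered when reading the "columns" component, e.g.:
#
#     df, allColumns = _makeSingleIndexDataFrame()
#     "index" in allColumns  # True, even though "index" is the frame's index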


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df
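
# Illustrative sketch: with the ("filter", "column") MultiIndex above, the
# column-selection parameters exercised in the tests below correspond to
# plain pandas indexing, e.g.:
#
#     df = _makeMultiIndexDataFrame()
#     df.loc[:, ["g"]]         # every column under filter "g"
#     df.loc[:, [("r", "a")]]  # the single (filter="r", column="a") column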


def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)
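
# Illustrative sketch: the conversion helpers imported from
# lsst.daf.butler.formatters.parquet are designed to round-trip, which is
# what most of the tests below verify, e.g.:
#
#     tab_astropy = _makeSimpleAstropyTable()
#     tab_arrow = astropy_to_arrow(tab_astropy)
#     tab_back = arrow_to_astropy(tab_arrow)  # same columns, units, metadata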


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run=self.run)
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testSingleIndexDataFrameWithLists(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_lists=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})

        # We need to check the list columns specially because they go
        # from lists to arrays on read (see the sketch after this method).
        for col in ["l1", "l2", "l3"]:
            for i in range(len(df1)):
                self.assertTrue(np.all(df2[col].values[i] == df1[col].values[i]))
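
    # Illustrative sketch (mirroring the loop above, not an actual test): a
    # list column is stored as an arrow list type, so a cell that was a
    # Python list such as df1["l1"][0] == [0, 0] comes back as a numpy
    # array, e.g. np.array([0, 0]); the values still compare equal
    # element-wise.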

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None.
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None.
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata), ensuring that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # A multi-index dataframe does not really round-trip through astropy.
        # This test simply checks that it is readable; this usage is not
        # recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # A multi-index dataframe does not really round-trip through numpy.
        # This test simply checks that it is readable; this usage is not
        # recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # A multi-index dataframe does not really round-trip through a numpy
        # dict. This test simply checks that it is readable; this usage is
        # not recommended.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run=self.run)
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities.
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow conversion works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            If `True`, skip checking column units, descriptions, and formats.
        has_bigendian : `bool`
            If `True`, compare column dtypes ignoring byte order.
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the type matches, forcing both dtypes to
                # big-endian for the comparison.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))
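
    # Illustrative sketch (standard numpy semantics, not an actual test):
    # newbyteorder(">") normalizes the byte order, so a little-endian and a
    # big-endian dtype of the same kind compare equal after conversion, e.g.:
    #
    #     np.dtype("<f8").newbyteorder(">") == np.dtype(">f8")  # True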


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality.
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            If `True`, compare column dtypes ignoring byte order.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the type matches, forcing both dtypes to
                # big-endian for the comparison.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    def testEmptyArrowTableMultidim(self):
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)
        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check that two astropy tables have the same columns and values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

    def _checkNumpyTableEquality(self, table1, table2):
        """Check that two numpy tables have the same columns and values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")
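
        # A dataset that is not an arrow table is rejected outright.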
        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")
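
        # Tuple-style column selection is not supported for arrow tables.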
        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})
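
        # Asking for a component the delegate does not know about fails.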
        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()
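
        # Check that the factory maps a pyarrow Table to the ArrowTable
        # storage class, both with and without type comparison.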
        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the lookup to match by name.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the lookup to match by name.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.")
class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpyDict, using local file store."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in the dataset type, so we do not have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})
        # Read the whole table.
        dict2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyDictEquality(dict1, dict2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(dict1.keys()))
        for name in dict1:
            self.assertIn(name, columns2)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(dict1["a"]))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        subdict = {key: dict1[key] for key in ["a", "c"]}
        self._checkNumpyDictEquality(subdict, tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        subdict = {key: dict1[key] for key in ["index", "a"]}
        self._checkNumpyDictEquality(subdict, tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        subdict = {key: dict1[key] for key in ["ddd"]}
        self._checkNumpyDictEquality(subdict, tab6)
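        # Requesting the same column twice should return it only once.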
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab7)
        # Passing an unrecognized column should raise a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyDictReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_dict = arrow_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyDictReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # The order of the dict may get mixed up, so we need to check column
        # by column. We also need to compare in dataframe form because pandas
        # changes the datatype of the string column.
        tab1_df = pd.DataFrame(tab1)

        self.assertEqual(set(tab1_df.columns), set(tab2.columns))
        for col in tab1_df.columns:
            self.assertTrue(np.all(tab1_df[col].values == tab2[col].values))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyDictReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_dict = _astropy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        tab2_dict = _numpy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictBad(self):
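        # A scalar is not a valid column value.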
        dict1 = {"a": 4, "b": np.ndarray([1])}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict1, self.datasetType, dataId={})
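
        # Columns of unequal length are rejected.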
        dict2 = {"a": np.zeros(4), "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict2, self.datasetType, dataId={})
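
        # Values must be numpy arrays, not plain Python lists.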
        dict3 = {"a": [0] * 5, "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict3, self.datasetType, dataId={})

    def _checkNumpyDictEquality(self, dict1, dict2):
        """Check that two numpy dicts have the same columns and values.

        Parameters
        ----------
        dict1 : `dict` [`str`, `np.ndarray`]
            First dict to compare.
        dict2 : `dict` [`str`, `np.ndarray`]
            Second dict to compare.
        """
        self.assertEqual(set(dict1.keys()), set(dict2.keys()))
        for name in dict1:
            self.assertEqual(dict1[name].dtype, dict2[name].dtype)
            self.assertTrue(np.all(dict1[name] == dict2[name]))


@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.")
class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteNumpyDictBad(self):
        # The sub-type checking is not done by the in-memory datastore.
        pass


@unittest.skipUnless(np is not None, "Cannot test compute_row_group_size without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test compute_row_group_size without pyarrow.")
class ComputeRowGroupSizeTestCase(unittest.TestCase):
    """Tests for compute_row_group_size."""

    def testRowGroupSizeNoMetadata(self):
        numpyTable = _makeSimpleNumpyTable(include_multidim=True)

        # We can't use the numpy_to_arrow convenience function because
        # that adds metadata.
        type_list = _numpy_dtype_to_arrow_types(numpyTable.dtype)
        schema = pa.schema(type_list)
        arrays = _numpy_style_arrays_to_arrow_arrays(
            numpyTable.dtype,
            len(numpyTable),
            numpyTable,
            schema,
        )
        arrowTable = pa.Table.from_arrays(arrays, schema=schema)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)
        self.assertLess(row_group_size, 2_000_000)

    def testRowGroupSizeWithMetadata(self):
        numpyTable = _makeSimpleNumpyTable(include_multidim=True)

        arrowTable = numpy_to_arrow(numpyTable)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)
        self.assertLess(row_group_size, 2_000_000)

    def testRowGroupSizeTinyTable(self):
        numpyTable = np.zeros(1, dtype=[("a", np.bool_)])

        arrowTable = numpy_to_arrow(numpyTable)

        row_group_size = compute_row_group_size(arrowTable.schema)
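
        # Even with single-byte rows the computed row group size should
        # stay large.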
        self.assertGreater(row_group_size, 1_000_000)

    @unittest.skipUnless(pd is not None, "Cannot run testRowGroupSizeDataFrameWithLists without pandas.")
    def testRowGroupSizeDataFrameWithLists(self):
        df = pd.DataFrame({"a": np.zeros(10), "b": [[0, 0]] * 10, "c": [[0.0, 0.0]] * 10, "d": [[]] * 10})
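        # List columns have no fixed width, so the estimate works from the
        # arrow schema alone; it should still produce a usable value.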
        arrowTable = pandas_to_arrow(df)
        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)


if __name__ == "__main__":
    unittest.main()