# NOTE: coverage.py report header (32% of 55 statements, v6.4.4, 2022-08-26)
# was embedded here by the HTML extraction; preserved as a comment so the
# module remains valid Python.
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Tests for ParquetFormatter.
24Tests in this module are disabled unless pandas and pyarrow are importable.
25"""
27import os
28import unittest
30try:
31 import numpy as np
32 import pandas as pd
33 import pyarrow.parquet
34except ImportError:
35 pyarrow = None
37from lsst.daf.butler import Butler, DatasetType
38from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir
40TESTDIR = os.path.abspath(os.path.dirname(__file__))
@unittest.skipUnless(pyarrow is not None, "Cannot test ParquetFormatter without pyarrow.")
class ParquetFormatterTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, using local file datastore."""

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root)
        self.butler = Butler(self.root, run="test_run")
        # A dimensionless dataset type means no dimension records need to
        # be inserted and the data ID can simply be empty.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        """Remove the temporary butler repository created in setUp."""
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        """Round-trip a DataFrame whose columns are a plain single-level
        Index, including component and parameterized reads.
        """
        cols = pd.Index(["a", "b", "c"])
        frame = pd.DataFrame(np.random.randn(5, 3), index=np.arange(5, dtype=int), columns=cols)
        self.butler.put(frame, self.datasetType, dataId={})
        # A full read must reproduce the stored frame exactly.
        roundtrip = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(frame.equals(roundtrip))
        # The "columns" component exposes just the column index.
        stored_cols = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(frame.columns.equals(stored_cols))
        # A column subset can be requested as a list of names...
        subset = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(frame.loc[:, ["a", "c"]].equals(subset))
        # ...or as a single bare name.
        single = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(frame.loc[:, ["a"]].equals(single))
        # Requesting an unknown column must raise ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testMultiIndexDataFrame(self):
        """Round-trip a DataFrame whose columns form a two-level
        MultiIndex, exercising the dict-based column selection forms.
        """
        cols = pd.MultiIndex.from_tuples(
            [
                ("g", "a"),
                ("g", "b"),
                ("g", "c"),
                ("r", "a"),
                ("r", "b"),
                ("r", "c"),
            ],
            names=["filter", "column"],
        )
        frame = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=cols)
        self.butler.put(frame, self.datasetType, dataId={})
        # A full read must reproduce the stored frame exactly.
        roundtrip = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(frame.equals(roundtrip))
        # The "columns" component exposes the full MultiIndex.
        stored_cols = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(frame.columns.equals(stored_cols))
        # Columns can be selected by a dict keyed on index level name...
        by_level = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(frame.loc[:, ["g"]].equals(by_level))
        # ...by a dict constraining several levels at once...
        combined = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(frame.loc[:, [("r", "a")]].equals(combined))
        # ...or by an explicit list of column tuples.
        wanted = [("g", "a"), ("r", "c")]
        by_tuples = self.butler.get(self.datasetType, dataId={}, parameters={"columns": wanted})
        self.assertTrue(frame.loc[:, wanted].equals(by_tuples))
        # Requesting an unknown column must raise ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})
# Run the tests when this module is executed as a script.
if __name__ == "__main__":
    unittest.main()