Coverage for tests/test_parquet.py : 34%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Tests for ParquetFormatter.
Tests in this module are disabled unless numpy, pandas, and pyarrow are all importable.
25"""
27import os
28import unittest
30try:
31 import numpy as np
32 import pandas as pd
33 import pyarrow.parquet
34except ImportError:
35 pyarrow = None
37from lsst.daf.butler import Butler, DatasetType
38from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir
# Absolute path of the directory containing this test module; used as the
# parent location when creating the per-test temporary butler root.
TESTDIR = os.path.abspath(os.path.dirname(__file__))
@unittest.skipUnless(pyarrow is not None, "Cannot test ParquetFormatter without pyarrow.")
class ParquetFormatterTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, using local file datastore.

    Each test writes a `pandas.DataFrame` through a freshly created butler
    repository and reads it back, both in full and restricted by the
    ``columns`` parameter.
    """

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root)
        self.butler = Butler(self.root, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType("data", dimensions=(), storageClass="DataFrame",
                                       universe=self.butler.registry.dimensions)
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        """Remove the temporary butler root created in `setUp`."""
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        """Round-trip a DataFrame whose columns are a flat `pandas.Index`."""
        columns1 = pd.Index(["a", "b", "c"])
        df1 = pd.DataFrame(np.random.randn(5, 3), index=np.arange(5, dtype=int), columns=columns1)
        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        # A bare string selects a single column, still returned as a DataFrame.
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testMultiIndexDataFrame(self):
        """Round-trip a DataFrame whose columns are a two-level MultiIndex."""
        columns1 = pd.MultiIndex.from_tuples(
            [
                ("g", "a"),
                ("g", "b"),
                ("g", "c"),
                ("r", "a"),
                ("r", "b"),
                ("r", "c"),
            ],
            names=["filter", "column"],
        )
        df1 = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns1)
        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read just some columns a few different ways.
        # Select by a single index level, keyed by the level name.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        # Select by both levels; values may be scalars or lists.
        df4 = self.butler.get(self.datasetType, dataId={},
                              parameters={"columns": {"filter": ["r"], "column": "a"}})
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        # Select by an explicit list of full column tuples.
        column_list = [('g', 'a'), ('r', 'c')]
        df5 = self.butler.get(self.datasetType, dataId={},
                              parameters={'columns': column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()