Coverage for tests/test_parquet.py : 35%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Tests for ParquetFormatter.
24Tests in this module are disabled unless pandas and pyarrow are importable.
25"""
# Standard-library imports, sorted per PEP 8.
import os
import shutil
import tempfile
import unittest
# Optional test dependencies.  If any of numpy/pandas/pyarrow is missing,
# ``pyarrow`` is bound to None and the skipUnless decorator below disables
# the whole test case instead of raising at import time.
try:
    import numpy as np
    import pandas as pd
    import pyarrow.parquet
except ImportError:
    pyarrow = None
39from lsst.daf.butler import Butler, DatasetType
# Absolute path of the directory containing this test module; used as the
# parent directory for per-test temporary butler repositories.
TESTDIR = os.path.abspath(os.path.dirname(__file__))
@unittest.skipUnless(pyarrow is not None, "Cannot test ParquetFormatter without pyarrow.")
class ParquetFormatterTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, using local file datastore."""

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = tempfile.mkdtemp(dir=TESTDIR)
        Butler.makeRepo(self.root)
        self.butler = Butler(self.root, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType("data", dimensions=(), storageClass="DataFrame",
                                       universe=self.butler.registry.dimensions)
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        """Remove the per-test butler root created in setUp."""
        # ignore_errors=True already tolerates a missing or partially
        # removed directory, so no existence check is needed first.
        shutil.rmtree(self.root, ignore_errors=True)

    def testSingleIndexDataFrame(self):
        """Round-trip a DataFrame with a plain single-level column index."""
        columns1 = pd.Index(["a", "b", "c"])
        df1 = pd.DataFrame(np.random.randn(5, 3), index=np.arange(5, dtype=int), columns=columns1)
        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        # A bare string column selector should behave like a one-element list.
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testMultiIndexDataFrame(self):
        """Round-trip a DataFrame with a two-level (filter, column) MultiIndex."""
        columns1 = pd.MultiIndex.from_tuples(
            [
                ("g", "a"),
                ("g", "b"),
                ("g", "c"),
                ("r", "a"),
                ("r", "b"),
                ("r", "c"),
            ],
            names=["filter", "column"],
        )
        df1 = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns1)
        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read just some columns a few different ways, selecting by
        # MultiIndex level name rather than by full column tuple.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={},
                              parameters={"columns": {"filter": ["r"], "column": "a"}})
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})
if __name__ == "__main__":
    unittest.main()