# This file is part of pipe_tasks.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
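
"""Unit tests for ParquetTable and MultilevelParquetTable from
lsst.pipe.tasks.parquetTable."""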

import warnings
import unittest
import copy
import functools
import pandas as pd
from pandas.testing import assert_frame_equal

import lsst.utils.tests

import pyarrow as pa
import pyarrow.parquet as pq

from lsst.pipe.tasks.parquetTable import ParquetTable, MultilevelParquetTable


def setup_module(module):
    lsst.utils.tests.init()


class ParquetTableTestCase(unittest.TestCase):
    """Test case for ParquetTable.
    """

    def simulateDF(self):
        """Create a simple test DataFrame.
        """
        df = pd.DataFrame({
            "coord_ra": [3.77654137, 3.77643059, 3.77621148, 3.77611944, 3.77610396],
            "coord_dec": [0.01127624, 0.01127787, 0.01127543, 0.01127543, 0.01127543],
            "slot_Centroid_flag": [True, True, True, True, True],
            "slot_Centroid_x": [16208., 16344., 16613., 16726., 16745.],
            "slot_Centroid_y": [15905., 15907., 15904., 15904., 15904.],
            "slot_PsfFlux_apCorr": [0.98636465, 0.98437287, 0.97212515, 0.97179828, 0.97182371],
            "slot_PsfFlux_apCorrSigma": [0., 0., 0., 0., 0.],
            "slot_PsfFlux_flag": [True, True, True, True, True],
            "slot_PsfFlux_instFlux": [0.28106412, 1.98260751, 0.08900771, 1.11375753, 1.3835924],
            "slot_PsfFlux_instFluxSigma": [0.22967081, 0.25409701, 0.2120654, 0.23031162, 0.24262261],
            "calib_psfUsed": [False, False, False, False, False],
            "detect_isPatchInner": [False, False, False, False, False],
            "detect_isPrimary": [False, False, False, False, False],
            "detect_isTractInner": [True, True, True, True, True]})
        return df

    def setUp(self):
        self.df = self.simulateDF()
        # Round-trip the simulated frame through a temporary Parquet file;
        # the ParquetTable objects must be built before the file is removed
        # on exit from the context manager.
        with lsst.utils.tests.getTempFilePath('*.parq') as filename:
            table = pa.Table.from_pandas(self.df)
            pq.write_table(table, filename)
            self.parq, self.dfParq = self.getParq(filename, self.df)

    def tearDown(self):
        del self.df
        del self.parq
        del self.dfParq

    def getParq(self, filename, df):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            fromFile, fromDF = ParquetTable(filename), ParquetTable(dataFrame=df)

        return fromFile, fromDF

    def testRoundTrip(self):
        self.assertTrue(self.parq.toDataFrame().equals(self.df))

    def testColumns(self):
        columns = ['coord_ra', 'coord_dec']
        self.assertTrue(self.parq.toDataFrame(columns=columns).equals(self.df[columns]))

        # TODO: DM-21976 Confirm this is the behavior we want
        # Quietly ignore nonsense columns
        self.assertTrue(self.parq.toDataFrame(columns=columns + ['hello']).equals(self.df[columns]))


class MultilevelParquetTableTestCase(ParquetTableTestCase):
    """Test case for MultilevelParquetTable.
    """

    def simulateDF(self):
        self.datasets = ['meas', 'ref']
        self.filters = ['G', 'R']
        self.columns = ['coord_ra', 'coord_dec']
        simpleDF = super().simulateDF()
        dfFilterDSCombos = []
        for ds in self.datasets:
            for filterName in self.filters:
                df = copy.copy(simpleDF)
                # reindex returns a new frame; the result must be assigned.
                df = df.reindex(sorted(df.columns), axis=1)
                df['dataset'] = ds
                df['filter'] = filterName
                df.columns = pd.MultiIndex.from_tuples([(ds, filterName, c) for c in df.columns],
                                                       names=('dataset', 'filter', 'column'))
                dfFilterDSCombos.append(df)

        # Join the per-(dataset, filter) frames side by side into one wide frame.
        return functools.reduce(lambda d1, d2: d1.join(d2), dfFilterDSCombos)

    def getParq(self, filename, df):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            fromFile = MultilevelParquetTable(filename)
            fromDf = MultilevelParquetTable(dataFrame=df)
        return fromFile, fromDf

    def testProperties(self):
        self.assertEqual(list(self.parq.columnLevels), list(self.df.columns.names))
        self.assertEqual(len(self.parq.columns), len(self.df.columns))

        self.assertEqual(list(self.dfParq.columnLevels), list(self.df.columns.names))
        self.assertEqual(len(self.dfParq.columns), len(self.df.columns))

    def testColumns(self):
        df = self.df
        parq = self.parq

        # Case A: each level has multiple values
        datasets_A = self.datasets
        filters_A = self.filters
        columns_A = self.columns
        columnDict_A = {'dataset': datasets_A,
                        'filter': filters_A,
                        'column': columns_A}
        colTuples_A = [(self.datasets[0], self.filters[0], self.columns[0]),
                       (self.datasets[0], self.filters[0], self.columns[1]),
                       (self.datasets[0], self.filters[1], self.columns[0]),
                       (self.datasets[0], self.filters[1], self.columns[1]),
                       (self.datasets[1], self.filters[0], self.columns[0]),
                       (self.datasets[1], self.filters[0], self.columns[1]),
                       (self.datasets[1], self.filters[1], self.columns[0]),
                       (self.datasets[1], self.filters[1], self.columns[1])]
        df_A = df[colTuples_A]
        assert_frame_equal(parq.toDataFrame(columns=columnDict_A), df_A)

        # Case A1: add a bogus column and test that it gets ignored
        datasets_A1 = self.datasets
        filters_A1 = self.filters
        columns_A1 = self.columns + ['garbage']
        columnDict_A1 = {'dataset': datasets_A1,
                         'filter': filters_A1,
                         'column': columns_A1}
        # The expected selection is identical to Case A; the bogus column
        # is simply dropped.
        colTuples_A1 = colTuples_A
        df_A1 = df[colTuples_A1]
        assert_frame_equal(parq.toDataFrame(columns=columnDict_A1), df_A1)

        # Case B: one level has only a single value
        datasets_B = self.datasets[0]
        filters_B = self.filters
        columns_B = self.columns
        columnDict_B = {'dataset': datasets_B,
                        'filter': filters_B,
                        'column': columns_B}
        colTuples_B = [(self.datasets[0], self.filters[0], self.columns[0]),
                       (self.datasets[0], self.filters[0], self.columns[1]),
                       (self.datasets[0], self.filters[1], self.columns[0]),
                       (self.datasets[0], self.filters[1], self.columns[1])]
        df_B = df[colTuples_B]
        df_B.columns = df_B.columns.droplevel('dataset')
        assert_frame_equal(parq.toDataFrame(columns=columnDict_B), df_B)
        assert_frame_equal(df_B, parq.toDataFrame(columns=colTuples_B))

        # When explicit columns are not provided, parq.toDataFrame()
        # appears to return the columns in sorted order (the mechanism is
        # not obvious), so the reference frames below are sorted with
        # sort_index(axis=1) before comparing.
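        # For illustration: after sorting, the Case C reference frame's
        # columns run ['calib_psfUsed', 'coord_dec', 'coord_ra', 'dataset',
        # 'detect_isPatchInner', ...], whatever order simulateDF() created
        # them in.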

        # Case C: two levels have a single value; the third is not provided
        datasets_C = self.datasets[0]
        filters_C = self.filters[0]
        columnDict_C = {'dataset': datasets_C,
                        'filter': filters_C}
        df_C = df[datasets_C][filters_C].sort_index(axis=1)

        self.assertTrue(parq.toDataFrame(columns=columnDict_C).equals(df_C))

        # Case D: only one level (the first) is provided
        dataset_D = self.datasets[0]
        columnDict_D = {'dataset': dataset_D}
        df_D = df[dataset_D].sort_index(axis=1)
        self.assertTrue(parq.toDataFrame(columns=columnDict_D).equals(df_D))

        # Case E: only one level (the second) is provided
        filters_E = self.filters[1]
        columnDict_E = {'filter': filters_E}
        # Extract the second level of the column MultiIndex with .xs().
        df_E = df.xs(filters_E, level='filter', axis=1).sort_index(axis=1)
        self.assertTrue(parq.toDataFrame(columns=columnDict_E).equals(df_E))

        # When none of the requested columns exist, a ValueError is raised
        columnDictNonsense = {'dataset': 'meas', 'filter': 'G', 'column': ['hello']}
        self.assertRaises(ValueError, parq.toDataFrame, columns=columnDictNonsense)

        # Case when only some of the requested columns don't exist.
        # TODO: DM-21976 Confirm this is the behavior we want
        # Quietly ignore nonsense columns
        columnDictSomeNonsense = {'dataset': 'meas', 'filter': 'G', 'column': ('coord_ra', 'hello')}
        dfGood = pd.DataFrame(df['meas']['G']['coord_ra'])
        self.assertTrue(parq.toDataFrame(columns=columnDictSomeNonsense).equals(dfGood))


if __name__ == "__main__":
    lsst.utils.tests.init()
    unittest.main()