# This file is part of pipe_tasks.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import unittest
import copy
import functools
import pandas as pd
from pandas.testing import assert_frame_equal

import lsst.utils.tests

import pyarrow as pa
import pyarrow.parquet as pq
from lsst.pipe.tasks.parquetTable import ParquetTable, MultilevelParquetTable
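

# setup_module is the standard pytest module-level setup hook;
# lsst.utils.tests.init() prepares the LSST test utilities before any
# tests in this module run.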
def setup_module(module):
    lsst.utils.tests.init()


class ParquetTableTestCase(unittest.TestCase):
    """Test case for ParquetTable."""

    def simulateDF(self):
        """Create a simple test DataFrame."""
        df = pd.DataFrame({
            "coord_ra": [3.77654137, 3.77643059, 3.77621148, 3.77611944, 3.77610396],
            "coord_dec": [0.01127624, 0.01127787, 0.01127543, 0.01127543, 0.01127543],
            "slot_Centroid_flag": [True, True, True, True, True],
            "slot_Centroid_x": [16208., 16344., 16613., 16726., 16745.],
            "slot_Centroid_y": [15905., 15907., 15904., 15904., 15904.],
            "slot_PsfFlux_apCorr": [0.98636465, 0.98437287, 0.97212515, 0.97179828, 0.97182371],
            "slot_PsfFlux_apCorrSigma": [0., 0., 0., 0., 0.],
            "slot_PsfFlux_flag": [True, True, True, True, True],
            "slot_PsfFlux_instFlux": [0.28106412, 1.98260751, 0.08900771, 1.11375753, 1.3835924],
            "slot_PsfFlux_instFluxSigma": [0.22967081, 0.25409701, 0.2120654, 0.23031162, 0.24262261],
            "calib_psfUsed": [False, False, False, False, False],
            "detect_isPatchInner": [False, False, False, False, False],
            "detect_isPrimary": [False, False, False, False, False],
            "detect_isTractInner": [True, True, True, True, True]})
        return df
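
    # Round-trip setup: write the simulated frame to a temporary parquet
    # file with pyarrow, then build one table from that file and a second
    # directly from the in-memory DataFrame (see getParq).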
    def setUp(self):
        self.df = self.simulateDF()
        with lsst.utils.tests.getTempFilePath('*.parq') as filename:
            table = pa.Table.from_pandas(self.df)
            pq.write_table(table, filename)
            self.parq, self.dfParq = self.getParq(filename, self.df)

    def tearDown(self):
        del self.df
        del self.parq
        del self.dfParq
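
    # Subclasses override getParq to construct the appropriate table
    # flavor; it returns a (file-backed, DataFrame-backed) pair.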
    def getParq(self, filename, df):
        return ParquetTable(filename), ParquetTable(dataFrame=df)

    def testRoundTrip(self):
        self.assertTrue(self.parq.toDataFrame().equals(self.df))

    def testColumns(self):
        columns = ['coord_ra', 'coord_dec']
        self.assertTrue(self.parq.toDataFrame(columns=columns).equals(self.df[columns]))

        # Unrecognized columns are quietly ignored.
        # TODO: DM-21976 Confirm this is the behavior we want
        self.assertTrue(self.parq.toDataFrame(columns=columns + ['hello']).equals(self.df[columns]))


class MultilevelParquetTableTestCase(ParquetTableTestCase):
    """Test case for MultilevelParquetTable."""

    def simulateDF(self):
        self.datasets = ['meas', 'ref']
        self.filters = ['G', 'R']
        self.columns = ['coord_ra', 'coord_dec']
        simpleDF = super().simulateDF()
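        # Build one copy of the simple frame per (dataset, filter)
        # combination, label its columns with a three-level MultiIndex
        # (dataset, filter, column), and join the pieces column-wise on
        # the shared row index.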
        dfFilterDSCombos = []
        for ds in self.datasets:
            for filterName in self.filters:
                df = copy.copy(simpleDF)
                df = df.reindex(sorted(df.columns), axis=1)
                df['dataset'] = ds
                df['filter'] = filterName
                df.columns = pd.MultiIndex.from_tuples([(ds, filterName, c) for c in df.columns],
                                                       names=('dataset', 'filter', 'column'))
                dfFilterDSCombos.append(df)

        return functools.reduce(lambda d1, d2: d1.join(d2), dfFilterDSCombos)

    def getParq(self, filename, df):
        fromFile = MultilevelParquetTable(filename)
        fromDf = MultilevelParquetTable(dataFrame=df)
        return fromFile, fromDf

    def testProperties(self):
        self.assertTrue(all(x == y for x, y in zip(self.parq.columnLevels, self.df.columns.names)))
        self.assertEqual(len(self.parq.columns), len(self.df.columns))

        self.assertTrue(all(x == y for x, y in zip(self.dfParq.columnLevels, self.df.columns.names)))
        self.assertEqual(len(self.dfParq.columns), len(self.df.columns))

    def testColumns(self):
        df = self.df
        parq = self.parq
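
        # Columns are requested with a dict keyed on the column-index level
        # names ('dataset', 'filter', 'column'). As the cases below verify:
        # levels omitted from the dict match all of their values, and levels
        # pinned to a single value are dropped from the returned column index.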

        # Case A: each level has multiple values
        datasets_A = self.datasets
        filters_A = self.filters
        columns_A = self.columns
        columnDict_A = {'dataset': datasets_A,
                        'filter': filters_A,
                        'column': columns_A}
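        # The expected selection is the full Cartesian product of the three
        # level values (what itertools.product(datasets_A, filters_A,
        # columns_A) would yield), spelled out explicitly here.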
        colTuples_A = [(self.datasets[0], self.filters[0], self.columns[0]),
                       (self.datasets[0], self.filters[0], self.columns[1]),
                       (self.datasets[0], self.filters[1], self.columns[0]),
                       (self.datasets[0], self.filters[1], self.columns[1]),
                       (self.datasets[1], self.filters[0], self.columns[0]),
                       (self.datasets[1], self.filters[0], self.columns[1]),
                       (self.datasets[1], self.filters[1], self.columns[0]),
                       (self.datasets[1], self.filters[1], self.columns[1])]
        df_A = df[colTuples_A]
        assert_frame_equal(parq.toDataFrame(columns=columnDict_A), df_A)

        # Case A1: add a bogus column and check that it is ignored
        datasets_A1 = self.datasets
        filters_A1 = self.filters
        columns_A1 = self.columns + ['garbage']
        columnDict_A1 = {'dataset': datasets_A1,
                         'filter': filters_A1,
                         'column': columns_A1}
        colTuples_A1 = [(self.datasets[0], self.filters[0], self.columns[0]),
                        (self.datasets[0], self.filters[0], self.columns[1]),
                        (self.datasets[0], self.filters[1], self.columns[0]),
                        (self.datasets[0], self.filters[1], self.columns[1]),
                        (self.datasets[1], self.filters[0], self.columns[0]),
                        (self.datasets[1], self.filters[0], self.columns[1]),
                        (self.datasets[1], self.filters[1], self.columns[0]),
                        (self.datasets[1], self.filters[1], self.columns[1])]
        df_A1 = df[colTuples_A1]
        assert_frame_equal(parq.toDataFrame(columns=columnDict_A1), df_A1)

        # Case B: one level has only a single value
        datasets_B = self.datasets[0]
        filters_B = self.filters
        columns_B = self.columns
        columnDict_B = {'dataset': datasets_B,
                        'filter': filters_B,
                        'column': columns_B}
        colTuples_B = [(self.datasets[0], self.filters[0], self.columns[0]),
                       (self.datasets[0], self.filters[0], self.columns[1]),
                       (self.datasets[0], self.filters[1], self.columns[0]),
                       (self.datasets[0], self.filters[1], self.columns[1])]
        df_B = df[colTuples_B]
        df_B.columns = df_B.columns.droplevel('dataset')
        assert_frame_equal(parq.toDataFrame(columns=columnDict_B), df_B)
        assert_frame_equal(df_B, parq.toDataFrame(columns=colTuples_B))

        # When explicit columns are not provided, parq.toDataFrame() returns
        # its columns in sorted order, so the reference frame must be sorted
        # with sort_index(axis=1) before comparing. This sorting appears to be
        # a side effect of how toDataFrame() rebuilds the column index rather
        # than a documented guarantee.

        # Case C: two levels have a single value; the third is not provided
        datasets_C = self.datasets[0]
        filters_C = self.filters[0]
        columnDict_C = {'dataset': datasets_C,
                        'filter': filters_C}
        df_C = df[datasets_C][filters_C].sort_index(axis=1)

        self.assertTrue(parq.toDataFrame(columns=columnDict_C).equals(df_C))

        # Case D: only one level (the first) is provided
        dataset_D = self.datasets[0]
        columnDict_D = {'dataset': dataset_D}
        df_D = df[dataset_D].sort_index(axis=1)
        self.assertTrue(parq.toDataFrame(columns=columnDict_D).equals(df_D))

        # Case E: only one level (the second) is provided
        filters_E = self.filters[1]
        columnDict_E = {'filter': filters_E}
        # Select the second level of the column MultiIndex with .xs()
        df_E = df.xs(filters_E, level=1, axis=1).sort_index(axis=1)
        self.assertTrue(parq.toDataFrame(columns=columnDict_E).equals(df_E))

        # When none of the requested columns exist, a ValueError is raised
        columnDictNonsense = {'dataset': 'meas', 'filter': 'G', 'column': 'hello'}
        self.assertRaises(ValueError, parq.toDataFrame, columns=columnDictNonsense)

        # When only some of the requested columns exist, the missing ones
        # are quietly ignored.
        # TODO: DM-21976 Confirm this is the behavior we want
        columnDictSomeNonsense = {'dataset': 'meas', 'filter': 'G', 'column': ('coord_ra', 'hello')}
        dfGood = pd.DataFrame(df['meas']['G']['coord_ra'])
        self.assertTrue(parq.toDataFrame(columns=columnDictSomeNonsense).equals(dfGood))


if __name__ == "__main__":
    lsst.utils.tests.init()
    unittest.main()