# tests/test_parquet.py

# This file is part of pipe_tasks.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import unittest
import copy
import functools
import pandas as pd
from pandas.testing import assert_frame_equal

import lsst.utils.tests

# TODO: Remove skipUnless and this try block DM-22256
try:
    import pyarrow as pa
    import pyarrow.parquet as pq
    from lsst.pipe.tasks.parquetTable import ParquetTable, MultilevelParquetTable
    havePyArrow = True
except ImportError:
    havePyArrow = False


def setup_module(module):
    lsst.utils.tests.init()


@unittest.skipUnless(havePyArrow, "Requires pyarrow")
class ParquetTableTestCase(unittest.TestCase):
    """Test case for ParquetTable."""

    def simulateDF(self):
        """Create a simple test DataFrame."""
        df = pd.DataFrame({
            "coord_ra": [3.77654137, 3.77643059, 3.77621148, 3.77611944, 3.77610396],
            "coord_dec": [0.01127624, 0.01127787, 0.01127543, 0.01127543, 0.01127543],
            "slot_Centroid_flag": [True, True, True, True, True],
            "slot_Centroid_x": [16208., 16344., 16613., 16726., 16745.],
            "slot_Centroid_y": [15905., 15907., 15904., 15904., 15904.],
            "slot_PsfFlux_apCorr": [0.98636465, 0.98437287, 0.97212515, 0.97179828, 0.97182371],
            "slot_PsfFlux_apCorrSigma": [0., 0., 0., 0., 0.],
            "slot_PsfFlux_flag": [True, True, True, True, True],
            "slot_PsfFlux_instFlux": [0.28106412, 1.98260751, 0.08900771, 1.11375753, 1.3835924],
            "slot_PsfFlux_instFluxSigma": [0.22967081, 0.25409701, 0.2120654, 0.23031162, 0.24262261],
            "calib_psfUsed": [False, False, False, False, False],
            "detect_isPatchInner": [False, False, False, False, False],
            "detect_isPrimary": [False, False, False, False, False],
            "detect_isTractInner": [True, True, True, True, True]})
        return df

    def setUp(self):
        self.df = self.simulateDF()
        with lsst.utils.tests.getTempFilePath('*.parq') as filename:
            table = pa.Table.from_pandas(self.df)
            pq.write_table(table, filename, compression='none')
            self.parq, self.dfParq = self.getParq(filename, self.df)

    def tearDown(self):
        del self.df
        del self.parq
        del self.dfParq

    def getParq(self, filename, df):
        return ParquetTable(filename), ParquetTable(dataFrame=df)

    def testRoundTrip(self):
        self.assertTrue(self.parq.toDataFrame().equals(self.df))

    def testColumns(self):
        columns = ['coord_ra', 'coord_dec']
        self.assertTrue(self.parq.toDataFrame(columns=columns).equals(self.df[columns]))

        # TODO: DM-21976 Confirm this is the behavior we want
        # Quietly ignore nonsense columns
        self.assertTrue(self.parq.toDataFrame(columns=columns + ['hello']).equals(self.df[columns]))
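
        # For comparison: recent pandas surfaces a missing label rather than
        # dropping it; df[columns + ['hello']] raises KeyError, and
        # df.reindex(columns=columns + ['hello']) adds a 'hello' column of
        # NaN. ParquetTable.toDataFrame quietly omits the unknown name.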


@unittest.skipUnless(havePyArrow, "Requires pyarrow")
class MultilevelParquetTableTestCase(ParquetTableTestCase):
    """Test case for MultilevelParquetTable."""

    def simulateDF(self):
        self.datasets = ['meas', 'ref']
        self.filters = ['G', 'R']
        self.columns = ['coord_ra', 'coord_dec']
        simpleDF = super().simulateDF()
        dfFilterDSCombos = []
        for ds in self.datasets:
            for filterName in self.filters:
                df = copy.copy(simpleDF)
                df = df.reindex(sorted(df.columns), axis=1)
                df['dataset'] = ds
                df['filter'] = filterName
                df.columns = pd.MultiIndex.from_tuples(
                    [(ds, filterName, c) for c in df.columns],
                    names=('dataset', 'filter', 'column'))
                dfFilterDSCombos.append(df)

        return functools.reduce(lambda d1, d2: d1.join(d2), dfFilterDSCombos)
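
    # The joined frame carries a three-level column MultiIndex named
    # ('dataset', 'filter', 'column'), with entries such as
    # ('meas', 'G', 'coord_ra') or ('ref', 'R', 'coord_dec').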

    def getParq(self, filename, df):
        fromFile = MultilevelParquetTable(filename)
        fromDf = MultilevelParquetTable(dataFrame=df)
        return fromFile, fromDf

    def testProperties(self):
        self.assertTrue(all(x == y for x, y in zip(self.parq.columnLevels, self.df.columns.names)))
        self.assertEqual(len(self.parq.columns), len(self.df.columns))

        self.assertTrue(all(x == y for x, y in zip(self.dfParq.columnLevels, self.df.columns.names)))
        self.assertEqual(len(self.dfParq.columns), len(self.df.columns))
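
    # With the frame from simulateDF, both tables should report columnLevels
    # matching ('dataset', 'filter', 'column') and
    # 2 datasets x 2 filters x 16 names = 64 columns.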

    def testColumns(self):
        df = self.df
        parq = self.parq

        # Case A: each level has multiple values
        datasets_A = self.datasets
        filters_A = self.filters
        columns_A = self.columns
        columnDict_A = {'dataset': datasets_A,
                        'filter': filters_A,
                        'column': columns_A}
        colTuples_A = [(self.datasets[0], self.filters[0], self.columns[0]),
                       (self.datasets[0], self.filters[0], self.columns[1]),
                       (self.datasets[0], self.filters[1], self.columns[0]),
                       (self.datasets[0], self.filters[1], self.columns[1]),
                       (self.datasets[1], self.filters[0], self.columns[0]),
                       (self.datasets[1], self.filters[0], self.columns[1]),
                       (self.datasets[1], self.filters[1], self.columns[0]),
                       (self.datasets[1], self.filters[1], self.columns[1])]
        df_A = df[colTuples_A]
        assert_frame_equal(parq.toDataFrame(columns=columnDict_A), df_A)

        # Case A1: add a bogus column and check that it is quietly ignored
        datasets_A1 = self.datasets
        filters_A1 = self.filters
        columns_A1 = self.columns + ['garbage']
        columnDict_A1 = {'dataset': datasets_A1,
                         'filter': filters_A1,
                         'column': columns_A1}
        colTuples_A1 = colTuples_A  # expected columns are the same as Case A
        df_A1 = df[colTuples_A1]
        assert_frame_equal(parq.toDataFrame(columns=columnDict_A1), df_A1)

        # Case B: one level has only a single value
        datasets_B = self.datasets[0]
        filters_B = self.filters
        columns_B = self.columns
        columnDict_B = {'dataset': datasets_B,
                        'filter': filters_B,
                        'column': columns_B}
        colTuples_B = [(self.datasets[0], self.filters[0], self.columns[0]),
                       (self.datasets[0], self.filters[0], self.columns[1]),
                       (self.datasets[0], self.filters[1], self.columns[0]),
                       (self.datasets[0], self.filters[1], self.columns[1])]
        df_B = df[colTuples_B]
        df_B.columns = df_B.columns.droplevel('dataset')
        assert_frame_equal(parq.toDataFrame(columns=columnDict_B), df_B)
        assert_frame_equal(df_B, parq.toDataFrame(columns=colTuples_B))
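
        # When a requested level collapses to a single value, toDataFrame
        # drops that level, much as pandas partial indexing does:
        # df['meas'] likewise returns a frame indexed only by
        # ('filter', 'column').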

        # When explicit columns are not provided, parq.toDataFrame() returns
        # its columns in sorted order, so the expected frames below are put
        # through sort_index(axis=1) before comparison.

        # Case C: two levels have a single value; the third is not provided
        datasets_C = self.datasets[0]
        filters_C = self.filters[0]
        columnDict_C = {'dataset': datasets_C,
                        'filter': filters_C}
        df_C = df[datasets_C][filters_C].sort_index(axis=1)

        self.assertTrue(parq.toDataFrame(columns=columnDict_C).equals(df_C))

        # Case D: only one level (the first) is provided
        dataset_D = self.datasets[0]
        columnDict_D = {'dataset': dataset_D}
        df_D = df[dataset_D].sort_index(axis=1)
        self.assertTrue(parq.toDataFrame(columns=columnDict_D).equals(df_D))

        # Case E: only one level (the second) is provided
        filters_E = self.filters[1]
        columnDict_E = {'filter': filters_E}
        # Select the second level of the column MultiIndex with .xs()
        df_E = df.xs(filters_E, level=1, axis=1).sort_index(axis=1)
        self.assertTrue(parq.toDataFrame(columns=columnDict_E).equals(df_E))

        # Case when none of the requested columns exist
        columnDictNonsense = {'dataset': 'meas', 'filter': 'G', 'column': 'hello'}
        self.assertRaises(ValueError, parq.toDataFrame, columns=columnDictNonsense)

        # Case when only some requested columns don't exist.
        # TODO: DM-21976 Confirm this is the behavior we want
        # Quietly ignore nonsense columns
        columnDictSomeNonsense = {'dataset': 'meas', 'filter': 'G', 'column': ('coord_ra', 'hello')}
        dfGood = pd.DataFrame(df['meas']['G']['coord_ra'])
        self.assertTrue(parq.toDataFrame(columns=columnDictSomeNonsense).equals(dfGood))


if __name__ == "__main__":
    lsst.utils.tests.init()
    unittest.main()