Coverage for tests/test_parquet.py: 19% (110 statements)


# This file is part of pipe_tasks.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import warnings
import unittest
import copy
import functools
import pandas as pd
from pandas.testing import assert_frame_equal

import lsst.utils.tests

import pyarrow as pa
import pyarrow.parquet as pq

from lsst.pipe.tasks.parquetTable import ParquetTable, MultilevelParquetTable
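
# xunit-style module fixture: the test runner calls setup_module once,
# before any of the tests in this module run.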

def setup_module(module):
    lsst.utils.tests.init()


class ParquetTableTestCase(unittest.TestCase):
    """Test case for ParquetTable."""

    def simulateDF(self):
        """Create a simple test DataFrame."""
        df = pd.DataFrame({
            "coord_ra": [3.77654137, 3.77643059, 3.77621148, 3.77611944, 3.77610396],
            "coord_dec": [0.01127624, 0.01127787, 0.01127543, 0.01127543, 0.01127543],
            "slot_Centroid_flag": [True, True, True, True, True],
            "slot_Centroid_x": [16208., 16344., 16613., 16726., 16745.],
            "slot_Centroid_y": [15905., 15907., 15904., 15904., 15904.],
            "slot_PsfFlux_apCorr": [0.98636465, 0.98437287, 0.97212515, 0.97179828, 0.97182371],
            "slot_PsfFlux_apCorrSigma": [0., 0., 0., 0., 0.],
            "slot_PsfFlux_flag": [True, True, True, True, True],
            "slot_PsfFlux_instFlux": [0.28106412, 1.98260751, 0.08900771, 1.11375753, 1.3835924],
            "slot_PsfFlux_instFluxSigma": [0.22967081, 0.25409701, 0.2120654, 0.23031162, 0.24262261],
            "calib_psfUsed": [False, False, False, False, False],
            "detect_isPatchInner": [False, False, False, False, False],
            "detect_isPrimary": [False, False, False, False, False],
            "detect_isTractInner": [True, True, True, True, True]})
        return df

    def setUp(self):
        self.df = self.simulateDF()
        with lsst.utils.tests.getTempFilePath('*.parq') as filename:
            table = pa.Table.from_pandas(self.df)
            pq.write_table(table, filename)
            self.parq, self.dfParq = self.getParq(filename, self.df)

    def tearDown(self):
        del self.df
        del self.parq
        del self.dfParq
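
    # ParquetTable construction may emit warnings that are irrelevant to
    # these tests, so the tables are built with warnings suppressed.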

    def getParq(self, filename, df):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            fromFile, fromDF = ParquetTable(filename), ParquetTable(dataFrame=df)

        return fromFile, fromDF

    def testRoundTrip(self):
        self.assertTrue(self.parq.toDataFrame().equals(self.df))

    def testColumns(self):
        columns = ['coord_ra', 'coord_dec']
        self.assertTrue(self.parq.toDataFrame(columns=columns).equals(self.df[columns]))

        # TO DO: DM-21976 Confirm this is the behavior we want
        # Quietly ignore nonsense columns
        self.assertTrue(self.parq.toDataFrame(columns=columns + ['hello']).equals(self.df[columns]))


class MultilevelParquetTableTestCase(ParquetTableTestCase):
    """Test case for MultilevelParquetTable."""

    def simulateDF(self):
        self.datasets = ['meas', 'ref']
        self.filters = ['G', 'R']
        self.columns = ['coord_ra', 'coord_dec']
        simpleDF = super().simulateDF()
        dfFilterDSCombos = []
        for ds in self.datasets:
            for filterName in self.filters:
                df = copy.copy(simpleDF)
                df = df.reindex(sorted(df.columns), axis=1)
                df['dataset'] = ds
                df['filter'] = filterName
                df.columns = pd.MultiIndex.from_tuples(
                    [(ds, filterName, c) for c in df.columns],
                    names=('dataset', 'filter', 'column'))
                dfFilterDSCombos.append(df)
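
        # The reduce-join below stitches the per-(dataset, filter) frames into
        # one frame whose columns form a three-level MultiIndex with tuples
        # such as ('meas', 'G', 'coord_ra') and ('ref', 'R', 'coord_dec').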

        return functools.reduce(lambda d1, d2: d1.join(d2), dfFilterDSCombos)

    def getParq(self, filename, df):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            fromFile = MultilevelParquetTable(filename)
            fromDf = MultilevelParquetTable(dataFrame=df)
        return fromFile, fromDf
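
    # columnLevels should mirror the names of the source frame's column
    # MultiIndex, ('dataset', 'filter', 'column'), for both construction
    # paths (from file and from DataFrame).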

    def testProperties(self):
        self.assertTrue(all([x == y for x, y in zip(self.parq.columnLevels, self.df.columns.names)]))
        self.assertEqual(len(self.parq.columns), len(self.df.columns))

        self.assertTrue(all([x == y for x, y in zip(self.dfParq.columnLevels, self.df.columns.names)]))
        self.assertEqual(len(self.dfParq.columns), len(self.df.columns))

    def testColumns(self):
        df = self.df
        parq = self.parq

        # Case A, each level has multiple values
        datasets_A = self.datasets
        filters_A = self.filters
        columns_A = self.columns
        columnDict_A = {'dataset': datasets_A,
                        'filter': filters_A,
                        'column': columns_A}
        colTuples_A = [(self.datasets[0], self.filters[0], self.columns[0]),
                       (self.datasets[0], self.filters[0], self.columns[1]),
                       (self.datasets[0], self.filters[1], self.columns[0]),
                       (self.datasets[0], self.filters[1], self.columns[1]),
                       (self.datasets[1], self.filters[0], self.columns[0]),
                       (self.datasets[1], self.filters[0], self.columns[1]),
                       (self.datasets[1], self.filters[1], self.columns[0]),
                       (self.datasets[1], self.filters[1], self.columns[1])]
        df_A = df[colTuples_A]
        assert_frame_equal(parq.toDataFrame(columns=columnDict_A), df_A)

        # Case A1, add a bogus column and test that it gets ignored
        datasets_A1 = self.datasets
        filters_A1 = self.filters
        columns_A1 = self.columns + ['garbage']
        columnDict_A1 = {'dataset': datasets_A1,
                         'filter': filters_A1,
                         'column': columns_A1}
        colTuples_A1 = [(self.datasets[0], self.filters[0], self.columns[0]),
                        (self.datasets[0], self.filters[0], self.columns[1]),
                        (self.datasets[0], self.filters[1], self.columns[0]),
                        (self.datasets[0], self.filters[1], self.columns[1]),
                        (self.datasets[1], self.filters[0], self.columns[0]),
                        (self.datasets[1], self.filters[0], self.columns[1]),
                        (self.datasets[1], self.filters[1], self.columns[0]),
                        (self.datasets[1], self.filters[1], self.columns[1])]
        df_A1 = df[colTuples_A1]
        assert_frame_equal(parq.toDataFrame(columns=columnDict_A1), df_A1)

        # Case B: One level has only a single value
        datasets_B = self.datasets[0]
        filters_B = self.filters
        columns_B = self.columns
        columnDict_B = {'dataset': datasets_B,
                        'filter': filters_B,
                        'column': columns_B}
        colTuples_B = [(self.datasets[0], self.filters[0], self.columns[0]),
                       (self.datasets[0], self.filters[0], self.columns[1]),
                       (self.datasets[0], self.filters[1], self.columns[0]),
                       (self.datasets[0], self.filters[1], self.columns[1])]
        df_B = df[colTuples_B]

        # A single-valued level is dropped from the returned columns, so drop
        # it from the comparison frame as well before comparing.
        df_B.columns = df_B.columns.droplevel('dataset')
        assert_frame_equal(parq.toDataFrame(columns=columnDict_B), df_B)
        assert_frame_equal(df_B, parq.toDataFrame(columns=colTuples_B))

        # When explicit columns are not provided, the returned frame comes
        # back with its column index sorted; parq.toDataFrame() appears to
        # sort by default, so sort the comparison frame the same way.

        # Case C: Two levels have a single value; third is not provided
        datasets_C = self.datasets[0]
        filters_C = self.filters[0]
        columnDict_C = {'dataset': datasets_C,
                        'filter': filters_C}
        df_C = df[datasets_C][filters_C].sort_index(axis=1)

        self.assertTrue(parq.toDataFrame(columns=columnDict_C).equals(df_C))

        # Case D: Only one level (first level) is provided
        dataset_D = self.datasets[0]
        columnDict_D = {'dataset': dataset_D}
        df_D = df[dataset_D].sort_index(axis=1)
        self.assertTrue(parq.toDataFrame(columns=columnDict_D).equals(df_D))

        # Case E: Only one level (second level) is provided
        filters_E = self.filters[1]
        columnDict_E = {'filter': filters_E}
        # Get the second level of the MultiIndex columns using .xs()
        df_E = df.xs(filters_E, level=1, axis=1).sort_index(axis=1)
        self.assertTrue(parq.toDataFrame(columns=columnDict_E).equals(df_E))

        # Case when all requested columns don't exist
        columnDictNonsense = {'dataset': 'meas', 'filter': 'G', 'column': ('hello',)}
        self.assertRaises(ValueError, parq.toDataFrame, columns=columnDictNonsense)

        # Case when some requested columns don't exist.
        # TO DO: DM-21976 Confirm this is the behavior we want
        # Quietly ignore nonsense columns
        columnDictSomeNonsense = {'dataset': 'meas', 'filter': 'G', 'column': ('coord_ra', 'hello')}
        dfGood = pd.DataFrame(df['meas']['G']['coord_ra'])
        self.assertTrue(parq.toDataFrame(columns=columnDictSomeNonsense).equals(dfGood))

if __name__ == "__main__":
    lsst.utils.tests.init()
    unittest.main()