Coverage for tests/test_parquet.py: 23% (104 statements)
coverage.py v6.4.1, created at 2022-06-29 03:24 -0700

# This file is part of pipe_tasks.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import unittest
import copy
import functools
import pandas as pd
from pandas.testing import assert_frame_equal

import lsst.utils.tests

import pyarrow as pa
import pyarrow.parquet as pq
from lsst.pipe.tasks.parquetTable import ParquetTable, MultilevelParquetTable
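
# A minimal sketch of the interface exercised below (this reflects only the
# calls these tests make, not necessarily the full ParquetTable API):
#
#     parq = ParquetTable(filename)                  # file-backed table
#     parq = ParquetTable(dataFrame=df)              # DataFrame-backed table
#     df = parq.toDataFrame(columns=['coord_ra'])    # read selected columns
#
# MultilevelParquetTable additionally accepts a dict keyed by column level,
# e.g. toDataFrame(columns={'dataset': 'meas', 'filter': 'G'}).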


def setup_module(module):
    lsst.utils.tests.init()


class ParquetTableTestCase(unittest.TestCase):
    """Test case for ParquetTable."""

    def simulateDF(self):
        """Create a simple test DataFrame."""
        df = pd.DataFrame({
            "coord_ra": [3.77654137, 3.77643059, 3.77621148, 3.77611944, 3.77610396],
            "coord_dec": [0.01127624, 0.01127787, 0.01127543, 0.01127543, 0.01127543],
            "slot_Centroid_flag": [True, True, True, True, True],
            "slot_Centroid_x": [16208., 16344., 16613., 16726., 16745.],
            "slot_Centroid_y": [15905., 15907., 15904., 15904., 15904.],
            "slot_PsfFlux_apCorr": [0.98636465, 0.98437287, 0.97212515, 0.97179828, 0.97182371],
            "slot_PsfFlux_apCorrSigma": [0., 0., 0., 0., 0.],
            "slot_PsfFlux_flag": [True, True, True, True, True],
            "slot_PsfFlux_instFlux": [0.28106412, 1.98260751, 0.08900771, 1.11375753, 1.3835924],
            "slot_PsfFlux_instFluxSigma": [0.22967081, 0.25409701, 0.2120654, 0.23031162, 0.24262261],
            "calib_psfUsed": [False, False, False, False, False],
            "detect_isPatchInner": [False, False, False, False, False],
            "detect_isPrimary": [False, False, False, False, False],
            "detect_isTractInner": [True, True, True, True, True]})
        return df

    def setUp(self):
        self.df = self.simulateDF()
        with lsst.utils.tests.getTempFilePath('*.parq') as filename:
            table = pa.Table.from_pandas(self.df)
            pq.write_table(table, filename)
            self.parq, self.dfParq = self.getParq(filename, self.df)

    def tearDown(self):
        del self.df
        del self.parq
        del self.dfParq

    def getParq(self, filename, df):
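        """Return two equivalent tables, one file-backed and one DataFrame-backed."""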

        return ParquetTable(filename), ParquetTable(dataFrame=df)

    def testRoundTrip(self):
        self.assertTrue(self.parq.toDataFrame().equals(self.df))

    def testColumns(self):
        columns = ['coord_ra', 'coord_dec']
        self.assertTrue(self.parq.toDataFrame(columns=columns).equals(self.df[columns]))

        # TO DO: DM-21976 Confirm this is the behavior we want
        # Quietly ignore nonsense columns
        self.assertTrue(self.parq.toDataFrame(columns=columns + ['hello']).equals(self.df[columns]))


class MultilevelParquetTableTestCase(ParquetTableTestCase):
    """Test case for MultilevelParquetTable."""

    def simulateDF(self):
        self.datasets = ['meas', 'ref']
        self.filters = ['G', 'R']
        self.columns = ['coord_ra', 'coord_dec']
        simpleDF = super().simulateDF()
        dfFilterDSCombos = []
        for ds in self.datasets:
            for filterName in self.filters:
                df = copy.copy(simpleDF)
                # reindex returns a new frame, so assign the result back
                df = df.reindex(sorted(df.columns), axis=1)
                df['dataset'] = ds
                df['filter'] = filterName
                df.columns = pd.MultiIndex.from_tuples([(ds, filterName, c) for c in df.columns],
                                                       names=('dataset', 'filter', 'column'))
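                # df now carries three-level column keys such as
                # (ds, filterName, 'coord_ra'); joining every dataset/filter
                # combination below yields columns like ('meas', 'G', 'coord_ra')
                # and ('ref', 'R', 'coord_dec').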

                dfFilterDSCombos.append(df)

        return functools.reduce(lambda d1, d2: d1.join(d2), dfFilterDSCombos)

    def getParq(self, filename, df):
        fromFile = MultilevelParquetTable(filename)
        fromDf = MultilevelParquetTable(dataFrame=df)
        return fromFile, fromDf

    def testProperties(self):
        self.assertTrue(all(x == y for x, y in zip(self.parq.columnLevels, self.df.columns.names)))
        self.assertEqual(len(self.parq.columns), len(self.df.columns))

        self.assertTrue(all(x == y for x, y in zip(self.dfParq.columnLevels, self.df.columns.names)))
        self.assertEqual(len(self.dfParq.columns), len(self.df.columns))

    def testColumns(self):
        df = self.df
        parq = self.parq

        # Case A: each level has multiple values
        datasets_A = self.datasets
        filters_A = self.filters
        columns_A = self.columns
        columnDict_A = {'dataset': datasets_A,
                        'filter': filters_A,
                        'column': columns_A}
        colTuples_A = [(self.datasets[0], self.filters[0], self.columns[0]),
                       (self.datasets[0], self.filters[0], self.columns[1]),
                       (self.datasets[0], self.filters[1], self.columns[0]),
                       (self.datasets[0], self.filters[1], self.columns[1]),
                       (self.datasets[1], self.filters[0], self.columns[0]),
                       (self.datasets[1], self.filters[0], self.columns[1]),
                       (self.datasets[1], self.filters[1], self.columns[0]),
                       (self.datasets[1], self.filters[1], self.columns[1])]
        df_A = df[colTuples_A]
        assert_frame_equal(parq.toDataFrame(columns=columnDict_A), df_A)

        # Case A1: add a bogus column and test that it gets ignored
        datasets_A1 = self.datasets
        filters_A1 = self.filters
        columns_A1 = self.columns + ['garbage']
        columnDict_A1 = {'dataset': datasets_A1,
                         'filter': filters_A1,
                         'column': columns_A1}
        # the expected selection is identical to Case A
        colTuples_A1 = colTuples_A
        df_A1 = df[colTuples_A1]
        assert_frame_equal(parq.toDataFrame(columns=columnDict_A1), df_A1)

        # Case B: one level has only a single value
        datasets_B = self.datasets[0]
        filters_B = self.filters
        columns_B = self.columns
        columnDict_B = {'dataset': datasets_B,
                        'filter': filters_B,
                        'column': columns_B}
        colTuples_B = [(self.datasets[0], self.filters[0], self.columns[0]),
                       (self.datasets[0], self.filters[0], self.columns[1]),
                       (self.datasets[0], self.filters[1], self.columns[0]),
                       (self.datasets[0], self.filters[1], self.columns[1])]
        df_B = df[colTuples_B]
        df_B.columns = df_B.columns.droplevel('dataset')
        assert_frame_equal(parq.toDataFrame(columns=columnDict_B), df_B)
        assert_frame_equal(df_B, parq.toDataFrame(columns=colTuples_B))

        # When explicit columns are not provided, the reference frame must
        # first have its column index sorted: parq.toDataFrame() appears to
        # return columns in sorted order in these cases, though exactly how
        # or why that happens has not been pinned down.
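        # For example, after sort_index(axis=1) the single-level columns of
        # df['meas']['G'] begin with 'calib_psfUsed', 'coord_dec', 'coord_ra'.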

        # Case C: two levels have a single value; the third is not provided
        datasets_C = self.datasets[0]
        filters_C = self.filters[0]
        columnDict_C = {'dataset': datasets_C,
                        'filter': filters_C}
        df_C = df[datasets_C][filters_C].sort_index(axis=1)

        self.assertTrue(parq.toDataFrame(columns=columnDict_C).equals(df_C))

        # Case D: only one level (the first) is provided
        dataset_D = self.datasets[0]
        columnDict_D = {'dataset': dataset_D}
        df_D = df[dataset_D].sort_index(axis=1)
        self.assertTrue(parq.toDataFrame(columns=columnDict_D).equals(df_D))

        # Case E: only one level (the second) is provided
        filters_E = self.filters[1]
        columnDict_E = {'filter': filters_E}
        # select the second level of the column MultiIndex with .xs()
        df_E = df.xs(filters_E, level=1, axis=1).sort_index(axis=1)
        self.assertTrue(parq.toDataFrame(columns=columnDict_E).equals(df_E))

        # Case where none of the requested columns exist
        columnDictNonsense = {'dataset': 'meas', 'filter': 'G', 'column': ('hello',)}
        self.assertRaises(ValueError, parq.toDataFrame, columns=columnDictNonsense)

        # Case where only some of the requested columns exist.
        # TO DO: DM-21976 Confirm this is the behavior we want
        # Quietly ignore nonsense columns
        columnDictSomeNonsense = {'dataset': 'meas', 'filter': 'G', 'column': ('coord_ra', 'hello')}
        dfGood = pd.DataFrame(df['meas']['G']['coord_ra'])
        self.assertTrue(parq.toDataFrame(columns=columnDictSomeNonsense).equals(dfGood))


if __name__ == "__main__":
    lsst.utils.tests.init()
    unittest.main()