# This file is part of pipe_tasks.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import unittest
import copy
import functools

import pandas as pd
from pandas.testing import assert_frame_equal

import lsst.utils.tests

# TODO: Remove skipUnless and this try block DM-22256
try:
    import pyarrow as pa
    import pyarrow.parquet as pq
    from lsst.pipe.tasks.parquetTable import ParquetTable, MultilevelParquetTable
    havePyArrow = True
except ImportError:
    havePyArrow = False


def setup_module(module):
    lsst.utils.tests.init()

43 

44@unittest.skipUnless(havePyArrow, "Requires pyarrow") 

45class ParquetTableTestCase(unittest.TestCase): 

46 """Test case for ParquetTable 

47 """ 

48 

49 def simulateDF(self): 

50 """Create a simple test DataFrame 

51 """ 

52 df = pd.DataFrame({ 

53 "coord_ra": [3.77654137, 3.77643059, 3.77621148, 3.77611944, 3.77610396], 

54 "coord_dec": [0.01127624, 0.01127787, 0.01127543, 0.01127543, 0.01127543], 

55 "slot_Centroid_flag": [True, True, True, True, True], 

56 "slot_Centroid_x": [16208., 16344., 16613., 16726., 16745.], 

57 "slot_Centroid_y": [15905., 15907., 15904., 15904., 15904.], 

58 "slot_PsfFlux_apCorr": [0.98636465, 0.98437287, 0.97212515, 0.97179828, 0.97182371], 

59 "slot_PsfFlux_apCorrSigma": [0., 0., 0., 0., 0.], 

60 "slot_PsfFlux_flag": [True, True, True, True, True], 

61 "slot_PsfFlux_instFlux": [0.28106412, 1.98260751, 0.08900771, 1.11375753, 1.3835924], 

62 "slot_PsfFlux_instFluxSigma": [0.22967081, 0.25409701, 0.2120654, 0.23031162, 0.24262261], 

63 "calib_psfUsed": [False, False, False, False, False], 

64 "detect_isPatchInner": [False, False, False, False, False], 

65 "detect_isPrimary": [False, False, False, False, False], 

66 "detect_isTractInner": [True, True, True, True, True]}) 

67 return df 

68 

69 def setUp(self): 

70 self.df = self.simulateDF() 

71 with lsst.utils.tests.getTempFilePath('*.parq') as filename: 

72 table = pa.Table.from_pandas(self.df) 

73 pq.write_table(table, filename, compression='none') 

74 self.parq, self.dfParq = self.getParq(filename, self.df) 
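        # The fixture round-trips the frame through an on-disk Parquet file
        # (DataFrame -> pyarrow Table -> file -> ParquetTable) and also keeps
        # a DataFrame-backed ParquetTable, so both construction paths are
        # exercised by every test.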

    def tearDown(self):
        del self.df
        del self.parq
        del self.dfParq

    def getParq(self, filename, df):
        return ParquetTable(filename), ParquetTable(dataFrame=df)

    def testRoundTrip(self):
        self.assertTrue(self.parq.toDataFrame().equals(self.df))

    def testColumns(self):
        columns = ['coord_ra', 'coord_dec']
        self.assertTrue(self.parq.toDataFrame(columns=columns).equals(self.df[columns]))

        # TODO: DM-21976 Confirm this is the behavior we want
        # Nonsense columns are quietly ignored
        self.assertTrue(self.parq.toDataFrame(columns=columns + ['hello']).equals(self.df[columns]))
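        # Note: the flat ParquetTable selects columns by plain name; the
        # multilevel subclass below also accepts a dict keyed by level name
        # or a list of full column tuples.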



@unittest.skipUnless(havePyArrow, "Requires pyarrow")
class MultilevelParquetTableTestCase(ParquetTableTestCase):
    """Test case for MultilevelParquetTable
    """

    def simulateDF(self):
        """Create a test DataFrame with a three-level column index
        """
        self.datasets = ['meas', 'ref']
        self.filters = ['G', 'R']
        self.columns = ['coord_ra', 'coord_dec']
        simpleDF = super().simulateDF()
        dfFilterDSCombos = []
        for ds in self.datasets:
            for filterName in self.filters:
                df = copy.copy(simpleDF)
                df = df.reindex(sorted(df.columns), axis=1)
                df['dataset'] = ds
                df['filter'] = filterName
                df.columns = pd.MultiIndex.from_tuples([(ds, filterName, c) for c in df.columns],
                                                       names=('dataset', 'filter', 'column'))
                dfFilterDSCombos.append(df)

        return functools.reduce(lambda d1, d2: d1.join(d2), dfFilterDSCombos)
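        # The joined frame carries a three-level column MultiIndex named
        # ('dataset', 'filter', 'column'), e.g. ('meas', 'G', 'coord_ra'),
        # ('meas', 'R', 'coord_dec'), ..., ('ref', 'R', 'slot_PsfFlux_flag').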

    def getParq(self, filename, df):
        fromFile = MultilevelParquetTable(filename)
        fromDf = MultilevelParquetTable(dataFrame=df)
        return fromFile, fromDf

    def testProperties(self):
        self.assertTrue(all(x == y for x, y in zip(self.parq.columnLevels, self.df.columns.names)))
        self.assertEqual(len(self.parq.columns), len(self.df.columns))

        self.assertTrue(all(x == y for x, y in zip(self.dfParq.columnLevels, self.df.columns.names)))
        self.assertEqual(len(self.dfParq.columns), len(self.df.columns))
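        # columnLevels mirrors the names of the underlying column MultiIndex
        # ('dataset', 'filter', 'column'), whether the table was constructed
        # from a file or from an in-memory DataFrame.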


    def testColumns(self):
        df = self.df
        parq = self.parq

        # Case A: every level has multiple values
        datasets_A = self.datasets
        filters_A = self.filters
        columns_A = self.columns
        columnDict_A = {'dataset': datasets_A,
                        'filter': filters_A,
                        'column': columns_A}
        colTuples_A = [(self.datasets[0], self.filters[0], self.columns[0]),
                       (self.datasets[0], self.filters[0], self.columns[1]),
                       (self.datasets[0], self.filters[1], self.columns[0]),
                       (self.datasets[0], self.filters[1], self.columns[1]),
                       (self.datasets[1], self.filters[0], self.columns[0]),
                       (self.datasets[1], self.filters[0], self.columns[1]),
                       (self.datasets[1], self.filters[1], self.columns[0]),
                       (self.datasets[1], self.filters[1], self.columns[1])]
        df_A = df[colTuples_A]
        assert_frame_equal(parq.toDataFrame(columns=columnDict_A), df_A)

        # Case A1: add a bogus column and check that it is quietly ignored,
        # so the selection is identical to Case A
        datasets_A1 = self.datasets
        filters_A1 = self.filters
        columns_A1 = self.columns + ['garbage']
        columnDict_A1 = {'dataset': datasets_A1,
                         'filter': filters_A1,
                         'column': columns_A1}
        colTuples_A1 = colTuples_A
        df_A1 = df[colTuples_A1]
        assert_frame_equal(parq.toDataFrame(columns=columnDict_A1), df_A1)

        # Case B: one level has only a single value
        datasets_B = self.datasets[0]
        filters_B = self.filters
        columns_B = self.columns
        columnDict_B = {'dataset': datasets_B,
                        'filter': filters_B,
                        'column': columns_B}
        colTuples_B = [(self.datasets[0], self.filters[0], self.columns[0]),
                       (self.datasets[0], self.filters[0], self.columns[1]),
                       (self.datasets[0], self.filters[1], self.columns[0]),
                       (self.datasets[0], self.filters[1], self.columns[1])]
        df_B = df[colTuples_B]
        df_B.columns = df_B.columns.droplevel('dataset')
        assert_frame_equal(parq.toDataFrame(columns=columnDict_B), df_B)
        assert_frame_equal(df_B, parq.toDataFrame(columns=colTuples_B))

        # When an explicit column list is not provided, toDataFrame()
        # returns the column index in sorted order, so the expected frames
        # below are sorted with sort_index(axis=1) before comparison.
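        # For example, df['meas'] keeps the column order built by
        # simulateDF(), while parq.toDataFrame(columns={'dataset': 'meas'})
        # returns those columns sorted (see Case D below).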

        # Case C: two levels have a single value; the third is not provided
        datasets_C = self.datasets[0]
        filters_C = self.filters[0]
        columnDict_C = {'dataset': datasets_C,
                        'filter': filters_C}
        df_C = df[datasets_C][filters_C].sort_index(axis=1)
        self.assertTrue(parq.toDataFrame(columns=columnDict_C).equals(df_C))

        # Case D: only one level (the first) is provided
        dataset_D = self.datasets[0]
        columnDict_D = {'dataset': dataset_D}
        df_D = df[dataset_D].sort_index(axis=1)
        self.assertTrue(parq.toDataFrame(columns=columnDict_D).equals(df_D))

        # Case E: only one level (the second) is provided
        filters_E = self.filters[1]
        columnDict_E = {'filter': filters_E}
        # Select the second level of the column MultiIndex with .xs()
        df_E = df.xs(filters_E, level=1, axis=1).sort_index(axis=1)
        self.assertTrue(parq.toDataFrame(columns=columnDict_E).equals(df_E))

        # Case where none of the requested columns exists
        columnDictNonsense = {'dataset': 'meas', 'filter': 'G', 'column': 'hello'}
        self.assertRaises(ValueError, parq.toDataFrame, columns=columnDictNonsense)

        # Case where only some of the requested columns exist.
        # TODO: DM-21976 Confirm this is the behavior we want
        # Nonsense columns are quietly ignored
        columnDictSomeNonsense = {'dataset': 'meas', 'filter': 'G', 'column': ('coord_ra', 'hello')}
        dfGood = pd.DataFrame(df['meas']['G']['coord_ra'])
        self.assertTrue(parq.toDataFrame(columns=columnDictSomeNonsense).equals(dfGood))
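# A minimal usage sketch, kept as comments (file names are hypothetical;
# only the ParquetTable/MultilevelParquetTable calls exercised above are
# assumed):
#
#     parq = ParquetTable("catalog.parq")
#     df = parq.toDataFrame(columns=["coord_ra", "coord_dec"])
#
#     mparq = MultilevelParquetTable("multilevel_catalog.parq")
#     dfG = mparq.toDataFrame(columns={"dataset": "meas", "filter": "G"})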

if __name__ == "__main__":
    lsst.utils.tests.init()
    unittest.main()