Coverage for python/lsst/pipe/tasks/parquetTable.py: 17%

130 statements  


# This file is part of pipe_tasks.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Implementation of thin wrappers to pyarrow.ParquetFile.
"""

import re
import json
from itertools import product
import pyarrow
import pyarrow.parquet
import numpy as np
import pandas as pd


class ParquetTable(object):
    """Thin wrapper to pyarrow's ParquetFile object

    Call `toDataFrame` method to get a `pandas.DataFrame` object,
    optionally passing specific columns.

    The main purpose of having this wrapper rather than directly
    using `pyarrow.ParquetFile` is to make it nicer to load
    selected subsets of columns, especially from dataframes with multi-level
    column indices.

    Instantiated with either a path to a parquet file or a dataFrame.

    Parameters
    ----------
    filename : str, optional
        Path to Parquet file.
    dataFrame : dataFrame, optional
        DataFrame to wrap.
    """

    def __init__(self, filename=None, dataFrame=None):
        self.filename = filename
        if filename is not None:
            self._pf = pyarrow.parquet.ParquetFile(filename)
            self._df = None
            self._pandasMd = None
        elif dataFrame is not None:
            self._df = dataFrame
            self._pf = None
        else:
            raise ValueError("Either filename or dataFrame must be passed.")

        self._columns = None
        self._columnIndex = None

    def write(self, filename):
        """Write pandas dataframe to parquet

        Parameters
        ----------
        filename : str
            Path to which to write.
        """
        if self._df is None:
            raise ValueError("df property must be defined to write.")
        table = pyarrow.Table.from_pandas(self._df)
        pyarrow.parquet.write_table(table, filename)

    @property
    def pandasMd(self):
        if self._pf is None:
            raise AttributeError("This property is only accessible if ._pf is set.")
        if self._pandasMd is None:
            self._pandasMd = json.loads(self._pf.metadata.metadata[b"pandas"])
        return self._pandasMd

    @property
    def columnIndex(self):
        """Columns as a pandas Index
        """
        if self._columnIndex is None:
            self._columnIndex = self._getColumnIndex()
        return self._columnIndex

    def _getColumnIndex(self):
        if self._df is not None:
            return self._df.columns
        else:
            return pd.Index(self.columns)

    @property
    def columns(self):
        """List of column names (or column index if df is set)

        This may either be a list of column names, or a
        pandas.Index object describing the column index, depending
        on whether the ParquetTable object is wrapping a ParquetFile
        or a DataFrame.
        """
        if self._columns is None:
            self._columns = self._getColumns()
        return self._columns

    def _getColumns(self):
        if self._df is not None:
            return self._sanitizeColumns(self._df.columns)
        else:
            return self._pf.metadata.schema.names

    def _sanitizeColumns(self, columns):
        return [c for c in columns if c in self.columnIndex]

    def toDataFrame(self, columns=None):
        """Get table (or specified columns) as a pandas DataFrame

        Parameters
        ----------
        columns : list, optional
            Desired columns. If `None`, then all columns will be
            returned.
        """
        if self._pf is None:
            if columns is None:
                return self._df
            else:
                return self._df[columns]

        if columns is None:
            return self._pf.read().to_pandas()

        df = self._pf.read(columns=columns, use_pandas_metadata=True).to_pandas()
        return df
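

# Illustrative sketch (not part of the original module): a minimal round trip with
# `ParquetTable`, wrapping an in-memory DataFrame, writing it to Parquet, and
# re-opening it by filename to read back a subset of columns. The helper name
# `_exampleParquetTableUsage` and the temporary file name are hypothetical.
def _exampleParquetTableUsage():
    import os
    import tempfile

    df = pd.DataFrame({"coord_ra": [1.0, 2.0], "coord_dec": [3.0, 4.0]})

    with tempfile.TemporaryDirectory() as tmpDir:
        path = os.path.join(tmpDir, "example.parq")

        # Wrap an existing DataFrame and persist it to Parquet.
        ParquetTable(dataFrame=df).write(path)

        # Re-open by filename; toDataFrame reads only the requested column.
        parq = ParquetTable(filename=path)
        return parq.toDataFrame(columns=["coord_ra"])
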

class MultilevelParquetTable(ParquetTable):
    """Wrapper to access dataframe with multi-level column index from Parquet

    This subclass of `ParquetTable` is necessary to handle the multi-level
    column index, because there is not a convenient way to request specific
    table subsets by level via Parquet through pyarrow, as there is with a
    `pandas.DataFrame`.

    Additionally, pyarrow stores multilevel index information in a very strange
    way. Pandas stores it as a tuple, so that one can access a single column
    from a pandas dataframe as `df[('ref', 'HSC-G', 'coord_ra')]`. However, for
    some reason pyarrow saves these indices as "stringified" tuples, such that
    in order to read this same column from a table written to Parquet, you
    would have to do the following:

        pf = pyarrow.ParquetFile(filename)
        df = pf.read(columns=["('ref', 'HSC-G', 'coord_ra')"])

    See also https://github.com/apache/arrow/issues/1771, where we've raised
    this issue.

    As multilevel-indexed dataframes can be very useful for storing data such
    as multiple filters' worth of data in the same table, this case deserves a
    wrapper to enable easier access; that's what this object is for. For
    example,

        parq = MultilevelParquetTable(filename)
        columnDict = {'dataset': 'meas',
                      'filter': 'HSC-G',
                      'column': ['coord_ra', 'coord_dec']}
        df = parq.toDataFrame(columns=columnDict)

    will return just the coordinate columns; the equivalent of calling
    `df['meas']['HSC-G'][['coord_ra', 'coord_dec']]` on the total dataframe,
    but without having to load the whole frame into memory---this reads just
    those columns from disk. You can also request a sub-table; e.g.,

        parq = MultilevelParquetTable(filename)
        columnDict = {'dataset': 'meas',
                      'filter': 'HSC-G'}
        df = parq.toDataFrame(columns=columnDict)

    and this will be the equivalent of `df['meas']['HSC-G']` on the total
    dataframe.

    Parameters
    ----------
    filename : str, optional
        Path to Parquet file.
    dataFrame : dataFrame, optional
        DataFrame to wrap.
    """

    def __init__(self, *args, **kwargs):
        super(MultilevelParquetTable, self).__init__(*args, **kwargs)

        self._columnLevelNames = None

    @property
    def columnLevelNames(self):
        if self._columnLevelNames is None:
            self._columnLevelNames = {
                level: list(np.unique(np.array(self.columns)[:, i]))
                for i, level in enumerate(self.columnLevels)
            }
        return self._columnLevelNames

    @property
    def columnLevels(self):
        """Names of levels in column index
        """
        return self.columnIndex.names

    def _getColumnIndex(self):
        if self._df is not None:
            return super()._getColumnIndex()
        else:
            levelNames = [f["name"] for f in self.pandasMd["column_indexes"]]
            return pd.MultiIndex.from_tuples(self.columns, names=levelNames)

    def _getColumns(self):
        if self._df is not None:
            return super()._getColumns()
        else:
            columns = self._pf.metadata.schema.names
            n = len(self.pandasMd["column_indexes"])
            pattern = re.compile(", ".join(["'(.*)'"] * n))
            matches = [re.search(pattern, c) for c in columns]
            return [m.groups() for m in matches if m is not None]

    def toDataFrame(self, columns=None, droplevels=True):
        """Get table (or specified columns) as a pandas DataFrame

        To get specific columns in specified sub-levels:

            parq = MultilevelParquetTable(filename)
            columnDict = {'dataset': 'meas',
                          'filter': 'HSC-G',
                          'column': ['coord_ra', 'coord_dec']}
            df = parq.toDataFrame(columns=columnDict)

        Or, to get an entire subtable, leave out one level name:

            parq = MultilevelParquetTable(filename)
            columnDict = {'dataset': 'meas',
                          'filter': 'HSC-G'}
            df = parq.toDataFrame(columns=columnDict)

        Parameters
        ----------
        columns : list or dict, optional
            Desired columns. If `None`, then all columns will be
            returned. If a list, then the names of the columns must
            be *exactly* as stored by pyarrow; that is, stringified tuples.
            If a dictionary, then the entries of the dictionary must
            correspond to the level names of the column multi-index
            (that is, the `columnLevels` attribute). Not every level
            must be passed; if any level is left out, then all entries
            in that level will be implicitly included.
        droplevels : bool
            If True, drop levels of column index that have just one entry.
        """
        if columns is None:
            if self._pf is None:
                return self._df
            else:
                return self._pf.read().to_pandas()

        if isinstance(columns, dict):
            columns = self._colsFromDict(columns)

        if self._pf is None:
            try:
                df = self._df[columns]
            except (AttributeError, KeyError):
                newColumns = [c for c in columns if c in self.columnIndex]
                if not newColumns:
                    raise ValueError("None of the requested columns ({}) are available!".format(columns))
                df = self._df[newColumns]
        else:
            pfColumns = self._stringify(columns)
            try:
                df = self._pf.read(columns=pfColumns, use_pandas_metadata=True).to_pandas()
            except (AttributeError, KeyError):
                newColumns = [c for c in columns if c in self.columnIndex]
                if not newColumns:
                    raise ValueError("None of the requested columns ({}) are available!".format(columns))
                pfColumns = self._stringify(newColumns)
                df = self._pf.read(columns=pfColumns, use_pandas_metadata=True).to_pandas()

        if droplevels:
            # Drop levels of column index that have just one entry
            levelsToDrop = [n for lev, n in zip(df.columns.levels, df.columns.names) if len(lev) == 1]

            # Prevent error when trying to drop *all* columns
            if len(levelsToDrop) == len(df.columns.names):
                levelsToDrop.remove(df.columns.names[-1])

            df.columns = df.columns.droplevel(levelsToDrop)

        return df

    def _colsFromDict(self, colDict):
        new_colDict = {}
        for i, lev in enumerate(self.columnLevels):
            if lev in colDict:
                if isinstance(colDict[lev], str):
                    new_colDict[lev] = [colDict[lev]]
                else:
                    new_colDict[lev] = colDict[lev]
            else:
                new_colDict[lev] = self.columnIndex.levels[i]

        levelCols = [new_colDict[lev] for lev in self.columnLevels]
        cols = product(*levelCols)
        return list(cols)

    def _stringify(self, cols):
        return [str(c) for c in cols]
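

# Illustrative sketch (not part of the original module) of the usage pattern the
# `MultilevelParquetTable` docstring describes: write a DataFrame with a
# multi-level column index, re-open it from disk, and select a sub-table with a
# column dictionary. The helper name `_exampleMultilevelUsage` and the level
# names/values ('meas', 'HSC-G', 'coord_ra', ...) are hypothetical stand-ins.
def _exampleMultilevelUsage():
    import os
    import tempfile

    # dataset x filter x column multi-level column index, as in the class docstring.
    columns = pd.MultiIndex.from_product(
        [["meas", "ref"], ["HSC-G", "HSC-R"], ["coord_ra", "coord_dec"]],
        names=["dataset", "filter", "column"],
    )
    df = pd.DataFrame(np.random.rand(5, len(columns)), columns=columns)

    with tempfile.TemporaryDirectory() as tmpDir:
        path = os.path.join(tmpDir, "multilevel.parq")

        # pyarrow stringifies the tuple column names on write;
        # MultilevelParquetTable undoes that when reading by filename.
        MultilevelParquetTable(dataFrame=df).write(path)

        parq = MultilevelParquetTable(filename=path)
        columnDict = {"dataset": "meas",
                      "filter": "HSC-G",
                      "column": ["coord_ra", "coord_dec"]}

        # Equivalent to df['meas']['HSC-G'][['coord_ra', 'coord_dec']] on the full
        # table, but only the requested columns are read from disk; levels with a
        # single entry are dropped because droplevels defaults to True.
        return parq.toDataFrame(columns=columnDict)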