Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of pipe_tasks. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21""" 

22Implementation of thin wrappers to pyarrow.ParquetFile. 

23""" 

24 

25import re 

26import json 

27from itertools import product 

28import pyarrow 

29import pyarrow.parquet 

30import numpy as np 

31import pandas as pd 

32 

33 

34class ParquetTable(object): 

35 """Thin wrapper to pyarrow's ParquetFile object 

36 

37 Call `toDataFrame` method to get a `pandas.DataFrame` object, 

38 optionally passing specific columns. 

39 

40 The main purpose of having this wrapper rather than directly 

41 using `pyarrow.ParquetFile` is to make it nicer to load 

42 selected subsets of columns, especially from dataframes with multi-level 

43 column indices. 

44 

45 Instantiated with either a path to a parquet file or a dataFrame 

46 

47 Parameters 

48 ---------- 

49 filename : str, optional 

50 Path to Parquet file. 

51 dataFrame : dataFrame, optional 

52 """ 

53 

54 def __init__(self, filename=None, dataFrame=None): 

55 if filename is not None: 

56 self._pf = pyarrow.parquet.ParquetFile(filename) 

57 self._df = None 

58 self._pandasMd = None 

59 elif dataFrame is not None: 

60 self._df = dataFrame 

61 self._pf = None 

62 else: 

63 raise ValueError('Either filename or dataFrame must be passed.') 

64 

65 self._columns = None 

66 self._columnIndex = None 

67 

68 def write(self, filename): 

69 """Write pandas dataframe to parquet 

70 

71 Parameters 

72 ---------- 

73 filename : str 

74 Path to which to write. 

75 """ 

76 if self._df is None: 

77 raise ValueError('df property must be defined to write.') 

78 table = pyarrow.Table.from_pandas(self._df) 

79 pyarrow.parquet.write_table(table, filename, compression='none') 

80 

81 @property 

82 def pandasMd(self): 

83 if self._pf is None: 

84 raise AttributeError("This property is only accessible if ._pf is set.") 

85 if self._pandasMd is None: 

86 self._pandasMd = json.loads(self._pf.metadata.metadata[b'pandas']) 

87 return self._pandasMd 

88 

89 @property 

90 def columnIndex(self): 

91 """Columns as a pandas Index 

92 """ 

93 if self._columnIndex is None: 

94 self._columnIndex = self._getColumnIndex() 

95 return self._columnIndex 

96 

97 def _getColumnIndex(self): 

98 if self._df is not None: 

99 return self._df.columns 

100 else: 

101 return pd.Index(self.columns) 

102 

103 @property 

104 def columns(self): 

105 """List of column names (or column index if df is set) 

106 

107 This may either be a list of column names, or a 

108 pandas.Index object describing the column index, depending 

109 on whether the ParquetTable object is wrapping a ParquetFile 

110 or a DataFrame. 

111 """ 

112 if self._columns is None: 

113 self._columns = self._getColumns() 

114 return self._columns 

115 

116 def _getColumns(self): 

117 if self._df is not None: 

118 return self._sanitizeColumns(self._df.columns) 

119 else: 

120 return self._pf.metadata.schema.names 

121 

122 def _sanitizeColumns(self, columns): 

123 return [c for c in columns if c in self.columnIndex] 

124 

125 def toDataFrame(self, columns=None): 

126 """Get table (or specified columns) as a pandas DataFrame 

127 

128 Parameters 

129 ---------- 

130 columns : list, optional 

131 Desired columns. If `None`, then all columns will be 

132 returned. 

133 """ 

134 if self._pf is None: 

135 if columns is None: 

136 return self._df 

137 else: 

138 return self._df[columns] 

139 

140 if columns is None: 

141 return self._pf.read().to_pandas() 

142 

143 df = self._pf.read(columns=columns, use_pandas_metadata=True).to_pandas() 

144 return df 

145 

146 

147class MultilevelParquetTable(ParquetTable): 

148 """Wrapper to access dataframe with multi-level column index from Parquet 

149 

150 This subclass of `ParquetTable` to handle the multi-level is necessary 

151 because there is not a convenient way to request specific table subsets 

152 by level via Parquet through pyarrow, as there is with a `pandas.DataFrame`. 

153 

154 Additionally, pyarrow stores multilevel index information in a very strange 

155 way. Pandas stores it as a tuple, so that one can access a single column 

156 from a pandas dataframe as `df[('ref', 'HSC-G', 'coord_ra')]`. However, for 

157 some reason pyarrow saves these indices as "stringified" tuples, such that 

158 in order to read thissame column from a table written to Parquet, you would 

159 have to do the following: 

160 

161 pf = pyarrow.ParquetFile(filename) 

162 df = pf.read(columns=["('ref', 'HSC-G', 'coord_ra')"]) 

163 

164 See also https://github.com/apache/arrow/issues/1771, where we've raised 

165 this issue. 

166 

167 As multilevel-indexed dataframes can be very useful to store data like 

168 multiple filters' worth of data in the same table, this case deserves a 

169 wrapper to enable easier access; 

170 that's what this object is for. For example, 

171 

172 parq = MultilevelParquetTable(filename) 

173 columnDict = {'dataset':'meas', 

174 'filter':'HSC-G', 

175 'column':['coord_ra', 'coord_dec']} 

176 df = parq.toDataFrame(columns=columnDict) 

177 

178 will return just the coordinate columns; the equivalent of calling 

179 `df['meas']['HSC-G'][['coord_ra', 'coord_dec']]` on the total dataframe, 

180 but without having to load the whole frame into memory---this reads just 

181 those columns from disk. You can also request a sub-table; e.g., 

182 

183 parq = MultilevelParquetTable(filename) 

184 columnDict = {'dataset':'meas', 

185 'filter':'HSC-G'} 

186 df = parq.toDataFrame(columns=columnDict) 

187 

188 and this will be the equivalent of `df['meas']['HSC-G']` on the total dataframe. 

189 

190 Parameters 

191 ---------- 

192 filename : str, optional 

193 Path to Parquet file. 

194 dataFrame : dataFrame, optional 

195 """ 

196 def __init__(self, *args, **kwargs): 

197 super(MultilevelParquetTable, self).__init__(*args, **kwargs) 

198 

199 self._columnLevelNames = None 

200 

201 @property 

202 def columnLevelNames(self): 

203 if self._columnLevelNames is None: 

204 self._columnLevelNames = {level: list(np.unique(np.array(self.columns)[:, i])) 

205 for i, level in enumerate(self.columnLevels)} 

206 return self._columnLevelNames 

207 

208 @property 

209 def columnLevels(self): 

210 """Names of levels in column index 

211 """ 

212 return self.columnIndex.names 

213 

214 def _getColumnIndex(self): 

215 if self._df is not None: 

216 return super()._getColumnIndex() 

217 else: 

218 levelNames = [f['name'] for f in self.pandasMd['column_indexes']] 

219 return pd.MultiIndex.from_tuples(self.columns, names=levelNames) 

220 

221 def _getColumns(self): 

222 if self._df is not None: 

223 return super()._getColumns() 

224 else: 

225 columns = self._pf.metadata.schema.names 

226 n = len(self.pandasMd['column_indexes']) 

227 pattern = re.compile(', '.join(["'(.*)'"] * n)) 

228 matches = [re.search(pattern, c) for c in columns] 

229 return [m.groups() for m in matches if m is not None] 

230 

231 def toDataFrame(self, columns=None, droplevels=True): 

232 """Get table (or specified columns) as a pandas DataFrame 

233 

234 To get specific columns in specified sub-levels: 

235 

236 parq = MultilevelParquetTable(filename) 

237 columnDict = {'dataset':'meas', 

238 'filter':'HSC-G', 

239 'column':['coord_ra', 'coord_dec']} 

240 df = parq.toDataFrame(columns=columnDict) 

241 

242 Or, to get an entire subtable, leave out one level name: 

243 

244 parq = MultilevelParquetTable(filename) 

245 columnDict = {'dataset':'meas', 

246 'filter':'HSC-G'} 

247 df = parq.toDataFrame(columns=columnDict) 

248 

249 Parameters 

250 ---------- 

251 columns : list or dict, optional 

252 Desired columns. If `None`, then all columns will be 

253 returned. If a list, then the names of the columns must 

254 be *exactly* as stored by pyarrow; that is, stringified tuples. 

255 If a dictionary, then the entries of the dictionary must 

256 correspond to the level names of the column multi-index 

257 (that is, the `columnLevels` attribute). Not every level 

258 must be passed; if any level is left out, then all entries 

259 in that level will be implicitly included. 

260 droplevels : bool 

261 If True drop levels of column index that have just one entry 

262 

263 """ 

264 if columns is None: 

265 if self._pf is None: 

266 return self._df 

267 else: 

268 return self._pf.read().to_pandas() 

269 

270 if isinstance(columns, dict): 

271 columns = self._colsFromDict(columns) 

272 

273 if self._pf is None: 

274 try: 

275 df = self._df[columns] 

276 except (AttributeError, KeyError): 

277 newColumns = [c for c in columns if c in self.columnIndex] 

278 if not newColumns: 

279 raise ValueError('None of the requested columns ({}) are available!'.format(columns)) 

280 df = self._df[columns] 

281 else: 

282 pfColumns = self._stringify(columns) 

283 try: 

284 df = self._pf.read(columns=pfColumns, use_pandas_metadata=True).to_pandas() 

285 except (AttributeError, KeyError): 

286 newColumns = [c for c in columns if c in self.columnIndex] 

287 if not newColumns: 

288 raise ValueError('None of the requested columns ({}) are available!'.format(columns)) 

289 pfColumns = self._stringify(newColumns) 

290 df = self._pf.read(columns=pfColumns, use_pandas_metadata=True).to_pandas() 

291 

292 if droplevels: 

293 # Drop levels of column index that have just one entry 

294 levelsToDrop = [n for l, n in zip(df.columns.levels, df.columns.names) 

295 if len(l) == 1] 

296 

297 # Prevent error when trying to drop *all* columns 

298 if len(levelsToDrop) == len(df.columns.names): 

299 levelsToDrop.remove(df.columns.names[-1]) 

300 

301 df.columns = df.columns.droplevel(levelsToDrop) 

302 

303 return df 

304 

305 def _colsFromDict(self, colDict): 

306 new_colDict = {} 

307 for i, l in enumerate(self.columnLevels): 

308 if l in colDict: 

309 if isinstance(colDict[l], str): 

310 new_colDict[l] = [colDict[l]] 

311 else: 

312 new_colDict[l] = colDict[l] 

313 else: 

314 new_colDict[l] = self.columnIndex.levels[i] 

315 

316 levelCols = [new_colDict[l] for l in self.columnLevels] 

317 cols = product(*levelCols) 

318 return list(cols) 

319 

320 def _stringify(self, cols): 

321 return [str(c) for c in cols]