
# This file is part of pipe_tasks.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Implementation of thin wrappers to pyarrow.ParquetFile.
"""

import re
import json
from itertools import product

import pyarrow
import pyarrow.parquet
import numpy as np
import pandas as pd

class ParquetTable(object):
    """Thin wrapper to pyarrow's ParquetFile object

    Call the `toDataFrame` method to get a `pandas.DataFrame` object,
    optionally passing specific columns.

    The main purpose of having this wrapper rather than directly
    using `pyarrow.ParquetFile` is to make it nicer to load
    selected subsets of columns, especially from dataframes with
    multi-level column indices.

    Instantiate with either a path to a Parquet file or a
    `pandas.DataFrame`.

    Parameters
    ----------
    filename : str, optional
        Path to Parquet file.
    dataFrame : dataFrame, optional
        In-memory dataframe to wrap instead of reading from file.
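
    For example (an illustrative sketch; `demo.parq` is a hypothetical
    path, not one used elsewhere in the pipeline):

        parq = ParquetTable("demo.parq")
        df = parq.toDataFrame(columns=['coord_ra', 'coord_dec'])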

""" 

 

def __init__(self, filename=None, dataFrame=None): 

if filename is not None: 

self._pf = pyarrow.parquet.ParquetFile(filename) 

self._df = None 

self._pandasMd = None 

elif dataFrame is not None: 

self._df = dataFrame 

self._pf = None 

else: 

raise ValueError('Either filename or dataFrame must be passed.') 

 

self._columns = None 

self._columnIndex = None 

 

    def write(self, filename):
        """Write pandas dataframe to parquet

        Parameters
        ----------
        filename : str
            Path to which to write.
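
        For example (illustrative only; `out.parq` is a hypothetical
        output path):

            parq = ParquetTable(dataFrame=df)
            parq.write('out.parq')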

""" 

if self._df is None: 

raise ValueError('df property must be defined to write.') 

table = pyarrow.Table.from_pandas(self._df) 

pyarrow.parquet.write_table(table, filename, compression='none') 

 

@property 

def pandasMd(self): 

if self._pf is None: 

raise AttributeError("This property is only accessible if ._pf is set.") 

if self._pandasMd is None: 

self._pandasMd = json.loads(self._pf.metadata.metadata[b'pandas']) 

return self._pandasMd 

 

@property 

def columnIndex(self): 

"""Columns as a pandas Index 

""" 

if self._columnIndex is None: 

self._columnIndex = self._getColumnIndex() 

return self._columnIndex 

 

def _getColumnIndex(self): 

if self._df is not None: 

return self._df.columns 

else: 

return pd.Index(self.columns) 

 

@property 

def columns(self): 

"""List of column names (or column index if df is set) 

 

This may either be a list of column names, or a 

pandas.Index object describing the column index, depending 

on whether the ParquetTable object is wrapping a ParquetFile 

or a DataFrame. 

""" 

if self._columns is None: 

self._columns = self._getColumns() 

return self._columns 

 

def _getColumns(self): 

if self._df is not None: 

return self._sanitizeColumns(self._df.columns) 

else: 

return self._pf.metadata.schema.names 

 

def _sanitizeColumns(self, columns): 

return [c for c in columns if c in self.columnIndex] 

 

def toDataFrame(self, columns=None): 

"""Get table (or specified columns) as a pandas DataFrame 

 

Parameters 

---------- 

columns : list, optional 

Desired columns. If `None`, then all columns will be 

returned. 

""" 

if self._pf is None: 

if columns is None: 

return self._df 

else: 

return self._df[columns] 

 

if columns is None: 

return self._pf.read().to_pandas() 

 

df = self._pf.read(columns=columns, use_pandas_metadata=True).to_pandas() 

return df 

 

 

class MultilevelParquetTable(ParquetTable):
    """Wrapper to access dataframe with multi-level column index from Parquet

    This subclass of `ParquetTable` is necessary to handle multi-level
    column indices, because there is not a convenient way to request
    specific table subsets by level via Parquet through pyarrow, as there
    is with a `pandas.DataFrame`.

    Additionally, pyarrow stores multilevel index information in a very
    strange way. Pandas stores it as a tuple, so that one can access a
    single column from a pandas dataframe as
    `df[('ref', 'HSC-G', 'coord_ra')]`. However, for some reason pyarrow
    saves these indices as "stringified" tuples, such that in order to
    read this same column from a table written to Parquet, you would
    have to do the following:

        pf = pyarrow.ParquetFile(filename)
        df = pf.read(columns=["('ref', 'HSC-G', 'coord_ra')"])

    See also https://github.com/apache/arrow/issues/1771, where we've raised
    this issue.

    As multilevel-indexed dataframes can be very useful for storing things
    like multiple filters' worth of data in the same table, this case
    deserves a wrapper to enable easier access; that's what this object
    is for. For example,

        parq = MultilevelParquetTable(filename)
        columnDict = {'dataset': 'meas',
                      'filter': 'HSC-G',
                      'column': ['coord_ra', 'coord_dec']}
        df = parq.toDataFrame(columns=columnDict)

    will return just the coordinate columns; the equivalent of calling
    `df['meas']['HSC-G'][['coord_ra', 'coord_dec']]` on the total dataframe,
    but without having to load the whole frame into memory---this reads just
    those columns from disk. You can also request a sub-table; e.g.,

        parq = MultilevelParquetTable(filename)
        columnDict = {'dataset': 'meas',
                      'filter': 'HSC-G'}
        df = parq.toDataFrame(columns=columnDict)

    and this will be the equivalent of `df['meas']['HSC-G']` on the total
    dataframe.

    Parameters
    ----------
    filename : str, optional
        Path to Parquet file.
    dataFrame : dataFrame, optional
        In-memory dataframe to wrap instead of reading from file.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self._columnLevelNames = None

    @property
    def columnLevelNames(self):
        """Dict of unique entry names, keyed by level of the column index."""
        if self._columnLevelNames is None:
            self._columnLevelNames = {level: list(np.unique(np.array(self.columns)[:, i]))
                                      for i, level in enumerate(self.columnLevels)}
        return self._columnLevelNames

    @property
    def columnLevels(self):
        """Names of levels in column index
        """
        return self.columnIndex.names

    def _getColumnIndex(self):
        if self._df is not None:
            return super()._getColumnIndex()
        else:
            levelNames = [f['name'] for f in self.pandasMd['column_indexes']]
            return pd.MultiIndex.from_tuples(self.columns, names=levelNames)

    def _getColumns(self):
        if self._df is not None:
            return super()._getColumns()
        else:
            # pyarrow stores multi-level column names as stringified tuples,
            # e.g. "('ref', 'HSC-G', 'coord_ra')"; recover the tuples by
            # matching one quoted group per index level.
            columns = self._pf.metadata.schema.names
            n = len(self.pandasMd['column_indexes'])
            pattern = re.compile(', '.join(["'(.*)'"] * n))
            matches = [re.search(pattern, c) for c in columns]
            return [m.groups() for m in matches if m is not None]

    def toDataFrame(self, columns=None, droplevels=True):
        """Get table (or specified columns) as a pandas DataFrame

        To get specific columns in specified sub-levels:

            parq = MultilevelParquetTable(filename)
            columnDict = {'dataset': 'meas',
                          'filter': 'HSC-G',
                          'column': ['coord_ra', 'coord_dec']}
            df = parq.toDataFrame(columns=columnDict)

        Or, to get an entire subtable, leave out one level name:

            parq = MultilevelParquetTable(filename)
            columnDict = {'dataset': 'meas',
                          'filter': 'HSC-G'}
            df = parq.toDataFrame(columns=columnDict)

        Parameters
        ----------
        columns : list or dict, optional
            Desired columns. If `None`, then all columns will be
            returned. If a list, then the names of the columns must
            be *exactly* as stored by pyarrow; that is, stringified tuples.
            If a dictionary, then the entries of the dictionary must
            correspond to the level names of the column multi-index
            (that is, the `columnLevels` attribute). Not every level
            must be passed; if any level is left out, then all entries
            in that level will be implicitly included.
        droplevels : bool
            If True, drop levels of the column index that have just
            one entry.
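
        For example, with the first `columnDict` above and
        `droplevels=True`, the single-valued `dataset` and `filter`
        levels are dropped, so the returned frame's columns are simply
        `coord_ra` and `coord_dec`.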

 

""" 

if columns is None: 

if self._pf is None: 

return self._df 

else: 

return self._pf.read().to_pandas() 

 

if isinstance(columns, dict): 

columns = self._colsFromDict(columns) 

 

if self._pf is None: 

try: 

df = self._df[columns] 

except (AttributeError, KeyError): 

newColumns = [c for c in columns if c in self.columnIndex] 

if not newColumns: 

raise ValueError('None of the requested columns ({}) are available!'.format(columns)) 

df = self._df[columns] 

else: 

pfColumns = self._stringify(columns) 

try: 

df = self._pf.read(columns=pfColumns, use_pandas_metadata=True).to_pandas() 

except (AttributeError, KeyError): 

newColumns = [c for c in columns if c in self.columnIndex] 

if not newColumns: 

raise ValueError('None of the requested columns ({}) are available!'.format(columns)) 

pfColumns = self._stringify(newColumns) 

df = self._pf.read(columns=pfColumns, use_pandas_metadata=True).to_pandas() 

 

if droplevels: 

# Drop levels of column index that have just one entry 

levelsToDrop = [n for l, n in zip(df.columns.levels, df.columns.names) 

if len(l) == 1] 

 

# Prevent error when trying to drop *all* columns 

if len(levelsToDrop) == len(df.columns.names): 

levelsToDrop.remove(df.columns.names[-1]) 

 

df.columns = df.columns.droplevel(levelsToDrop) 

 

return df 

 

    def _colsFromDict(self, colDict):
        # Expand a dictionary of per-level selections into the full list of
        # column tuples: any level not named in colDict implicitly includes
        # all of its entries.
        new_colDict = {}
        for i, l in enumerate(self.columnLevels):
            if l in colDict:
                if isinstance(colDict[l], str):
                    new_colDict[l] = [colDict[l]]
                else:
                    new_colDict[l] = colDict[l]
            else:
                new_colDict[l] = self.columnIndex.levels[i]

        levelCols = [new_colDict[l] for l in self.columnLevels]
        cols = product(*levelCols)
        return list(cols)

    def _stringify(self, cols):
        # Convert column tuples to the "stringified" form pyarrow uses.
        return [str(c) for c in cols]
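

# The block below is an illustrative usage sketch, not part of the original
# module: it exercises the in-memory (dataFrame) code path of
# MultilevelParquetTable, which needs no Parquet file on disk. All names and
# data values here are made up for demonstration.
if __name__ == "__main__":
    columns = pd.MultiIndex.from_tuples(
        [('meas', 'HSC-G', 'coord_ra'), ('meas', 'HSC-G', 'coord_dec'),
         ('meas', 'HSC-R', 'coord_ra'), ('meas', 'HSC-R', 'coord_dec')],
        names=['dataset', 'filter', 'column'])
    df = pd.DataFrame(np.arange(8.).reshape(2, 4), columns=columns)

    parq = MultilevelParquetTable(dataFrame=df)
    print(parq.columnLevels)                # the three level names
    print(parq.columnLevelNames['filter'])  # unique filters in the table

    # Select one filter's coordinate columns via a column dictionary; the
    # single-valued 'dataset' level is dropped from the result by default.
    sub = parq.toDataFrame(columns={'filter': 'HSC-G',
                                    'column': ['coord_ra', 'coord_dec']})
    print(sub.columns.tolist())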