"""
Implementation of thin wrappers to pyarrow.ParquetFile.
"""
import re
import json
from itertools import product

import pyarrow
import pyarrow.parquet
import pandas as pd

35 """Thin wrapper to pyarrow's ParquetFile object 37 Call `toDataFrame` method to get a `pandas.DataFrame` object, 38 optionally passing specific columns. 40 The main purpose of having this wrapper rather than directly 41 using `pyarrow.ParquetFile` is to make it nicer to load 42 selected subsets of columns, especially from dataframes with multi-level 45 Instantiated with either a path to a parquet file or a dataFrame 49 filename : str, optional 51 dataFrame : dataFrame, optional 54 def __init__(self, filename=None, dataFrame=None):
        if filename is not None:
            self._pf = pyarrow.parquet.ParquetFile(filename)
            self._df = None
            self._pandasMd = None
        elif dataFrame is not None:
            self._df = dataFrame
            self._pf = None
        else:
            raise ValueError('Either filename or dataFrame must be passed.')

        self._columns = None
        self._columnIndex = None
69 """Write pandas dataframe to parquet 74 Path to which to write. 77 raise ValueError(
'df property must be defined to write.')
78 table = pyarrow.Table.from_pandas(self.
_df)
79 pyarrow.parquet.write_table(table, filename, compression=
'none')
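
    # A minimal round-trip sketch (not part of the original docstrings;
    # the file name and the in-memory DataFrame ``df`` are hypothetical):
    #
    #     parq = ParquetTable(dataFrame=df)
    #     parq.write("table.parq")
    #     parq2 = ParquetTable("table.parq")
    #     df2 = parq2.toDataFrame()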

    @property
    def pandasMd(self):
        if self._pf is None:
            raise AttributeError("This property is only accessible if ._pf is set.")
        if self._pandasMd is None:
            self._pandasMd = json.loads(self._pf.metadata.metadata[b'pandas'])
        return self._pandasMd
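
    # For reference, the JSON blob pyarrow stores under the b'pandas' key
    # contains entries such as 'columns', 'index_columns', and
    # 'column_indexes' (the last is what MultilevelParquetTable uses below
    # to recover level names).  A sketch of inspecting it directly:
    #
    #     pf = pyarrow.parquet.ParquetFile(filename)
    #     md = json.loads(pf.metadata.metadata[b'pandas'])
    #     md['column_indexes']   # one entry per column-index level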
91 """Columns as a pandas Index 97 def _getColumnIndex(self):
98 if self.
_df is not None:
99 return self.
_df.columns
105 """List of column names (or column index if df is set) 107 This may either be a list of column names, or a 108 pandas.Index object describing the column index, depending 109 on whether the ParquetTable object is wrapping a ParquetFile 116 def _getColumns(self):
117 if self.
_df is not None:
120 return self.
_pf.metadata.schema.names

    def _sanitizeColumns(self, columns):
        return [c for c in columns if c in self.columnIndex]
126 """Get table (or specified columns) as a pandas DataFrame 130 columns : list, optional 131 Desired columns. If `None`, then all columns will be 138 return self.
_df[columns]
141 return self.
_pf.read().to_pandas()
143 df = self.
_pf.read(columns=columns, use_pandas_metadata=
True).to_pandas()
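
# Usage sketch for column-subset reads (file and column names are
# hypothetical):
#
#     parq = ParquetTable("table.parq")
#     parq.columns                        # column names from Parquet metadata
#     df = parq.toDataFrame(columns=["coord_ra", "coord_dec"])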
148 """Wrapper to access dataframe with multi-level column index from Parquet 150 This subclass of `ParquetTable` to handle the multi-level is necessary 151 because there is not a convenient way to request specific table subsets 152 by level via Parquet through pyarrow, as there is with a `pandas.DataFrame`. 154 Additionally, pyarrow stores multilevel index information in a very strange 155 way. Pandas stores it as a tuple, so that one can access a single column 156 from a pandas dataframe as `df[('ref', 'HSC-G', 'coord_ra')]`. However, for 157 some reason pyarrow saves these indices as "stringified" tuples, such that 158 in order to read thissame column from a table written to Parquet, you would 159 have to do the following: 161 pf = pyarrow.ParquetFile(filename) 162 df = pf.read(columns=["('ref', 'HSC-G', 'coord_ra')"]) 164 See also https://github.com/apache/arrow/issues/1771, where we've raised 167 As multilevel-indexed dataframes can be very useful to store data like 168 multiple filters' worth of data in the same table, this case deserves a 169 wrapper to enable easier access; 170 that's what this object is for. For example, 172 parq = MultilevelParquetTable(filename) 173 columnDict = {'dataset':'meas', 175 'column':['coord_ra', 'coord_dec']} 176 df = parq.toDataFrame(columns=columnDict) 178 will return just the coordinate columns; the equivalent of calling 179 `df['meas']['HSC-G'][['coord_ra', 'coord_dec']]` on the total dataframe, 180 but without having to load the whole frame into memory---this reads just 181 those columns from disk. You can also request a sub-table; e.g., 183 parq = MultilevelParquetTable(filename) 184 columnDict = {'dataset':'meas', 186 df = parq.toDataFrame(columns=columnDict) 188 and this will be the equivalent of `df['meas']['HSC-G']` on the total dataframe. 192 filename : str, optional 193 Path to Parquet file. 194 dataFrame : dataFrame, optional 197 super(MultilevelParquetTable, self).
__init__(*args, **kwargs)
210 """Names of levels in column index 214 def _getColumnIndex(self):
215 if self.
_df is not None:
216 return super()._getColumnIndex()
218 levelNames = [f[
'name']
for f
in self.
pandasMd[
'column_indexes']]
219 return pd.MultiIndex.from_tuples(self.
columns, names=levelNames)

    def _getColumns(self):
        if self._df is not None:
            return super()._getColumns()
        else:
            columns = self._pf.metadata.schema.names
            n = len(self.pandasMd['column_indexes'])
            pattern = re.compile(', '.join(["'(.*)'"] * n))
            matches = [re.search(pattern, c) for c in columns]
            return [m.groups() for m in matches if m is not None]
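
    # Illustration of the parsing above (hypothetical names, assuming a
    # three-level column index):
    #
    #     n = 3
    #     pattern = re.compile(', '.join(["'(.*)'"] * n))  # "'(.*)', '(.*)', '(.*)'"
    #     re.search(pattern, "('meas', 'HSC-G', 'coord_ra')").groups()
    #     # -> ('meas', 'HSC-G', 'coord_ra')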
232 """Get table (or specified columns) as a pandas DataFrame 234 To get specific columns in specified sub-levels: 236 parq = MultilevelParquetTable(filename) 237 columnDict = {'dataset':'meas', 239 'column':['coord_ra', 'coord_dec']} 240 df = parq.toDataFrame(columns=columnDict) 242 Or, to get an entire subtable, leave out one level name: 244 parq = MultilevelParquetTable(filename) 245 columnDict = {'dataset':'meas', 247 df = parq.toDataFrame(columns=columnDict) 251 columns : list or dict, optional 252 Desired columns. If `None`, then all columns will be 253 returned. If a list, then the names of the columns must 254 be *exactly* as stored by pyarrow; that is, stringified tuples. 255 If a dictionary, then the entries of the dictionary must 256 correspond to the level names of the column multi-index 257 (that is, the `columnLevels` attribute). Not every level 258 must be passed; if any level is left out, then all entries 259 in that level will be implicitly included. 261 If True drop levels of column index that have just one entry 268 return self.
_pf.read().to_pandas()
270 if isinstance(columns, dict):
        if self._pf is None:
            try:
                df = self._df[columns]
            except (AttributeError, KeyError):
                newColumns = [c for c in columns if c in self.columnIndex]
                if not newColumns:
                    raise ValueError(
                        'None of the requested columns ({}) are available!'.format(columns))
                df = self._df[newColumns]
        else:
            pfColumns = self._stringify(columns)
            try:
                df = self._pf.read(columns=pfColumns, use_pandas_metadata=True).to_pandas()
            except (AttributeError, KeyError):
                newColumns = [c for c in columns if c in self.columnIndex]
                if not newColumns:
                    raise ValueError(
                        'None of the requested columns ({}) are available!'.format(columns))
                pfColumns = self._stringify(newColumns)
                df = self._pf.read(columns=pfColumns, use_pandas_metadata=True).to_pandas()

        if droplevels:
            # Drop levels of column index that have just one entry
            levelsToDrop = [n for l, n in zip(df.columns.levels, df.columns.names)
                            if len(l) == 1]

            # Prevent dropping *all* levels, which would leave no column index
            if len(levelsToDrop) == len(df.columns.names):
                levelsToDrop.remove(df.columns.names[-1])

            df.columns = df.columns.droplevel(levelsToDrop)

        return df

    def _colsFromDict(self, colDict):
        new_colDict = {}
        for i, l in enumerate(self.columnLevels):
            if l in colDict:
                if isinstance(colDict[l], str):
                    new_colDict[l] = [colDict[l]]
                else:
                    new_colDict[l] = colDict[l]
            else:
                new_colDict[l] = self.columnIndex.levels[i]

        levelCols = [new_colDict[l] for l in self.columnLevels]
        cols = product(*levelCols)
        return list(cols)
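
    # Illustration (hypothetical level and column names): with columnLevels
    # ['dataset', 'filter', 'column'], a dict like
    #
    #     {'dataset': 'meas', 'column': ['coord_ra', 'coord_dec']}
    #
    # leaves the 'filter' level unconstrained, so it expands to the full
    # tuples for every available filter, e.g.
    #
    #     [('meas', 'HSC-G', 'coord_ra'), ('meas', 'HSC-G', 'coord_dec'),
    #      ('meas', 'HSC-R', 'coord_ra'), ('meas', 'HSC-R', 'coord_dec'), ...]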

    def _stringify(self, cols):
        return [str(c) for c in cols]
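
# End-to-end usage sketch (file name, level names, and filter names are
# hypothetical; the actual levels depend on how the table was written):
#
#     parq = MultilevelParquetTable("deepCoadd_obj.parq")
#     parq.columnLevels                   # e.g. ['dataset', 'filter', 'column']
#     df = parq.toDataFrame(columns={'dataset': 'meas',
#                                    'filter': 'HSC-G',
#                                    'column': ['coord_ra', 'coord_dec']})
#     # With droplevels=True (the default), the single-entry 'dataset' and
#     # 'filter' levels are dropped, leaving plain 'coord_ra'/'coord_dec'
#     # columns.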