# Implementation of thin wrappers to pyarrow.ParquetFile.
import re
import json
from itertools import product

import pyarrow
import pyarrow.parquet
import pandas as pd
35 """Thin wrapper to pyarrow's ParquetFile object
37 Call `toDataFrame` method to get a `pandas.DataFrame` object,
38 optionally passing specific columns.
40 The main purpose of having this wrapper rather than directly
41 using `pyarrow.ParquetFile` is to make it nicer to load
42 selected subsets of columns, especially from dataframes with multi-level
45 Instantiated with either a path to a parquet file or a dataFrame
49 filename : str, optional
51 dataFrame : dataFrame, optional
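    # Hedged usage sketch (the path here is an illustrative assumption):
    #
    #     parq = ParquetTable("example.parq")      # wrap a file on disk
    #     parq = ParquetTable(dataFrame=df)        # or wrap an in-memory frame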
    def __init__(self, filename=None, dataFrame=None):
        if filename is not None:
            self._pf = pyarrow.parquet.ParquetFile(filename)
            self._df = None
            self._pandasMd = None
        elif dataFrame is not None:
            self._df = dataFrame
            self._pf = None
        else:
            raise ValueError('Either filename or dataFrame must be passed.')

        self._columns = None
        self._columnIndex = None
69 """Write pandas dataframe to parquet
74 Path to which to write.
77 raise ValueError(
'df property must be defined to write.')
78 table = pyarrow.Table.from_pandas(self.
_df)
79 pyarrow.parquet.write_table(table, filename, compression=
'none')
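    # Hedged example of a write/read round trip ("/tmp/example.parq" is an
    # illustrative path):
    #
    #     ParquetTable(dataFrame=df).write("/tmp/example.parq")
    #     df2 = ParquetTable("/tmp/example.parq").toDataFrame()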
    @property
    def pandasMd(self):
        if self._pf is None:
            raise AttributeError("This property is only accessible if ._pf is set.")
        if self._pandasMd is None:
            self._pandasMd = json.loads(self._pf.metadata.metadata[b'pandas'])
        return self._pandasMd
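    # Note: the b'pandas' metadata entry is a JSON blob; per pyarrow's
    # pandas-metadata convention it is expected to contain keys such as
    # 'index_columns', 'column_indexes', and 'columns' (the code below relies
    # on 'column_indexes').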
91 """Columns as a pandas Index
97 def _getColumnIndex(self):
98 if self.
_df is not None:
99 return self.
_df.columns
105 """List of column names (or column index if df is set)
107 This may either be a list of column names, or a
108 pandas.Index object describing the column index, depending
109 on whether the ParquetTable object is wrapping a ParquetFile
116 def _getColumns(self):
117 if self.
_df is not None:
120 return self.
_pf.metadata.schema.names
    def _sanitizeColumns(self, columns):
        return [c for c in columns if c in self.columnIndex]
126 """Get table (or specified columns) as a pandas DataFrame
130 columns : list, optional
131 Desired columns. If `None`, then all columns will be
138 return self.
_df[columns]
141 return self.
_pf.read().to_pandas()
143 df = self.
_pf.read(columns=columns, use_pandas_metadata=
True).to_pandas()
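# A hedged sketch of selecting a subset of columns without loading the whole
# table (the path and column names are illustrative assumptions):
#
#     parq = ParquetTable("example.parq")
#     radec = parq.toDataFrame(columns=["coord_ra", "coord_dec"])
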
148 """Wrapper to access dataframe with multi-level column index from Parquet
150 This subclass of `ParquetTable` to handle the multi-level is necessary
151 because there is not a convenient way to request specific table subsets
152 by level via Parquet through pyarrow, as there is with a `pandas.DataFrame`.
154 Additionally, pyarrow stores multilevel index information in a very strange
155 way. Pandas stores it as a tuple, so that one can access a single column
156 from a pandas dataframe as `df[('ref', 'HSC-G', 'coord_ra')]`. However, for
157 some reason pyarrow saves these indices as "stringified" tuples, such that
158 in order to read thissame column from a table written to Parquet, you would
159 have to do the following:
161 pf = pyarrow.ParquetFile(filename)
162 df = pf.read(columns=["('ref', 'HSC-G', 'coord_ra')"])
164 See also https://github.com/apache/arrow/issues/1771, where we've raised
167 As multilevel-indexed dataframes can be very useful to store data like
168 multiple filters' worth of data in the same table, this case deserves a
169 wrapper to enable easier access;
170 that's what this object is for. For example,
172 parq = MultilevelParquetTable(filename)
173 columnDict = {'dataset':'meas',
175 'column':['coord_ra', 'coord_dec']}
176 df = parq.toDataFrame(columns=columnDict)
178 will return just the coordinate columns; the equivalent of calling
179 `df['meas']['HSC-G'][['coord_ra', 'coord_dec']]` on the total dataframe,
180 but without having to load the whole frame into memory---this reads just
181 those columns from disk. You can also request a sub-table; e.g.,
183 parq = MultilevelParquetTable(filename)
184 columnDict = {'dataset':'meas',
186 df = parq.toDataFrame(columns=columnDict)
188 and this will be the equivalent of `df['meas']['HSC-G']` on the total dataframe.
192 filename : str, optional
193 Path to Parquet file.
194 dataFrame : dataFrame, optional
    def __init__(self, *args, **kwargs):
        super(MultilevelParquetTable, self).__init__(*args, **kwargs)
210 """Names of levels in column index
    def _getColumnIndex(self):
        if self._df is not None:
            return super()._getColumnIndex()
        else:
            levelNames = [f['name'] for f in self.pandasMd['column_indexes']]
            return pd.MultiIndex.from_tuples(self.columns, names=levelNames)
    def _getColumns(self):
        if self._df is not None:
            return super()._getColumns()
        else:
            columns = self._pf.metadata.schema.names
            n = len(self.pandasMd['column_indexes'])
            pattern = re.compile(', '.join(["'(.*)'"] * n))
            matches = [re.search(pattern, c) for c in columns]
            return [m.groups() for m in matches if m is not None]
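        # For example, with a three-level index the stored name
        # "('meas', 'HSC-G', 'coord_ra')" matches the pattern
        # "'(.*)', '(.*)', '(.*)'" and yields the tuple
        # ('meas', 'HSC-G', 'coord_ra'); non-matching names (e.g. pyarrow's
        # index columns) are dropped.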
232 """Get table (or specified columns) as a pandas DataFrame
234 To get specific columns in specified sub-levels:
236 parq = MultilevelParquetTable(filename)
237 columnDict = {'dataset':'meas',
239 'column':['coord_ra', 'coord_dec']}
240 df = parq.toDataFrame(columns=columnDict)
242 Or, to get an entire subtable, leave out one level name:
244 parq = MultilevelParquetTable(filename)
245 columnDict = {'dataset':'meas',
247 df = parq.toDataFrame(columns=columnDict)
251 columns : list or dict, optional
252 Desired columns. If `None`, then all columns will be
253 returned. If a list, then the names of the columns must
254 be *exactly* as stored by pyarrow; that is, stringified tuples.
255 If a dictionary, then the entries of the dictionary must
256 correspond to the level names of the column multi-index
257 (that is, the `columnLevels` attribute). Not every level
258 must be passed; if any level is left out, then all entries
259 in that level will be implicitly included.
261 If True drop levels of column index that have just one entry
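        # Illustrative droplevels behavior (level names taken from the
        # docstring example): requesting {'dataset': 'meas', 'filter': 'HSC-G'}
        # leaves 'dataset' and 'filter' with a single entry each, so with
        # droplevels=True the returned frame is indexed by 'column' alone.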
        if columns is None:
            if self._pf is None:
                return self._df
            return self._pf.read().to_pandas()

        if isinstance(columns, dict):
            columns = self._colsFromDict(columns)

        if self._pf is None:
            try:
                df = self._df[columns]
            except (AttributeError, KeyError):
                newColumns = [c for c in columns if c in self.columnIndex]
                if not newColumns:
                    raise ValueError('None of the requested columns ({}) are available!'.format(columns))
                df = self._df[newColumns]
        else:
            pfColumns = self._stringify(columns)
            try:
                df = self._pf.read(columns=pfColumns, use_pandas_metadata=True).to_pandas()
            except (AttributeError, KeyError):
                newColumns = [c for c in columns if c in self.columnIndex]
                if not newColumns:
                    raise ValueError('None of the requested columns ({}) are available!'.format(columns))
                pfColumns = self._stringify(newColumns)
                df = self._pf.read(columns=pfColumns, use_pandas_metadata=True).to_pandas()

        if droplevels:
            # Drop levels of the column index that have only one entry.
            levelsToDrop = [n for l, n in zip(df.columns.levels, df.columns.names) if len(l) == 1]

            # Prevent an error when trying to drop *all* levels.
            if len(levelsToDrop) == len(df.columns.names):
                levelsToDrop.remove(df.columns.names[-1])

            df.columns = df.columns.droplevel(levelsToDrop)

        return df
    def _colsFromDict(self, colDict):
        new_colDict = {}
        for i, l in enumerate(self.columnLevels):
            if l in colDict:
                if isinstance(colDict[l], str):
                    new_colDict[l] = [colDict[l]]
                else:
                    new_colDict[l] = colDict[l]
            else:
                # A level that is left out implicitly includes all its entries.
                new_colDict[l] = self.columnIndex.levels[i]

        levelCols = [new_colDict[l] for l in self.columnLevels]
        cols = product(*levelCols)
        return list(cols)
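    # For example, if the levels are ('dataset', 'filter', 'column') and the
    # file contains filters 'HSC-G' and 'HSC-R' (illustrative values), then
    # {'dataset': 'meas', 'column': ['coord_ra']} expands to
    # [('meas', 'HSC-G', 'coord_ra'), ('meas', 'HSC-R', 'coord_ra')].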
    def _stringify(self, cols):
        return [str(c) for c in cols]
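
# Hedged end-to-end sketch. The column levels, filter names, values, and the
# output path below are illustrative assumptions, not part of the library API;
# it also assumes pyarrow flattens multi-level columns to stringified tuples
# on write, as described in the class docstring.
if __name__ == "__main__":
    columns = pd.MultiIndex.from_product(
        [["meas"], ["HSC-G", "HSC-R"], ["coord_ra", "coord_dec"]],
        names=["dataset", "filter", "column"],
    )
    df = pd.DataFrame([[0.0] * len(columns)] * 3, columns=columns)
    MultilevelParquetTable(dataFrame=df).write("example_multilevel.parq")

    parq = MultilevelParquetTable("example_multilevel.parq")
    sub = parq.toDataFrame(columns={"dataset": "meas", "filter": "HSC-G"})
    # With droplevels=True (the default), the single-entry 'dataset' and
    # 'filter' levels collapse, leaving columns ['coord_ra', 'coord_dec'].
    print(sub.columns.tolist())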