# This file is part of pipe_tasks.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Implementation of thin wrappers to pyarrow.ParquetFile.
"""

import re
import json
from itertools import product
import pyarrow
import pyarrow.parquet
import numpy as np
import pandas as pd


class ParquetTable(object):
    """Thin wrapper to pyarrow's ParquetFile object

    Call the `toDataFrame` method to get a `pandas.DataFrame` object,
    optionally passing specific columns.

    The main purpose of having this wrapper rather than directly
    using `pyarrow.ParquetFile` is to make it nicer to load
    selected subsets of columns, especially from dataframes with multi-level
    column indices.

    Instantiate with either a path to a Parquet file or a DataFrame.
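
    For example (the file path here is illustrative):

        parq = ParquetTable(filename='/path/to/table.parq')
        df = parq.toDataFrame(columns=['coord_ra', 'coord_dec'])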

    Parameters
    ----------
    filename : str, optional
        Path to Parquet file.
    dataFrame : pandas.DataFrame, optional
        DataFrame to wrap; used instead of reading from `filename`.
    """

    def __init__(self, filename=None, dataFrame=None):
        if filename is not None:
            self._pf = pyarrow.parquet.ParquetFile(filename)
            self._df = None
            self._pandasMd = None
        elif dataFrame is not None:
            self._df = dataFrame
            self._pf = None
        else:
            raise ValueError('Either filename or dataFrame must be passed.')

        self._columns = None
        self._columnIndex = None

    def write(self, filename):
        """Write pandas dataframe to parquet
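
        For example (the output path is illustrative):

            parq = ParquetTable(dataFrame=df)
            parq.write('/path/to/output.parq')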

        Parameters
        ----------
        filename : str
            Path to which to write.
        """
        if self._df is None:
            raise ValueError('df property must be defined to write.')
        table = pyarrow.Table.from_pandas(self._df)
        pyarrow.parquet.write_table(table, filename, compression='none')

    @property
    def pandasMd(self):
        """Pandas metadata dictionary, as stored in the Parquet file."""
        if self._pf is None:
            raise AttributeError("This property is only accessible if ._pf is set.")
        if self._pandasMd is None:
            self._pandasMd = json.loads(self._pf.metadata.metadata[b'pandas'])
        return self._pandasMd

    @property
    def columnIndex(self):
        """Columns as a pandas Index
        """
        if self._columnIndex is None:
            self._columnIndex = self._getColumnIndex()
        return self._columnIndex

    def _getColumnIndex(self):
        if self._df is not None:
            return self._df.columns
        else:
            return pd.Index(self.columns)

    @property
    def columns(self):
        """List of column names (or column index if df is set)

        This may either be a list of column names, or a
        pandas.Index object describing the column index, depending
        on whether the ParquetTable object is wrapping a ParquetFile
        or a DataFrame.
        """
        if self._columns is None:
            self._columns = self._getColumns()
        return self._columns

    def _getColumns(self):
        if self._df is not None:
            return self._sanitizeColumns(self._df.columns)
        else:
            return self._pf.metadata.schema.names

    def _sanitizeColumns(self, columns):
        # Keep only columns that actually exist in the column index
        return [c for c in columns if c in self.columnIndex]

    def toDataFrame(self, columns=None):
        """Get table (or specified columns) as a pandas DataFrame
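
        For example (column names are illustrative):

            df = parq.toDataFrame(columns=['coord_ra', 'coord_dec'])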

        Parameters
        ----------
        columns : list, optional
            Desired columns. If `None`, then all columns will be
            returned.
        """
        if self._pf is None:
            if columns is None:
                return self._df
            else:
                return self._df[columns]

        if columns is None:
            return self._pf.read().to_pandas()

        df = self._pf.read(columns=columns, use_pandas_metadata=True).to_pandas()
        return df


class MultilevelParquetTable(ParquetTable):
    """Wrapper to access dataframe with multi-level column index from Parquet

    This subclass of `ParquetTable` is necessary because there is no
    convenient way to request specific column subsets by level from a
    Parquet file via pyarrow, as there is with a `pandas.DataFrame`.

    Additionally, pyarrow stores multilevel index information in a very
    strange way. Pandas stores it as a tuple, so that one can access a
    single column from a pandas dataframe as
    `df[('ref', 'HSC-G', 'coord_ra')]`. However, for some reason pyarrow
    saves these indices as "stringified" tuples, such that in order to read
    this same column from a table written to Parquet, you would have to do
    the following:

        pf = pyarrow.ParquetFile(filename)
        df = pf.read(columns=["('ref', 'HSC-G', 'coord_ra')"])

    See also https://github.com/apache/arrow/issues/1771, where we've raised
    this issue.

    As multilevel-indexed dataframes can be very useful for storing data such
    as multiple filters' worth of measurements in the same table, this case
    deserves a wrapper to enable easier access; that's what this object is
    for. For example,

        parq = MultilevelParquetTable(filename)
        columnDict = {'dataset':'meas',
                      'filter':'HSC-G',
                      'column':['coord_ra', 'coord_dec']}
        df = parq.toDataFrame(columns=columnDict)

    will return just the coordinate columns; the equivalent of calling
    `df['meas']['HSC-G'][['coord_ra', 'coord_dec']]` on the total dataframe,
    but without having to load the whole frame into memory; this reads just
    those columns from disk. You can also request a sub-table; e.g.,

        parq = MultilevelParquetTable(filename)
        columnDict = {'dataset':'meas',
                      'filter':'HSC-G'}
        df = parq.toDataFrame(columns=columnDict)

    and this will be the equivalent of `df['meas']['HSC-G']` on the total
    dataframe.

    Parameters
    ----------
    filename : str, optional
        Path to Parquet file.
    dataFrame : pandas.DataFrame, optional
        DataFrame to wrap; used instead of reading from `filename`.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self._columnLevelNames = None

    @property
    def columnLevelNames(self):
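        """Dictionary of unique entries in each level of the column index.

        For a table with columns like ('meas', 'HSC-G', 'coord_ra'), this
        would look something like (an illustrative sketch):

            {'dataset': ['meas', 'ref'],
             'filter': ['HSC-G'],
             'column': ['coord_dec', 'coord_ra']}
        """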
        if self._columnLevelNames is None:
            self._columnLevelNames = {level: list(np.unique(np.array(self.columns)[:, i]))
                                      for i, level in enumerate(self.columnLevels)}
        return self._columnLevelNames

    @property
    def columnLevels(self):
        """Names of levels in column index
        """
        return self.columnIndex.names

    def _getColumnIndex(self):
        if self._df is not None:
            return super()._getColumnIndex()
        else:
            levelNames = [f['name'] for f in self.pandasMd['column_indexes']]
            return pd.MultiIndex.from_tuples(self.columns, names=levelNames)

    def _getColumns(self):
        if self._df is not None:
            return super()._getColumns()
        else:
            columns = self._pf.metadata.schema.names
            n = len(self.pandasMd['column_indexes'])
            # Column names are stored as stringified tuples, e.g.
            # "('ref', 'HSC-G', 'coord_ra')"; parse them back into tuples
            # of level values.
            pattern = re.compile(', '.join(["'(.*)'"] * n))
            matches = [re.search(pattern, c) for c in columns]
            return [m.groups() for m in matches if m is not None]

    def toDataFrame(self, columns=None, droplevels=True):
        """Get table (or specified columns) as a pandas DataFrame

        To get specific columns in specified sub-levels:

            parq = MultilevelParquetTable(filename)
            columnDict = {'dataset':'meas',
                          'filter':'HSC-G',
                          'column':['coord_ra', 'coord_dec']}
            df = parq.toDataFrame(columns=columnDict)

        Or, to get an entire subtable, leave out one level name:

            parq = MultilevelParquetTable(filename)
            columnDict = {'dataset':'meas',
                          'filter':'HSC-G'}
            df = parq.toDataFrame(columns=columnDict)

        Parameters
        ----------
        columns : list or dict, optional
            Desired columns. If `None`, then all columns will be
            returned. If a list, then the names of the columns must
            be *exactly* as stored by pyarrow; that is, stringified tuples.
            If a dictionary, then the entries of the dictionary must
            correspond to the level names of the column multi-index
            (that is, the `columnLevels` attribute). Not every level
            must be passed; if any level is left out, then all entries
            in that level will be implicitly included.
        droplevels : bool
            If True, drop levels of the column index that have only one
            entry.

        """
        if columns is None:
            if self._pf is None:
                return self._df
            else:
                return self._pf.read().to_pandas()

        if isinstance(columns, dict):
            columns = self._colsFromDict(columns)

        if self._pf is None:
            try:
                df = self._df[columns]
            except (AttributeError, KeyError):
                newColumns = [c for c in columns if c in self.columnIndex]
                if not newColumns:
                    raise ValueError('None of the requested columns ({}) are available!'.format(columns))
                # Retry with only the columns that are actually available
                df = self._df[newColumns]
        else:
            pfColumns = self._stringify(columns)
            try:
                df = self._pf.read(columns=pfColumns, use_pandas_metadata=True).to_pandas()
            except (AttributeError, KeyError):
                newColumns = [c for c in columns if c in self.columnIndex]
                if not newColumns:
                    raise ValueError('None of the requested columns ({}) are available!'.format(columns))
                pfColumns = self._stringify(newColumns)
                df = self._pf.read(columns=pfColumns, use_pandas_metadata=True).to_pandas()

        if droplevels:
            # Drop levels of column index that have just one entry
            levelsToDrop = [n for l, n in zip(df.columns.levels, df.columns.names)
                            if len(l) == 1]

            # Prevent error when trying to drop *all* columns
            if len(levelsToDrop) == len(df.columns.names):
                levelsToDrop.remove(df.columns.names[-1])

            df.columns = df.columns.droplevel(levelsToDrop)

        return df

    def _colsFromDict(self, colDict):
        """Expand a dictionary of level names into a list of column tuples."""
        new_colDict = {}
        for i, l in enumerate(self.columnLevels):
            if l in colDict:
                if isinstance(colDict[l], str):
                    new_colDict[l] = [colDict[l]]
                else:
                    new_colDict[l] = colDict[l]
            else:
                new_colDict[l] = self.columnIndex.levels[i]
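
        # As an illustrative sketch: {'dataset': 'meas', 'filter': 'HSC-G',
        # 'column': ['coord_ra', 'coord_dec']} expands, via the Cartesian
        # product below, to [('meas', 'HSC-G', 'coord_ra'),
        # ('meas', 'HSC-G', 'coord_dec')].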
        levelCols = [new_colDict[l] for l in self.columnLevels]
        cols = product(*levelCols)
        return list(cols)

    def _stringify(self, cols):
        # Convert column tuples to pyarrow's stringified form, e.g.
        # ('ref', 'HSC-G', 'coord_ra') -> "('ref', 'HSC-G', 'coord_ra')"
        return [str(c) for c in cols]
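

# A minimal round-trip sketch, not part of the library API; the file path
# and column entries are illustrative. It builds a small multi-level
# DataFrame, writes it with `ParquetTable`, then reads a sub-table back
# with `MultilevelParquetTable`.
if __name__ == '__main__':
    columns = pd.MultiIndex.from_tuples(
        [('meas', 'HSC-G', 'coord_ra'), ('meas', 'HSC-G', 'coord_dec'),
         ('ref', 'HSC-G', 'coord_ra'), ('ref', 'HSC-G', 'coord_dec')],
        names=['dataset', 'filter', 'column'])
    df = pd.DataFrame(np.random.rand(5, 4), columns=columns)

    ParquetTable(dataFrame=df).write('demo.parq')

    parq = MultilevelParquetTable(filename='demo.parq')
    sub = parq.toDataFrame(columns={'dataset': 'meas', 'filter': 'HSC-G'})
    print(sub.columns)  # the single-entry 'dataset' and 'filter' levels are dropped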