lsst.pipe.tasks  19.0.0-53-g31c4d99c
parquetTable.py
# This file is part of pipe_tasks.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Implementation of thin wrappers to pyarrow.ParquetFile.
"""

import re
import json
from itertools import product
import pyarrow
import pyarrow.parquet
import numpy as np
import pandas as pd


class ParquetTable(object):
    """Thin wrapper to pyarrow's ParquetFile object

    Call the `toDataFrame` method to get a `pandas.DataFrame` object,
    optionally passing specific columns.

    The main purpose of having this wrapper rather than directly
    using `pyarrow.ParquetFile` is to make it nicer to load
    selected subsets of columns, especially from dataframes with multi-level
    column indices.

    Instantiate with either a path to a Parquet file or a DataFrame.

    Parameters
    ----------
    filename : str, optional
        Path to Parquet file.
    dataFrame : dataFrame, optional
        Pandas DataFrame to wrap.
    """

    def __init__(self, filename=None, dataFrame=None):
        if filename is not None:
            self._pf = pyarrow.parquet.ParquetFile(filename)
            self._df = None
            self._pandasMd = None
        elif dataFrame is not None:
            self._df = dataFrame
            self._pf = None
        else:
            raise ValueError('Either filename or dataFrame must be passed.')

        self._columns = None
        self._columnIndex = None

    def write(self, filename):
        """Write pandas dataframe to parquet

        Parameters
        ----------
        filename : str
            Path to which to write.
        """
        if self._df is None:
            raise ValueError('df property must be defined to write.')
        table = pyarrow.Table.from_pandas(self._df)
        pyarrow.parquet.write_table(table, filename, compression='none')

    @property
    def pandasMd(self):
        """Pandas metadata dictionary stored with the Parquet file."""
        if self._pf is None:
            raise AttributeError("This property is only accessible if ._pf is set.")
        if self._pandasMd is None:
            self._pandasMd = json.loads(self._pf.metadata.metadata[b'pandas'])
        return self._pandasMd

    @property
    def columnIndex(self):
        """Columns as a pandas Index
        """
        if self._columnIndex is None:
            self._columnIndex = self._getColumnIndex()
        return self._columnIndex

    def _getColumnIndex(self):
        if self._df is not None:
            return self._df.columns
        else:
            return pd.Index(self.columns)

    @property
    def columns(self):
        """List of column names (or column index if df is set)

        This may either be a list of column names, or a
        pandas.Index object describing the column index, depending
        on whether the ParquetTable object is wrapping a ParquetFile
        or a DataFrame.
        """
        if self._columns is None:
            self._columns = self._getColumns()
        return self._columns

    def _getColumns(self):
        if self._df is not None:
            return self._sanitizeColumns(self._df.columns)
        else:
            return self._pf.metadata.schema.names

    def _sanitizeColumns(self, columns):
        return [c for c in columns if c in self.columnIndex]

    def toDataFrame(self, columns=None):
        """Get table (or specified columns) as a pandas DataFrame

        Parameters
        ----------
        columns : list, optional
            Desired columns. If `None`, then all columns will be
            returned.
        """
        if self._pf is None:
            if columns is None:
                return self._df
            else:
                return self._df[columns]

        if columns is None:
            return self._pf.read().to_pandas()

        df = self._pf.read(columns=columns, use_pandas_metadata=True).to_pandas()
        return df

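
# A minimal usage sketch (not part of the original module; the file name is
# illustrative): wrap an in-memory DataFrame, write it to disk, and read a
# selected column back.
#
#     parq = ParquetTable(dataFrame=pd.DataFrame({'ra': [1.0], 'dec': [2.0]}))
#     parq.write('table.parq')
#     df = ParquetTable(filename='table.parq').toDataFrame(columns=['ra'])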


class MultilevelParquetTable(ParquetTable):
    """Wrapper to access dataframe with multi-level column index from Parquet

    This subclass of `ParquetTable` is necessary to handle the multi-level
    index because there is not a convenient way to request specific table
    subsets by level via Parquet through pyarrow, as there is with a
    `pandas.DataFrame`.

    Additionally, pyarrow stores multilevel index information in a very strange
    way. Pandas stores it as a tuple, so that one can access a single column
    from a pandas dataframe as `df[('ref', 'HSC-G', 'coord_ra')]`. However, for
    some reason pyarrow saves these indices as "stringified" tuples, such that
    in order to read this same column from a table written to Parquet, you
    would have to do the following:

        pf = pyarrow.ParquetFile(filename)
        df = pf.read(columns=["('ref', 'HSC-G', 'coord_ra')"])

    See also https://github.com/apache/arrow/issues/1771, where we've raised
    this issue.

    As multilevel-indexed dataframes can be very useful to store data like
    multiple filters' worth of data in the same table, this case deserves a
    wrapper to enable easier access; that's what this object is for.
    For example,

        parq = MultilevelParquetTable(filename)
        columnDict = {'dataset':'meas',
                      'filter':'HSC-G',
                      'column':['coord_ra', 'coord_dec']}
        df = parq.toDataFrame(columns=columnDict)

    will return just the coordinate columns; the equivalent of calling
    `df['meas']['HSC-G'][['coord_ra', 'coord_dec']]` on the total dataframe,
    but without having to load the whole frame into memory---this reads just
    those columns from disk. You can also request a sub-table; e.g.,

        parq = MultilevelParquetTable(filename)
        columnDict = {'dataset':'meas',
                      'filter':'HSC-G'}
        df = parq.toDataFrame(columns=columnDict)

    and this will be the equivalent of `df['meas']['HSC-G']` on the total
    dataframe.

    Parameters
    ----------
    filename : str, optional
        Path to Parquet file.
    dataFrame : dataFrame, optional
        Pandas DataFrame to wrap.
    """
    def __init__(self, *args, **kwargs):
        super(MultilevelParquetTable, self).__init__(*args, **kwargs)

        self._columnLevelNames = None

    @property
    def columnLevelNames(self):
        """Dictionary mapping level name to the entries in that level."""
        if self._columnLevelNames is None:
            self._columnLevelNames = {level: list(np.unique(np.array(self.columns)[:, i]))
                                      for i, level in enumerate(self.columnLevels)}
        return self._columnLevelNames

    @property
    def columnLevels(self):
        """Names of levels in column index
        """
        return self.columnIndex.names

    def _getColumnIndex(self):
        if self._df is not None:
            return super()._getColumnIndex()
        else:
            levelNames = [f['name'] for f in self.pandasMd['column_indexes']]
            return pd.MultiIndex.from_tuples(self.columns, names=levelNames)

    def _getColumns(self):
        if self._df is not None:
            return super()._getColumns()
        else:
            columns = self._pf.metadata.schema.names
            n = len(self.pandasMd['column_indexes'])
            pattern = re.compile(', '.join(["'(.*)'"] * n))
            matches = [re.search(pattern, c) for c in columns]
            return [m.groups() for m in matches if m is not None]

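    # For illustration (hypothetical two-level example): pyarrow stores the
    # pandas column ('meas', 'coord_ra') under the stringified name
    # "('meas', 'coord_ra')". With n = 2 the pattern above is "'(.*)', '(.*)'",
    # so re.search recovers the tuple ('meas', 'coord_ra') from that string.
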
    def toDataFrame(self, columns=None, droplevels=True):
        """Get table (or specified columns) as a pandas DataFrame

        To get specific columns in specified sub-levels:

            parq = MultilevelParquetTable(filename)
            columnDict = {'dataset':'meas',
                          'filter':'HSC-G',
                          'column':['coord_ra', 'coord_dec']}
            df = parq.toDataFrame(columns=columnDict)

        Or, to get an entire subtable, leave out one level name:

            parq = MultilevelParquetTable(filename)
            columnDict = {'dataset':'meas',
                          'filter':'HSC-G'}
            df = parq.toDataFrame(columns=columnDict)

        Parameters
        ----------
        columns : list or dict, optional
            Desired columns. If `None`, then all columns will be
            returned. If a list, then the names of the columns must
            be *exactly* as stored by pyarrow; that is, stringified tuples.
            If a dictionary, then the entries of the dictionary must
            correspond to the level names of the column multi-index
            (that is, the `columnLevels` attribute). Not every level
            must be passed; if any level is left out, then all entries
            in that level will be implicitly included.
        droplevels : bool
            If True, drop levels of the column index that have just one entry.
        """
        if columns is None:
            if self._pf is None:
                return self._df
            else:
                return self._pf.read().to_pandas()

        if isinstance(columns, dict):
            columns = self._colsFromDict(columns)

        if self._pf is None:
            try:
                df = self._df[columns]
            except (AttributeError, KeyError):
                newColumns = [c for c in columns if c in self.columnIndex]
                if not newColumns:
                    raise ValueError('None of the requested columns ({}) are available!'.format(columns))
                # Retry with only the columns that are actually available
                df = self._df[newColumns]
        else:
            pfColumns = self._stringify(columns)
            try:
                df = self._pf.read(columns=pfColumns, use_pandas_metadata=True).to_pandas()
            except (AttributeError, KeyError):
                newColumns = [c for c in columns if c in self.columnIndex]
                if not newColumns:
                    raise ValueError('None of the requested columns ({}) are available!'.format(columns))
                pfColumns = self._stringify(newColumns)
                df = self._pf.read(columns=pfColumns, use_pandas_metadata=True).to_pandas()

        if droplevels:
            # Drop levels of column index that have just one entry
            levelsToDrop = [n for l, n in zip(df.columns.levels, df.columns.names)
                            if len(l) == 1]

            # Prevent error when trying to drop *all* columns
            if len(levelsToDrop) == len(df.columns.names):
                levelsToDrop.remove(df.columns.names[-1])

            df.columns = df.columns.droplevel(levelsToDrop)

        return df

    def _colsFromDict(self, colDict):
        new_colDict = {}
        for i, l in enumerate(self.columnLevels):
            if l in colDict:
                if isinstance(colDict[l], str):
                    new_colDict[l] = [colDict[l]]
                else:
                    new_colDict[l] = colDict[l]
            else:
                new_colDict[l] = self.columnIndex.levels[i]

        levelCols = [new_colDict[l] for l in self.columnLevels]
        cols = product(*levelCols)
        return list(cols)

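    # For illustration: with columnLevels ('dataset', 'filter', 'column'), the
    # dict {'dataset': 'meas', 'column': ['coord_ra']} expands to the Cartesian
    # product of ['meas'], every filter in the index, and ['coord_ra'].
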
    def _stringify(self, cols):
        return [str(c) for c in cols]
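
# A minimal, runnable sketch (not part of the original module) exercising the
# DataFrame-backed path of MultilevelParquetTable. The ('dataset', 'filter',
# 'column') level names and HSC filter labels are illustrative values borrowed
# from the docstrings above.
if __name__ == "__main__":
    levels = pd.MultiIndex.from_tuples(
        [('meas', 'HSC-G', 'coord_ra'), ('meas', 'HSC-G', 'coord_dec'),
         ('meas', 'HSC-R', 'coord_ra'), ('meas', 'HSC-R', 'coord_dec')],
        names=('dataset', 'filter', 'column'))
    demoDf = pd.DataFrame(np.random.rand(3, 4), columns=levels)

    parq = MultilevelParquetTable(dataFrame=demoDf)
    # Select only the HSC-G columns; levels not named in the dict
    # ('dataset' and 'column' here) are implicitly included in full.
    sub = parq.toDataFrame(columns={'filter': 'HSC-G'})
    print(sub.columns.tolist())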