Coverage for python/lsst/daf/butler/formatters/parquetFormatter.py : 96%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ("ParquetFormatter",)

import json
import re
import collections.abc
import itertools
from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Tuple,
    Union,
)

import pyarrow.parquet as pq
import pandas as pd
import pyarrow as pa

from lsst.daf.butler.core.utils import iterable
from lsst.daf.butler import Formatter


class _ParquetLoader:
    """Helper class for loading Parquet files into `pandas.DataFrame`
    instances.

    Parameters
    ----------
    path : `str`
        Full path to the file to be loaded.
    """

    def __init__(self, path: str):
        self.file = pq.ParquetFile(path)
        # Pandas serializes its schema as JSON under the "pandas" key of
        # the Parquet file metadata.
        self.md = json.loads(self.file.metadata.metadata[b"pandas"])
        indexes = self.md["column_indexes"]
        if len(indexes) == 1:
            # A single column index maps to a flat `pd.Index`; names
            # starting with "__" are internal (e.g. serialized row indexes).
            self.columns = pd.Index(name for name in self.file.metadata.schema.names
                                    if not name.startswith("__"))
        else:
            # Multiple column indexes mean the columns form a
            # `pd.MultiIndex`, flattened to strings on disk.
            raw_columns = list(self._splitColumnNames(len(indexes), self.file.metadata.schema.names))
            self.columns = pd.MultiIndex.from_tuples(raw_columns, names=[f["name"] for f in indexes])
        self.indexLevelNames = tuple(self.columns.names)

    @staticmethod
    def _splitColumnNames(n: int, names: Iterable[str]) -> Iterator[Tuple[str, ...]]:
        """Split a string that represents a multi-index column.

        PyArrow maps Pandas' multi-index column names (which are tuples
        in Python) to flat strings on disk. This routine exists to
        reconstruct the original tuple.

        Parameters
        ----------
        n : `int`
            Number of levels in the `pd.MultiIndex` that is being
            reconstructed.
        names : `~collections.abc.Iterable` of `str`
            Strings to be split.

        Yields
        ------
        tuple : `tuple` of `str`
            A multi-index column name tuple.
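
        Examples
        --------
        Illustrative only; the flat names below show the form PyArrow
        writes for a two-level `pd.MultiIndex`:

        >>> list(_ParquetLoader._splitColumnNames(
        ...     2, ["('g', 'ra')", "('g', 'dec')", "__index_level_0__"]))
        [('g', 'ra'), ('g', 'dec')]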
89 """
        pattern = re.compile(r"\({}\)".format(', '.join(["'(.*)'"] * n)))
        for name in names:
            m = re.search(pattern, name)
            if m is not None:
                yield m.groups()

    def _standardizeColumnParameter(self, columns: Dict[str, Union[str, List[str]]]) -> Iterator[str]:
        """Transform a dictionary that indexes into a multi-index column
        set into the flat column-name strings understood by PyArrow.

        Parameters
        ----------
        columns : `dict`
            Dictionary whose keys are string multi-index level names
            and whose values are the value or values (as a list) to
            select for that level.

        Yields
        ------
        name : `str`
            Stringified tuple representing a multi-index column name.
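
        Examples
        --------
        A sketch, assuming a loader whose index level names are
        ``("filter", "column")`` with level values ``["g", "r"]`` and
        ``["ra", "dec"]`` (the names are illustrative, not part of the
        format):

        >>> list(loader._standardizeColumnParameter({"filter": "g"}))
        ["('g', 'ra')", "('g', 'dec')"]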
111 """
        if not isinstance(columns, collections.abc.Mapping):
            raise ValueError("columns parameter for multi-index data frame must be a dictionary.")
        if not set(self.indexLevelNames).issuperset(columns.keys()):
            raise ValueError(f"Cannot use dict with keys {set(columns.keys())} "
                             f"to select columns from {self.indexLevelNames}.")
        factors = [iterable(columns.get(level, self.columns.levels[i]))
                   for i, level in enumerate(self.indexLevelNames)]
        for requested in itertools.product(*factors):
            for i, value in enumerate(requested):
                if value not in self.columns.levels[i]:
                    raise ValueError(f"Unrecognized value {value!r} for index {self.indexLevelNames[i]!r}.")
            yield str(requested)

    def read(self, columns: Optional[Union[str, List[str], Dict[str, Union[str, List[str]]]]] = None
             ) -> pd.DataFrame:
        """Read some or all of the Parquet file into a `pandas.DataFrame`
        instance.

        Parameters
        ----------
        columns : `dict`, `list`, or `str`, optional
            A description of the columns to be loaded. See
            :ref:`lsst.daf.butler-concrete_storage_classes_dataframe`.

        Returns
        -------
        df : `pandas.DataFrame`
            A Pandas DataFrame.
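
        Examples
        --------
        Hypothetical selections (the column names are illustrative):

        >>> df = loader.read(columns=["ra", "dec"])    # flat column index
        >>> df = loader.read(columns={"filter": "g"})  # multi-index columns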
140 """
        if columns is None:
            return self.file.read(use_pandas_metadata=True).to_pandas()
        elif isinstance(self.columns, pd.MultiIndex):
            columns = list(self._standardizeColumnParameter(columns))
        else:
            for column in columns:
                if column not in self.columns:
                    raise ValueError(f"Unrecognized column name {column!r}.")
        return self.file.read(columns=columns, use_pandas_metadata=True).to_pandas()


def _writeParquet(path: str, inMemoryDataset: pd.DataFrame):
    """Write a `pandas.DataFrame` instance as a Parquet file.
154 """
    # The conversion records the pandas schema in the Parquet metadata,
    # which `_ParquetLoader` relies on when reading the file back.
    table = pa.Table.from_pandas(inMemoryDataset)
    pq.write_table(table, path, compression='none')


class ParquetFormatter(Formatter):
    """Interface for reading and writing Pandas DataFrames to and from
    Parquet files.

    This formatter is for the
    :ref:`lsst.daf.butler-concrete_storage_classes_dataframe` StorageClass.
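
    Examples
    --------
    A sketch of selecting columns at ``get`` time through the Butler;
    the dataset type and data ID here are hypothetical:

    >>> df = butler.get("sourceTable", dataId,
    ...                 parameters={"columns": {"filter": "g"}})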
165 """

    extension = ".parq"

    def read(self, component: Optional[str] = None) -> object:
        # Docstring inherited from Formatter.read.
        loader = _ParquetLoader(self.fileDescriptor.location.path)
        if component == 'columns':
            return loader.columns

        if not self.fileDescriptor.parameters:
            return loader.read()

        return loader.read(**self.fileDescriptor.parameters)

    def write(self, inMemoryDataset: Any) -> str:
        # Docstring inherited from Formatter.write.
        location = self.makeUpdatedLocation(self.fileDescriptor.location)
        _writeParquet(location.path, inMemoryDataset)
        return location.pathInStore