Coverage for python/lsst/daf/butler/formatters/parquet.py : 96%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("ParquetFormatter", )

import json
import re
import collections.abc
import itertools
from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Tuple,
    Union,
)

import pyarrow.parquet as pq
import pandas as pd
import pyarrow as pa

from lsst.daf.butler.core.utils import iterable
from lsst.daf.butler import Formatter


class _ParquetLoader:
    """Helper class for loading Parquet files into `pandas.DataFrame`
    instances.

    Parameters
    ----------
    path : `str`
        Full path to the file to be loaded.
    """

    def __init__(self, path: str):
        self.file = pq.ParquetFile(path)
        self.md = json.loads(self.file.metadata.metadata[b"pandas"])
        indexes = self.md["column_indexes"]
        if len(indexes) == 1:
            self.columns = pd.Index(name for name in self.file.metadata.schema.names
                                    if not name.startswith("__"))
        else:
            raw_columns = list(self._splitColumnNames(len(indexes), self.file.metadata.schema.names))
            self.columns = pd.MultiIndex.from_tuples(raw_columns, names=[f["name"] for f in indexes])
        self.indexLevelNames = tuple(self.columns.names)
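
    # Illustrative note (not part of the original source): the b"pandas" block
    # written by PyArrow is a JSON document of roughly the form
    #
    #     {"index_columns": [...], "column_indexes": [{"name": ..., ...}, ...],
    #      "columns": [...], "pandas_version": ...}
    #
    # so a single entry in "column_indexes" indicates a flat column index,
    # while multiple entries indicate a pandas MultiIndex.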

    @staticmethod
    def _splitColumnNames(n: int, names: Iterable[str]) -> Iterator[Tuple[str, ...]]:
        """Split a string that represents a multi-index column.

        PyArrow maps Pandas' multi-index column names (which are tuples in
        Python) to flat strings on disk. This routine exists to
        reconstruct the original tuple.

        Parameters
        ----------
        n : `int`
            Number of levels in the `pd.MultiIndex` that is being
            reconstructed.
        names : `~collections.abc.Iterable` of `str`
            Strings to be split.

        Yields
        ------
        tuple : `tuple` of `str`
            A multi-index column name tuple.
        """
        pattern = re.compile(r"\({}\)".format(', '.join(["'(.*)'"] * n)))
        for name in names:
            m = re.search(pattern, name)
            if m is not None:
                yield m.groups()
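
    # Illustrative sketch (not part of the original source): for n = 2 the
    # pattern above becomes r"\('(.*)', '(.*)'\)", so flattened names such as
    # "('g', 'ra')" (hypothetical level values) split back into tuples, e.g.
    #
    #     >>> list(_ParquetLoader._splitColumnNames(2, ["('g', 'ra')", "('g', 'dec')"]))
    #     [('g', 'ra'), ('g', 'dec')]
    #
    # Names that do not match the pattern, such as the serialized index column
    # "__index_level_0__", are silently skipped.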

    def _standardizeColumnParameter(self, columns: Dict[str, Union[str, List[str]]]) -> Iterator[str]:
        """Transform a dictionary of multi-index level selections into the
        stringified column names understood by PyArrow.

        Parameters
        ----------
        columns : `dict`
            Dictionary whose keys are string multi-index level names
            and whose values are the value or values (as a list) for that
            level.

        Yields
        ------
        name : `str`
            Stringified tuple representing a multi-index column name.
        """
        if not isinstance(columns, collections.abc.Mapping):
            raise ValueError("columns parameter for multi-index data frame must be a dictionary.")
        if not set(self.indexLevelNames).issuperset(columns.keys()):
            raise ValueError(f"Cannot use dict with keys {set(columns.keys())} "
                             f"to select columns from {self.indexLevelNames}.")
        factors = [iterable(columns.get(level, self.columns.levels[i]))
                   for i, level in enumerate(self.indexLevelNames)]
        for requested in itertools.product(*factors):
            for i, value in enumerate(requested):
                if value not in self.columns.levels[i]:
                    raise ValueError(f"Unrecognized value {value!r} for index {self.indexLevelNames[i]!r}.")
            yield str(requested)
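
    # Illustrative sketch (not part of the original source): assuming
    # hypothetical level names ("band", "column") with levels ["g", "r"] and
    # ["ra", "dec"], the parameter {"band": "g"} expands via itertools.product
    # to the PyArrow column names "('g', 'ra')" and "('g', 'dec')"; any level
    # omitted from the dictionary defaults to all of its values.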

    def read(self, columns: Optional[Union[str, List[str], Dict[str, Union[str, List[str]]]]] = None
             ) -> pd.DataFrame:
        """Read some or all of the Parquet file into a `pandas.DataFrame`
        instance.

        Parameters
        ----------
        columns : `dict`, `list`, or `str`, optional
            A description of the columns to be loaded. See
            :ref:`lsst.daf.butler-concrete_storage_classes_dataframe`.

        Returns
        -------
        df : `pandas.DataFrame`
            A Pandas DataFrame.
        """
        if columns is None:
            return self.file.read(use_pandas_metadata=True).to_pandas()
        elif isinstance(self.columns, pd.MultiIndex):
            columns = list(self._standardizeColumnParameter(columns))
        else:
            for column in columns:
                if column not in self.columns:
                    raise ValueError(f"Unrecognized column name {column!r}.")
        return self.file.read(columns=columns, use_pandas_metadata=True).to_pandas()
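
    # Illustrative usage sketch (not part of the original source): for a flat
    # column index one would pass a list of names, e.g.
    # loader.read(columns=["ra", "dec"]), while for a multi-index frame the
    # selection is a dictionary keyed by level name, e.g.
    # loader.read(columns={"band": ["g", "r"], "column": "ra"}); the level and
    # column names here are hypothetical.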


def _writeParquet(path: str, inMemoryDataset: pd.DataFrame) -> None:
    """Write a `pandas.DataFrame` instance as a Parquet file.
    """
    table = pa.Table.from_pandas(inMemoryDataset)
    pq.write_table(table, path, compression='none')
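
# A minimal round-trip sketch (not part of the original source), assuming a
# writable temporary path:
#
#     df = pd.DataFrame({"ra": [1.0, 2.0], "dec": [3.0, 4.0]})
#     _writeParquet("/tmp/example.parq", df)
#     assert _ParquetLoader("/tmp/example.parq").read().equals(df)
#
# pa.Table.from_pandas embeds the frame's index and column structure in the
# file's "pandas" metadata, which use_pandas_metadata=True restores on read.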


class ParquetFormatter(Formatter):
    """Interface for reading and writing Pandas DataFrames to and from Parquet
    files.

    This formatter is for the
    :ref:`lsst.daf.butler-concrete_storage_classes_dataframe` StorageClass.
    """
    extension = ".parq"

    def read(self, component: Optional[str] = None) -> Any:
        # Docstring inherited from Formatter.read.
        loader = _ParquetLoader(self.fileDescriptor.location.path)
        if component == 'columns':
            return loader.columns
        if not self.fileDescriptor.parameters:
            return loader.read()
        return loader.read(**self.fileDescriptor.parameters)

    def write(self, inMemoryDataset: Any) -> str:
        # Docstring inherited from Formatter.write.
        location = self.makeUpdatedLocation(self.fileDescriptor.location)
        _writeParquet(location.path, inMemoryDataset)
        return location.pathInStore
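

# Illustrative usage sketch (not part of the original source): in normal use
# this formatter is invoked by the Butler rather than instantiated directly,
# and the ``parameters`` argument of ``Butler.get`` reaches
# ``ParquetFormatter.read`` via ``self.fileDescriptor.parameters``. The
# repository path, dataset type name, and data ID below are hypothetical.
#
#     from lsst.daf.butler import Butler
#
#     butler = Butler("/path/to/repo")
#     columns = butler.get("objectTable.columns", dataId={"tract": 0, "patch": 1})
#     subset = butler.get("objectTable", dataId={"tract": 0, "patch": 1},
#                         parameters={"columns": ["ra", "dec"]})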