Coverage for python/lsst/daf/butler/formatters/parquet.py: 95%
70 statements
coverage.py v6.5.0, created at 2022-12-01 19:54 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("ParquetFormatter",)

import json
import re
import collections.abc
import itertools
from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Sequence,
    Union,
)

import pyarrow.parquet as pq
import pandas as pd
import pyarrow as pa

from lsst.daf.butler.core.utils import iterable
from lsst.daf.butler import Formatter


class _ParquetLoader:
    """Helper class for loading Parquet files into `pandas.DataFrame`
    instances.

    Parameters
    ----------
    path : `str`
        Full path to the file to be loaded.
    """

    def __init__(self, path: str):
        self.file = pq.ParquetFile(path)
        self.md = json.loads(self.file.metadata.metadata[b"pandas"])
        indexes = self.md["column_indexes"]
        # Schema names starting with "__" hold the serialized row index
        # (e.g. "__index_level_0__"), not data columns.
        if len(indexes) <= 1:
            self.columns = pd.Index(
                name for name in self.file.metadata.schema.names if not name.startswith("__")
            )
        else:
            raw_columns = list(self._splitColumnnNames(len(indexes), self.file.metadata.schema.names))
            self.columns = pd.MultiIndex.from_tuples(raw_columns, names=[f["name"] for f in indexes])
        self.indexLevelNames = tuple(self.columns.names)

    @staticmethod
    def _splitColumnnNames(n: int, names: Iterable[str]) -> Iterator[Sequence[str]]:
        """Split a string that represents a multi-index column.

        PyArrow maps Pandas' multi-index column names (which are tuples in
        Python) to flat strings on disk.  This routine exists to
        reconstruct the original tuple.

        Parameters
        ----------
        n : `int`
            Number of levels in the `pd.MultiIndex` that is being
            reconstructed.
        names : `~collections.abc.Iterable` of `str`
            Strings to be split.

        Yields
        ------
        tuple : `tuple` of `str`
            A multi-index column name tuple.
        """
        pattern = re.compile(r"\({}\)".format(", ".join(["'(.*)'"] * n)))
        for name in names:
            m = re.search(pattern, name)
            if m is not None:
                yield m.groups()
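
    # Illustrative note (not in the original file): the pattern above matches
    # flattened names of the form "('g', 'flux')", so
    # _splitColumnnNames(2, ["('g', 'flux')", "__index_level_0__"]) yields the
    # single tuple ("g", "flux"); names that do not look like stringified
    # tuples are skipped.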

    def _standardizeColumnParameter(
        self, columns: Union[List[str], List[tuple], Dict[str, Union[str, List[str]]]]
    ) -> Iterator[str]:
        """Transform a dictionary or list that selects from a multi-index
        column into strings directly understandable by PyArrow.

        Parameters
        ----------
        columns : `dict`
            Dictionary whose keys are string multi-index level names
            and whose values are the value or values (as a list) for that
            level.

        Yields
        ------
        name : `str`
            Stringified tuple representing a multi-index column name.
        """
        if isinstance(columns, list):
            for requested in columns:
                if not isinstance(requested, tuple):
                    raise ValueError("columns parameter for multi-index data frame "
                                     "must be either a dictionary or list of tuples.")
                yield str(requested)
        else:
            if not isinstance(columns, collections.abc.Mapping):  # coverage: never true in this run
                raise ValueError("columns parameter for multi-index data frame "
                                 "must be either a dictionary or list of tuples.")
            if not set(self.indexLevelNames).issuperset(columns.keys()):  # coverage: never true in this run
                raise ValueError(f"Cannot use dict with keys {set(columns.keys())} "
                                 f"to select columns from {self.indexLevelNames}.")
            factors = [iterable(columns.get(level, self.columns.levels[i]))
                       for i, level in enumerate(self.indexLevelNames)]
            for requested in itertools.product(*factors):
                for i, value in enumerate(requested):
                    if value not in self.columns.levels[i]:  # coverage: never true in this run
                        raise ValueError(f"Unrecognized value {value!r} "
                                         f"for index {self.indexLevelNames[i]!r}.")
                yield str(requested)
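
    # Illustrative note (not in the original file): with index level names
    # ("band", "quantity") and level values {"g", "r"} and {"flux", "fluxErr"},
    # the dict form {"band": "g"} expands via itertools.product to the strings
    # "('g', 'flux')" and "('g', 'fluxErr')", i.e. the flattened column names
    # PyArrow can select directly.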

    def read(self, columns: Optional[Union[str, List[str], List[tuple],
                                           Dict[str, Union[str, List[str]]]]] = None) -> pd.DataFrame:
        """Read some or all of the Parquet file into a `pandas.DataFrame`
        instance.

        Parameters
        ----------
        columns : `dict`, `list`, or `str`, optional
            A description of the columns to be loaded.  See
            :ref:`lsst.daf.butler-concrete_storage_classes_dataframe`.

        Returns
        -------
        df : `pandas.DataFrame`
            A Pandas DataFrame.
        """
        if columns is None:
            return self.file.read(use_pandas_metadata=True).to_pandas()
        elif isinstance(self.columns, pd.MultiIndex):
            assert isinstance(columns, dict) or isinstance(columns, list)
            columns = list(self._standardizeColumnParameter(columns))
        else:
            for column in columns:
                if column not in self.columns:
                    raise ValueError(f"Unrecognized column name {column!r}.")
        return self.file.read(columns=columns, use_pandas_metadata=True).to_pandas()


def _writeParquet(path: str, inMemoryDataset: pd.DataFrame) -> None:
    """Write a `pandas.DataFrame` instance as a Parquet file.
    """
    table = pa.Table.from_pandas(inMemoryDataset)
    pq.write_table(table, path, compression="none")
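

# Illustrative sketch (not part of the original file): a minimal round trip
# through the two helpers above.  The helper name below is hypothetical.
def _demoRoundTrip() -> pd.DataFrame:
    """Write a small DataFrame with `_writeParquet`, then read a single
    column back with `_ParquetLoader`.
    """
    import os
    import tempfile

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "demo.parq")
        _writeParquet(path, df)
        loader = _ParquetLoader(path)
        # Unknown column names would raise ValueError; "a" is present.
        return loader.read(columns=["a"])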


class ParquetFormatter(Formatter):
    """Interface for reading and writing Pandas DataFrames to and from Parquet
    files.

    This formatter is for the
    :ref:`lsst.daf.butler-concrete_storage_classes_dataframe` StorageClass.
    """

    extension = ".parq"

    def read(self, component: Optional[str] = None) -> Any:
        # Docstring inherited from Formatter.read.
        loader = _ParquetLoader(self.fileDescriptor.location.path)
        if component == "columns":
            return loader.columns

        if not self.fileDescriptor.parameters:
            return loader.read()

        return loader.read(**self.fileDescriptor.parameters)

    def write(self, inMemoryDataset: Any) -> None:
        # Docstring inherited from Formatter.write.
        location = self.makeUpdatedLocation(self.fileDescriptor.location)
        _writeParquet(location.path, inMemoryDataset)
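

# Illustrative sketch (not part of the original file): the same helpers with a
# two-level column MultiIndex, selecting columns with the dict form that
# `_ParquetLoader.read` accepts.  The helper name below is hypothetical, and
# this assumes PyArrow flattens MultiIndex column names to the stringified
# tuples expected by `_splitColumnnNames`.
def _demoMultiIndexSelection() -> pd.DataFrame:
    """Round-trip a DataFrame with MultiIndex columns and select one band."""
    import os
    import tempfile

    columns = pd.MultiIndex.from_product([["g", "r"], ["flux", "fluxErr"]],
                                         names=["band", "quantity"])
    df = pd.DataFrame([[1.0, 0.1, 2.0, 0.2]], columns=columns)
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "demo_multi.parq")
        _writeParquet(path, df)
        loader = _ParquetLoader(path)
        # Equivalent to requesting the tuples ("g", "flux") and ("g", "fluxErr").
        return loader.read(columns={"band": "g"})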