Coverage for python/lsst/daf/butler/formatters/parquet.py: 95%
70 statements
coverage.py v6.4.4, created at 2022-09-15 09:40 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("ParquetFormatter",)

import collections.abc
import itertools
import json
import re
from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Union

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from lsst.daf.butler import Formatter
from lsst.utils.iteration import ensure_iterable


class _ParquetLoader:
    """Helper class for loading Parquet files into `pandas.DataFrame`
    instances.

    Parameters
    ----------
    path : `str`
        Full path to the file to be loaded.
    """

    def __init__(self, path: str):
        self.file = pq.ParquetFile(path)
        self.md = json.loads(self.file.metadata.metadata[b"pandas"])
        indexes = self.md["column_indexes"]
        if len(indexes) <= 1:
            self.columns = pd.Index(
                name for name in self.file.metadata.schema.names if not name.startswith("__")
            )
        else:
            raw_columns = list(self._splitColumnNames(len(indexes), self.file.metadata.schema.names))
            self.columns = pd.MultiIndex.from_tuples(raw_columns, names=[f["name"] for f in indexes])
        self.indexLevelNames = tuple(self.columns.names)
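
    # Usage sketch (illustrative, not part of the module's API surface):
    # assuming "table.parq" was written by _writeParquet below,
    #
    #     loader = _ParquetLoader("table.parq")
    #     loader.columns                    # pd.Index or pd.MultiIndex
    #     loader.read()                     # whole file as a DataFrame
    #     loader.read(columns=["a", "b"])   # column subset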

    @staticmethod
    def _splitColumnNames(n: int, names: Iterable[str]) -> Iterator[Sequence[str]]:
        """Split a string that represents a multi-index column.

        PyArrow maps Pandas' multi-index column names (which are tuples in
        Python) to flat strings on disk.  This routine exists to
        reconstruct the original tuple.

        Parameters
        ----------
        n : `int`
            Number of levels in the `pd.MultiIndex` that is being
            reconstructed.
        names : `~collections.abc.Iterable` of `str`
            Strings to be split.

        Yields
        ------
        tuple : `tuple` of `str`
            A multi-index column name tuple.
        """
        pattern = re.compile(r"\({}\)".format(", ".join(["'(.*)'"] * n)))
        for name in names:
            m = re.search(pattern, name)
            if m is not None:
                yield m.groups()
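
    # Example (illustrative column names): for a two-level multi-index,
    # PyArrow stores the tuple column name ('g', 'flux') as the flat string
    # "('g', 'flux')", so
    #
    #     list(_ParquetLoader._splitColumnNames(2, ["('g', 'flux')"]))
    #
    # recovers [('g', 'flux')].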

    def _standardizeColumnParameter(
        self, columns: Union[List[str], List[tuple], Dict[str, Union[str, List[str]]]]
    ) -> Iterator[str]:
        """Transform a dictionary or list of tuples that selects multi-index
        columns into the flat strings directly understandable by PyArrow.

        Parameters
        ----------
        columns : `dict` or `list` of `tuple`
            Dictionary whose keys are string multi-index level names
            and whose values are the value or values (as a list) for that
            level, or an explicit list of multi-index column name tuples.

        Yields
        ------
        name : `str`
            Stringified tuple representing a multi-index column name.
        """
        if isinstance(columns, list):
            for requested in columns:
                if not isinstance(requested, tuple):
                    raise ValueError(
                        "columns parameter for multi-index data frame "
                        "must be either a dictionary or list of tuples."
                    )
                yield str(requested)
        else:
            if not isinstance(columns, collections.abc.Mapping):
                raise ValueError(
                    "columns parameter for multi-index data frame "
                    "must be either a dictionary or list of tuples."
                )
            if not set(self.indexLevelNames).issuperset(columns.keys()):
                raise ValueError(
                    f"Cannot use dict with keys {set(columns.keys())} "
                    f"to select columns from {self.indexLevelNames}."
                )
            factors = [
                ensure_iterable(columns.get(level, self.columns.levels[i]))
                for i, level in enumerate(self.indexLevelNames)
            ]
            for requested in itertools.product(*factors):
                for i, value in enumerate(requested):
                    if value not in self.columns.levels[i]:
                        raise ValueError(
                            f"Unrecognized value {value!r} for index {self.indexLevelNames[i]!r}."
                        )
                yield str(requested)
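
    # Example (illustrative level and value names): with
    # indexLevelNames == ("filter", "column") and level values
    # filter = ["g", "r"], column = ["flux", "fluxErr"], the dictionary form
    # {"filter": "g"} leaves "column" unconstrained and yields
    #
    #     "('g', 'flux')", "('g', 'fluxErr')"
    #
    # i.e. the same flattened names that _splitColumnNames parses back above.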

    def read(
        self, columns: Optional[Union[str, List[str], List[tuple], Dict[str, Union[str, List[str]]]]] = None
    ) -> pd.DataFrame:
        """Read some or all of the Parquet file into a `pandas.DataFrame`
        instance.

        Parameters
        ----------
        columns : `dict`, `list`, or `str`, optional
            A description of the columns to be loaded.  See
            :ref:`lsst.daf.butler-concrete_storage_classes_dataframe`.

        Returns
        -------
        df : `pandas.DataFrame`
            A Pandas DataFrame.
        """
        if columns is None:
            return self.file.read(use_pandas_metadata=True).to_pandas()
        elif isinstance(self.columns, pd.MultiIndex):
            assert isinstance(columns, (dict, list))
            columns = list(self._standardizeColumnParameter(columns))
        else:
            for column in ensure_iterable(columns):
                if column not in self.columns:
                    raise ValueError(f"Unrecognized column name {column!r}.")
        return self.file.read(columns=ensure_iterable(columns), use_pandas_metadata=True).to_pandas()
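
    # Usage sketch (hypothetical file contents): for a multi-index frame the
    # same call accepts either form, both funneled through
    # _standardizeColumnParameter above:
    #
    #     loader.read(columns={"filter": ["g", "r"], "column": "flux"})
    #     loader.read(columns=[("g", "flux"), ("r", "flux")])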


def _writeParquet(path: str, inMemoryDataset: pd.DataFrame) -> None:
    """Write a `pandas.DataFrame` instance as a Parquet file."""
    table = pa.Table.from_pandas(inMemoryDataset)
    pq.write_table(table, path)
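
# Round-trip sketch (illustrative path): pa.Table.from_pandas embeds the
# pandas schema in the Parquet metadata, which is what lets _ParquetLoader
# recover column and multi-index structure on read:
#
#     _writeParquet("/tmp/demo.parq", pd.DataFrame({"a": [1, 2]}))
#     _ParquetLoader("/tmp/demo.parq").read()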


class ParquetFormatter(Formatter):
    """Interface for reading and writing Pandas DataFrames to and from Parquet
    files.

    This formatter is for the
    :ref:`lsst.daf.butler-concrete_storage_classes_dataframe` StorageClass.
    """

    extension = ".parq"

    def read(self, component: Optional[str] = None) -> Any:
        # Docstring inherited from Formatter.read.
        loader = _ParquetLoader(self.fileDescriptor.location.path)
        if component == "columns":
            return loader.columns
        if not self.fileDescriptor.parameters:
            return loader.read()
        return loader.read(**self.fileDescriptor.parameters)

    def write(self, inMemoryDataset: Any) -> None:
        # Docstring inherited from Formatter.write.
        location = self.makeUpdatedLocation(self.fileDescriptor.location)
        _writeParquet(location.path, inMemoryDataset)
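
# Butler-level usage sketch (dataset type and data ID are hypothetical): the
# "columns" read parameter and the ".columns" component are both routed
# through ParquetFormatter.read above.
#
#     from lsst.daf.butler import Butler
#
#     butler = Butler("repo", collections=["my/run"])
#     df = butler.get("objectTable", dataId, parameters={"columns": ["ra", "dec"]})
#     cols = butler.get("objectTable.columns", dataId)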