Coverage for python/lsst/daf/butler/delegates/dataframe.py: 23%
46 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-25 15:14 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-25 15:14 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Support for reading DataFrames."""
23from __future__ import annotations
25import collections.abc
26from collections.abc import Mapping
27from typing import Any
29import pandas
30from lsst.daf.butler import StorageClassDelegate
31from lsst.daf.butler.formatters.parquet import DataFrameSchema
32from lsst.utils.introspection import get_full_type_name
33from lsst.utils.iteration import ensure_iterable
35from ..formatters.parquet import _standardize_multi_index_columns
37__all__ = ["DataFrameDelegate"]
class DataFrameDelegate(StorageClassDelegate):
    """Delegate that understands the ``DataFrame`` storage class."""

    def getComponent(self, composite: pandas.DataFrame, componentName: str) -> Any:
        """Get a component from a DataFrame.

        Parameters
        ----------
        composite : `~pandas.DataFrame`
            ``DataFrame`` to access component.
        componentName : `str`
            Name of component to retrieve. Supported names are
            ``"columns"``, ``"rowcount"`` and ``"schema"``.

        Returns
        -------
        component : `object`
            The component.

        Raises
        ------
        AttributeError
            The component can not be found.
        """
        if componentName == "columns":
            # A column MultiIndex is handed back untouched; for flat columns
            # the named index columns are reported alongside the data
            # columns.
            if isinstance(composite.columns, pandas.MultiIndex):
                return composite.columns
            return pandas.Index(self._getAllColumns(composite))
        if componentName == "rowcount":
            return len(composite)
        if componentName == "schema":
            # A zero-row slice is passed so the schema is built without
            # handing over any row data.
            return DataFrameSchema(composite.iloc[:0])
        raise AttributeError(
            f"Do not know how to retrieve component {componentName} from {get_full_type_name(composite)}"
        )

    def handleParameters(
        self, inMemoryDataset: pandas.DataFrame, parameters: Mapping[str, Any] | None = None
    ) -> Any:
        """Return possibly new in-memory dataset using the supplied parameters.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to modify based on the parameters.
        parameters : `dict`, optional
            Parameters to apply. Values are specific to the parameter.
            Supported parameters are defined in the associated
            `StorageClass`. If no relevant parameters are specified the
            ``inMemoryDataset`` will be returned unchanged.

        Returns
        -------
        inMemoryDataset : `object`
            Original in-memory dataset, or updated form after parameters
            have been used.

        Raises
        ------
        ValueError
            Raised if ``inMemoryDataset`` is not a `~pandas.DataFrame`, or
            if a requested column name is not present in the data frame.
        NotImplementedError
            Raised if the ``columns`` parameter is not iterable, or if a
            requested column name is not a string (for data frames without
            multi-index columns).
        """
        if not isinstance(inMemoryDataset, pandas.DataFrame):
            raise ValueError(
                "handleParameters for a DataFrame must get a DataFrame, "
                f"not {get_full_type_name(inMemoryDataset)}."
            )

        # "columns" is the only parameter acted on here; anything else
        # leaves the dataset untouched.
        if parameters is None or "columns" not in parameters:
            return inMemoryDataset

        requested = parameters["columns"]
        if not isinstance(requested, collections.abc.Iterable):
            raise NotImplementedError(
                "InMemoryDataset of a DataFrame only supports list/tuple of string column names"
            )

        if isinstance(inMemoryDataset.columns, pandas.MultiIndex):
            # We have a multi-index dataframe which needs special handling;
            # the parquet formatter helper turns the request into column
            # keys.
            readColumns = _standardize_multi_index_columns(
                inMemoryDataset.columns,
                requested,
                stringify=False,
            )
        else:
            # Materialize the request exactly once. It is walked twice below
            # (validation, then selection) and a one-shot iterable such as a
            # generator would otherwise be empty on the second pass,
            # silently producing a data frame with no columns.
            requestedColumns = list(ensure_iterable(requested))

            # Set membership keeps validation linear for wide tables.
            knownColumns = set(self._getAllColumns(inMemoryDataset))
            for column in requestedColumns:
                if not isinstance(column, str):
                    raise NotImplementedError(
                        "InMemoryDataset of a DataFrame only supports string column names."
                    )
                if column not in knownColumns:
                    raise ValueError(f"Unrecognized column name {column!r}.")

            # Exclude index columns from the subset.
            indexNames = inMemoryDataset.index.names
            readColumns = [name for name in requestedColumns if name not in indexNames]

        # Ensure uniqueness, keeping order.
        readColumns = list(dict.fromkeys(readColumns))

        return inMemoryDataset[readColumns]

    def _getAllColumns(self, inMemoryDataset: pandas.DataFrame) -> list[str]:
        """Get all columns, including index columns.

        Parameters
        ----------
        inMemoryDataset : `~pandas.DataFrame`
            Data frame whose column and index names are collected.

        Returns
        -------
        columns : `list` [`str`]
            List of all columns.
        """
        allColumns = list(inMemoryDataset.columns)
        # Only the first index level name is inspected to decide whether the
        # index is named; when it is, every index level name is appended.
        if inMemoryDataset.index.names[0] is not None:
            allColumns.extend(inMemoryDataset.index.names)

        return allColumns