Coverage for python/lsst/daf/butler/delegates/dataframe.py: 22%
45 statements
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-02 09:50 +0000
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-02 09:50 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Support for reading DataFrames."""
23from __future__ import annotations
25import collections.abc
26from typing import Any, Mapping, Optional
28import pandas
29from lsst.daf.butler import StorageClassDelegate
30from lsst.daf.butler.formatters.parquet import DataFrameSchema
31from lsst.utils.introspection import get_full_type_name
32from lsst.utils.iteration import ensure_iterable
34from ..formatters.parquet import _standardize_multi_index_columns
36__all__ = ["DataFrameDelegate"]
class DataFrameDelegate(StorageClassDelegate):
    """Storage class delegate for `pandas.DataFrame` datasets.

    Provides access to the ``columns``, ``rowcount`` and ``schema``
    components of a ``DataFrame`` and applies the ``columns`` parameter
    to select a subset of columns from an in-memory ``DataFrame``.
    """

    def getComponent(self, composite: pandas.DataFrame, componentName: str) -> Any:
        """Get a component from a DataFrame.

        Parameters
        ----------
        composite : `~pandas.DataFrame`
            ``DataFrame`` to access component.
        componentName : `str`
            Name of component to retrieve. One of ``"columns"``,
            ``"rowcount"`` or ``"schema"``.

        Returns
        -------
        component : `object`
            The component.

        Raises
        ------
        AttributeError
            The component can not be found.
        """
        if componentName == "columns":
            if isinstance(composite.columns, pandas.MultiIndex):
                # Multi-index columns are returned as they are.
                return composite.columns
            else:
                # Flat columns are reported together with any named index
                # columns.
                return pandas.Index(self._getAllColumns(composite))
        elif componentName == "rowcount":
            return len(composite)
        elif componentName == "schema":
            # A zero-row slice is passed so that no row data is handed to
            # the schema object.
            return DataFrameSchema(composite.iloc[:0])
        else:
            raise AttributeError(
                f"Do not know how to retrieve component {componentName} from {get_full_type_name(composite)}"
            )

    def handleParameters(
        self, inMemoryDataset: pandas.DataFrame, parameters: Optional[Mapping[str, Any]] = None
    ) -> Any:
        """Return possibly new in-memory dataset using the supplied parameters.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to modify based on the parameters.
        parameters : `dict`, optional
            Parameters to apply. Values are specific to the parameter.
            Supported parameters are defined in the associated
            `StorageClass`. If no relevant parameters are specified the
            ``inMemoryDataset`` will be returned unchanged.

        Returns
        -------
        inMemoryDataset : `object`
            Original in-memory dataset, or updated form after parameters
            have been used.

        Raises
        ------
        ValueError
            Raised if ``inMemoryDataset`` is not a `~pandas.DataFrame` or
            if a requested column name is not recognized.
        NotImplementedError
            Raised if the ``columns`` parameter is not iterable, or if a
            requested column of a non-multi-index ``DataFrame`` is not a
            string.
        """
        if not isinstance(inMemoryDataset, pandas.DataFrame):
            raise ValueError(
                "handleParameters for a DataFrame must get a DataFrame, "
                f"not {get_full_type_name(inMemoryDataset)}."
            )

        if parameters is None:
            return inMemoryDataset

        if "columns" in parameters:
            allColumns = self._getAllColumns(inMemoryDataset)
            requested = parameters["columns"]

            if not isinstance(requested, collections.abc.Iterable):
                raise NotImplementedError(
                    "InMemoryDataset of a DataFrame only supports list/tuple of string column names"
                )

            if isinstance(inMemoryDataset.columns, pandas.MultiIndex):
                # We have a multi-index dataframe which needs special handling.
                readColumns = _standardize_multi_index_columns(
                    inMemoryDataset.columns,
                    requested,
                    stringify=False,
                )
            else:
                # Materialize the request exactly once. The names are walked
                # twice below (validation, then selection); a one-shot
                # iterable such as a generator would be exhausted by the
                # validation pass and silently select no columns.
                requestedColumns = list(ensure_iterable(requested))

                for column in requestedColumns:
                    if not isinstance(column, str):
                        raise NotImplementedError(
                            "InMemoryDataset of a DataFrame only supports string column names."
                        )
                    if column not in allColumns:
                        raise ValueError(f"Unrecognized column name {column!r}.")

                # Exclude index columns from the subset.
                indexNames = inMemoryDataset.index.names
                readColumns = [name for name in requestedColumns if name not in indexNames]

            # Ensure uniqueness, keeping order.
            readColumns = list(dict.fromkeys(readColumns))

            return inMemoryDataset[readColumns]
        else:
            return inMemoryDataset

    def _getAllColumns(self, inMemoryDataset: pandas.DataFrame) -> list[str]:
        """Get all columns, including index columns.

        Parameters
        ----------
        inMemoryDataset : `~pandas.DataFrame`
            ``DataFrame`` whose column and index names are collected.

        Returns
        -------
        columns : `list` [`str`]
            List of all columns.
        """
        allColumns = list(inMemoryDataset.columns)
        # Only the first index level name is inspected to decide whether
        # the index names are appended.
        if inMemoryDataset.index.names[0] is not None:
            allColumns.extend(inMemoryDataset.index.names)

        return allColumns