Coverage for python/lsst/daf/butler/delegates/dataframe.py: 23%
46 statements
« prev ^ index » next coverage.py v7.5.0, created at 2024-04-27 03:00 -0700
« prev ^ index » next coverage.py v7.5.0, created at 2024-04-27 03:00 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Support for reading DataFrames."""
29from __future__ import annotations
31import collections.abc
32from collections.abc import Mapping
33from typing import Any
35import pandas
36from lsst.daf.butler import StorageClassDelegate
37from lsst.daf.butler.formatters.parquet import DataFrameSchema
38from lsst.utils.introspection import get_full_type_name
39from lsst.utils.iteration import ensure_iterable
41from ..formatters.parquet import _standardize_multi_index_columns
43__all__ = ["DataFrameDelegate"]
class DataFrameDelegate(StorageClassDelegate):
    """Delegate that understands the ``DataFrame`` storage class."""

    def getComponent(self, composite: pandas.DataFrame, componentName: str) -> Any:
        """Get a component from a DataFrame.

        Parameters
        ----------
        composite : `~pandas.DataFrame`
            ``DataFrame`` to access component.
        componentName : `str`
            Name of component to retrieve. The names handled here are
            ``columns``, ``rowcount`` and ``schema``.

        Returns
        -------
        component : `object`
            The component.

        Raises
        ------
        AttributeError
            The component can not be found.
        """
        if componentName == "columns":
            if isinstance(composite.columns, pandas.MultiIndex):
                # Multi-index columns are returned as they are.
                return composite.columns
            else:
                return pandas.Index(self._getAllColumns(composite))
        elif componentName == "rowcount":
            return len(composite)
        elif componentName == "schema":
            # Only a zero-row slice of the frame is handed to the schema.
            return DataFrameSchema(composite.iloc[:0])
        else:
            raise AttributeError(
                f"Do not know how to retrieve component {componentName} from {get_full_type_name(composite)}"
            )

    def handleParameters(
        self, inMemoryDataset: pandas.DataFrame, parameters: Mapping[str, Any] | None = None
    ) -> Any:
        """Return possibly new in-memory dataset using the supplied parameters.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to modify based on the parameters.
        parameters : `dict`, optional
            Parameters to apply. Values are specific to the parameter.
            Supported parameters are defined in the associated
            `StorageClass`. If no relevant parameters are specified the
            ``inMemoryDataset`` will be returned unchanged.

        Returns
        -------
        inMemoryDataset : `object`
            Original in-memory dataset, or updated form after parameters
            have been used.

        Raises
        ------
        ValueError
            Raised if ``inMemoryDataset`` is not a `~pandas.DataFrame`, or
            if a requested column name is not recognized.
        NotImplementedError
            Raised if the ``columns`` parameter is not iterable, or if a
            requested column name is not a string (for a dataframe without
            multi-index columns).
        """
        if not isinstance(inMemoryDataset, pandas.DataFrame):
            raise ValueError(
                "handleParameters for a DataFrame must get a DataFrame, "
                f"not {get_full_type_name(inMemoryDataset)}."
            )

        if parameters is None or "columns" not in parameters:
            return inMemoryDataset

        columnsParameter = parameters["columns"]
        if not isinstance(columnsParameter, collections.abc.Iterable):
            raise NotImplementedError(
                "InMemoryDataset of a DataFrame only supports list/tuple of string column names"
            )

        if isinstance(inMemoryDataset.columns, pandas.MultiIndex):
            # We have a multi-index dataframe which needs special handling.
            readColumns = _standardize_multi_index_columns(
                inMemoryDataset.columns,
                columnsParameter,
                stringify=False,
            )
        else:
            # The parameter may be any iterable, including a one-shot
            # iterator or generator. Materialize it exactly once so that
            # validation and selection see the same names; iterating the
            # parameter a second time would find it exhausted and silently
            # select no columns.
            requestedColumns = list(ensure_iterable(columnsParameter))

            allColumns = self._getAllColumns(inMemoryDataset)
            for column in requestedColumns:
                if not isinstance(column, str):
                    raise NotImplementedError(
                        "InMemoryDataset of a DataFrame only supports string column names."
                    )
                if column not in allColumns:
                    raise ValueError(f"Unrecognized column name {column!r}.")

            # Exclude index columns from the subset.
            indexNames = inMemoryDataset.index.names
            readColumns = [name for name in requestedColumns if name not in indexNames]

        # Ensure uniqueness, keeping order.
        readColumns = list(dict.fromkeys(readColumns))

        return inMemoryDataset[readColumns]

    def _getAllColumns(self, inMemoryDataset: pandas.DataFrame) -> list[str]:
        """Get all columns, including index columns.

        Parameters
        ----------
        inMemoryDataset : `~pandas.DataFrame`
            Dataframe whose column and index names are collected.

        Returns
        -------
        columns : `list` [`str`]
            List of all columns.
        """
        allColumns = list(inMemoryDataset.columns)
        # Index names are appended only when the first index level is named.
        if inMemoryDataset.index.names[0] is not None:
            allColumns.extend(inMemoryDataset.index.names)

        return allColumns