Coverage for python/lsst/daf/butler/delegates/dataframe.py: 21%
42 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-01-11 02:31 -0800
« prev ^ index » next coverage.py v6.5.0, created at 2023-01-11 02:31 -0800
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Support for reading DataFrames."""
23from __future__ import annotations
25import collections.abc
26from typing import Any, Mapping, Optional
28import pandas
29from lsst.daf.butler import StorageClassDelegate
30from lsst.daf.butler.formatters.parquet import DataFrameSchema
31from lsst.utils.introspection import get_full_type_name
32from lsst.utils.iteration import ensure_iterable
34__all__ = ["DataFrameDelegate"]
class DataFrameDelegate(StorageClassDelegate):
    """Storage class delegate for in-memory `pandas.DataFrame` datasets.

    Provides component access (``columns``, ``rowcount``, ``schema``) and
    applies the ``columns`` read parameter to an already-loaded DataFrame.
    """

    def getComponent(self, composite: pandas.DataFrame, componentName: str) -> Any:
        """Get a component from a DataFrame.

        Parameters
        ----------
        composite : `~pandas.DataFrame`
            ``DataFrame`` to access component.
        componentName : `str`
            Name of component to retrieve. One of ``"columns"``,
            ``"rowcount"`` or ``"schema"``.

        Returns
        -------
        component : `object`
            The component: a `pandas.Index` of all column names (including
            named index levels) for ``"columns"``, the number of rows for
            ``"rowcount"``, or a ``DataFrameSchema`` for ``"schema"``.

        Raises
        ------
        AttributeError
            The component can not be found.
        """
        if componentName == "columns":
            return pandas.Index(self._getAllColumns(composite))
        if componentName == "rowcount":
            return len(composite)
        if componentName == "schema":
            # A zero-row slice is passed so the schema is built from the
            # frame's structure without handing over any row data.
            return DataFrameSchema(composite.iloc[:0])
        raise AttributeError(
            f"Do not know how to retrieve component {componentName} from {get_full_type_name(composite)}"
        )

    def handleParameters(
        self, inMemoryDataset: pandas.DataFrame, parameters: Optional[Mapping[str, Any]] = None
    ) -> Any:
        """Return possibly new in-memory dataset using the supplied parameters.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to modify based on the parameters.
        parameters : `dict`, optional
            Parameters to apply. Values are specific to the parameter.
            Supported parameters are defined in the associated
            `StorageClass`. If no relevant parameters are specified the
            ``inMemoryDataset`` will be returned unchanged.

        Returns
        -------
        inMemoryDataset : `object`
            Original in-memory dataset, or updated form after parameters
            have been used.

        Raises
        ------
        ValueError
            ``inMemoryDataset`` is not a `pandas.DataFrame`, or a requested
            column name is not present in the DataFrame.
        NotImplementedError
            The ``columns`` parameter is not iterable, or one of the
            requested column names is not a `str`.
        """
        if not isinstance(inMemoryDataset, pandas.DataFrame):
            raise ValueError(
                "handleParameters for a DataFrame must get a DataFrame, "
                f"not {get_full_type_name(inMemoryDataset)}."
            )

        if parameters is None or "columns" not in parameters:
            return inMemoryDataset

        requested = parameters["columns"]
        if not isinstance(requested, collections.abc.Iterable):
            raise NotImplementedError(
                "InMemoryDataset of a DataFrame only supports list/tuple of string column names"
            )

        # Materialize the request exactly once. It is walked twice below
        # (validation, then subsetting); a one-shot iterable such as a
        # generator would otherwise be exhausted by validation and the
        # subset would silently come out empty.
        requestedColumns = list(ensure_iterable(requested))

        allColumns = self._getAllColumns(inMemoryDataset)
        for column in requestedColumns:
            if not isinstance(column, str):
                raise NotImplementedError(
                    "InMemoryDataset of a DataFrame only supports string column names."
                )
            if column not in allColumns:
                raise ValueError(f"Unrecognized column name {column!r}.")

        # Exclude index columns from the subset; they are carried by the
        # DataFrame index rather than selected as regular columns.
        indexNames = inMemoryDataset.index.names
        readColumns = [name for name in requestedColumns if name not in indexNames]
        # Ensure uniqueness, keeping order.
        readColumns = list(dict.fromkeys(readColumns))

        return inMemoryDataset[readColumns]

    def _getAllColumns(self, inMemoryDataset: pandas.DataFrame) -> list[str]:
        """Get all columns, including index columns.

        Parameters
        ----------
        inMemoryDataset : `~pandas.DataFrame`
            DataFrame whose column and index names are collected.

        Returns
        -------
        columns : `list` [`str`]
            List of all columns. The regular columns come first, followed
            by the index level names when the first index level is named.
        """
        allColumns = list(inMemoryDataset.columns)
        # Index names are only appended when the first level has a name;
        # an index whose first level is unnamed contributes nothing.
        if inMemoryDataset.index.names[0] is not None:
            allColumns.extend(inMemoryDataset.index.names)

        return allColumns