Coverage for python/lsst/daf/butler/delegates/arrowtable.py: 31%
35 statements
« prev ^ index » next coverage.py v7.5.0, created at 2024-04-27 03:00 -0700
« prev ^ index » next coverage.py v7.5.0, created at 2024-04-27 03:00 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Support for reading Arrow tables."""
29from __future__ import annotations
31from collections.abc import Mapping
32from typing import Any
34import pyarrow as pa
35from lsst.daf.butler import StorageClassDelegate
36from lsst.utils.introspection import get_full_type_name
37from lsst.utils.iteration import ensure_iterable
39__all__ = ["ArrowTableDelegate"]
class ArrowTableDelegate(StorageClassDelegate):
    """Delegate that understands the ``ArrowTable`` storage class."""

    # In-memory type that `handleParameters` requires its dataset to be.
    _datasetType = pa.Table

    def getComponent(self, composite: pa.Table, componentName: str) -> Any:
        """Get a component from an Arrow table.

        Parameters
        ----------
        composite : `~pyarrow.Table`
            Arrow table to access component.
        componentName : `str`
            Name of component to retrieve. ``"columns"`` and ``"schema"``
            both return the table schema; ``"rowcount"`` returns the number
            of rows.

        Returns
        -------
        component : `object`
            The component.

        Raises
        ------
        AttributeError
            The component can not be found.
        """
        if componentName in ("columns", "schema"):
            # The schema will be translated to column format
            # depending on the input type.
            return composite.schema
        if componentName == "rowcount":
            # Ask the table for its row count instead of measuring its first
            # column, so a table without any columns does not raise
            # IndexError.
            return composite.num_rows

        raise AttributeError(
            f"Do not know how to retrieve component {componentName} from {get_full_type_name(composite)}"
        )

    def handleParameters(self, inMemoryDataset: Any, parameters: Mapping[str, Any] | None = None) -> Any:
        """Apply read parameters to an in-memory Arrow table.

        Parameters
        ----------
        inMemoryDataset : `object`
            Dataset to modify; must be an instance of ``_datasetType``.
        parameters : `~collections.abc.Mapping` [`str`, `typing.Any`] or \
                `None`, optional
            Parameters to apply. Only the ``"columns"`` key is consulted
            here; its value is passed through ``ensure_iterable`` (presumably
            so that a single name or a collection of names is accepted).
            Any other keys are ignored by this method.

        Returns
        -------
        dataset : `object`
            The input dataset unchanged if there is no ``"columns"``
            parameter, otherwise the result of ``_selectColumns`` for the
            requested columns, de-duplicated with first-seen order kept.

        Raises
        ------
        ValueError
            Raised if ``inMemoryDataset`` is not of the expected type or if
            a requested column is not present in the dataset.
        NotImplementedError
            Raised if a requested column name is not a string.
        """
        if not isinstance(inMemoryDataset, self._datasetType):
            raise ValueError(
                f"inMemoryDataset must be a {get_full_type_name(self._datasetType)} and "
                f"not {get_full_type_name(inMemoryDataset)}."
            )

        if parameters is None or "columns" not in parameters:
            return inMemoryDataset

        read_columns = list(ensure_iterable(parameters["columns"]))

        # Look up the available columns once rather than once per requested
        # column; a set makes each membership test constant time.
        available_columns = set(self._getColumns(inMemoryDataset))
        for column in read_columns:
            # The type check must come first: it also guarantees that the
            # value is hashable before it is tested against the set.
            if not isinstance(column, str):
                raise NotImplementedError(
                    "InMemoryDataset of an Arrow Table only supports string column names."
                )
            if column not in available_columns:
                raise ValueError(f"Unrecognized column name {column!r}.")

        # Ensure uniqueness, keeping order.
        read_columns = list(dict.fromkeys(read_columns))

        return self._selectColumns(inMemoryDataset, read_columns)

    def _getColumns(self, inMemoryDataset: pa.Table) -> list[str]:
        """Get the column names from the inMemoryDataset.

        Parameters
        ----------
        inMemoryDataset : `object`
            Dataset to extract columns.

        Returns
        -------
        columns : `list` [`str`]
            List of columns.
        """
        return inMemoryDataset.schema.names

    def _selectColumns(self, inMemoryDataset: pa.Table, columns: list[str]) -> pa.Table:
        """Select a subset of columns from the inMemoryDataset.

        Parameters
        ----------
        inMemoryDataset : `object`
            Dataset to extract columns.
        columns : `list` [`str`]
            List of columns to extract.

        Returns
        -------
        subDataset : `object`
            Subselection of inMemoryDataset.
        """
        return inMemoryDataset.select(columns)