# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("DimensionRecordTable",)

from collections.abc import Iterable, Iterator
from typing import TYPE_CHECKING, Any, final, overload

import pyarrow as pa
import pyarrow.compute as pc
from lsst.utils.iteration import chunk_iterable
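
# These imports are needed only for static type checking; guarding them
# behind TYPE_CHECKING keeps them out of the runtime import graph, which
# (typically) also avoids circular imports within the package.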
if TYPE_CHECKING:
    from ._elements import DimensionElement
    from ._records import DimensionRecord
    from ._universe import DimensionUniverse


@final
class DimensionRecordTable:
    """A table-like container for `DimensionRecord` objects.

    Parameters
    ----------
    element : `DimensionElement` or `str`, optional
        The dimension element that defines the records held by this table. If
        not a `DimensionElement` instance, ``universe`` must be provided. If
        not provided, ``table`` must have an "element" entry in its metadata
        (as is the case for tables returned by the `to_arrow` method).
    records : `~collections.abc.Iterable` [ `DimensionRecord` ], optional
        Dimension records to add to the table.
    universe : `DimensionUniverse`, optional
        Object that defines all dimensions. Ignored if ``element`` is a
        `DimensionElement` instance.
    table : `pyarrow.Table`, optional
        Arrow table to copy columns from. Must have the schema returned by
        `make_arrow_schema` for this element. This argument is primarily
        intended to serve as the way to reconstruct a `DimensionRecordTable`
        that has been serialized to an Arrow-supported file or IPC format.
    batch_size : `int`, optional
        How many elements of ``records`` should be processed at a time, with
        each batch yielding a `pyarrow.RecordBatch` in the created table.
        Smaller values will reduce peak memory usage for large iterables.
        Ignored if ``records`` is empty.

    Notes
    -----
    `DimensionRecordTable` should generally have a smaller memory footprint
    than `DimensionRecordSet` if its rows are unique, and it provides fast
    column-oriented access and Arrow interoperability that
    `DimensionRecordSet` lacks entirely. In other respects
    `DimensionRecordSet` is more featureful and simpler to use efficiently.
    """

    def __init__(
        self,
        element: DimensionElement | str | None = None,
        records: Iterable[DimensionRecord] = (),
        universe: DimensionUniverse | None = None,
        table: pa.Table | None = None,
        batch_size: int | None = None,
    ):
        if element is None:
            if table is not None and b"element" in table.schema.metadata:
                element = table.schema.metadata[b"element"].decode()
            else:
                raise TypeError("If 'element' is not provided it must be present in 'table.schema.metadata'.")
        if isinstance(element, str):
            if universe is None:
                raise TypeError("'universe' must be provided if 'element' is not a DimensionElement.")
            element = universe[element]
        else:
            universe = element.universe
        self._element = element
        self._converters = element.schema.to_arrow()
        arrow_schema = pa.schema(
            [converter.field for converter in self._converters],
            {
                b"element": element.name.encode(),
                # Since the Arrow table might be saved to a file on its own, we
                # include the dimension universe's identifiers in its metadata.
                b"namespace": element.universe.namespace.encode(),
                b"version": str(element.universe.version).encode(),
            },
        )
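        # Pre-build pyarrow.compute field references for the required
        # dimension key columns; pc.field returns an expression node that
        # can be combined into filter expressions over the table.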
        self._required_value_fields = [pc.field(name) for name in self._element.schema.required.names]
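        # Convert the input records into one or more Arrow record batches.
        # Without a batch_size everything lands in a single batch; with one,
        # records are chunked so peak memory stays bounded while building.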
        if batch_size is None:
            batches = [self._make_batch(records, arrow_schema)]
        else:
            batches = [
                self._make_batch(record_chunk, arrow_schema)
                for record_chunk in chunk_iterable(records, chunk_size=batch_size)
            ]
        if table is not None:
            batches.extend(table.to_batches())
        self._table: pa.Table = pa.Table.from_batches(batches, arrow_schema)

    @classmethod
    def make_arrow_schema(cls, element: DimensionElement) -> pa.Schema:
        """Return the Arrow schema of the table returned by `to_arrow` with the
        given dimension element.

        Parameters
        ----------
        element : `DimensionElement`
            Dimension element that defines the schema.

        Returns
        -------
        schema : `pyarrow.Schema`
            Arrow schema.
        """
        return pa.schema([converter.field for converter in element.schema.to_arrow()])

    @property
    def element(self) -> DimensionElement:
        """The dimension element that defines the records of this table."""
        return self._element

    def __len__(self) -> int:
        return self._table.num_rows

    def __iter__(self) -> Iterator[DimensionRecord]:
        for i in range(self._table.num_rows):
            yield self._get_record_at(self._table, i)

    @overload
    def __getitem__(self, index: int) -> DimensionRecord: ...

    @overload
    def __getitem__(self, index: slice) -> DimensionRecordTable: ...

    def __getitem__(self, index: int | slice) -> DimensionRecord | DimensionRecordTable:
        if isinstance(index, slice):
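            # Build the sliced view without calling __init__: share the
            # element and converters, and slice the underlying pyarrow
            # table, which is zero-copy.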
            result = object.__new__(DimensionRecordTable)
            result._element = self._element
            result._converters = self._converters
            result._table = self._table[index]
            return result
        else:
            return self._get_record_at(self._table, index)

    def extend(self, records: Iterable[DimensionRecord]) -> None:
        """Add new rows to the end of the table.

        Parameters
        ----------
        records : `~collections.abc.Iterable` [ `DimensionRecord` ]
            Dimension records to add to the table.
        """
        batches: list[pa.RecordBatch] = self._table.to_batches()
        batches.append(self._make_batch(records, self._table.schema))
        self._table = pa.Table.from_batches(batches, self._table.schema)

    def column(self, name: str) -> pa.ChunkedArray:
        """Return a single column from the table as an array.

        Parameters
        ----------
        name : `str`
            Name of the column. Valid options are given by
            `DimensionElement.schema.names`, and are the same as the
            attributes of the dimension records.

        Returns
        -------
        array : `pyarrow.ChunkedArray`
            An array view of the column.
        """
        return self._table.column(name)

    def to_arrow(self) -> pa.Table:
        """Return an Arrow table holding the same records."""
        return self._table

    def _make_batch(self, records: Iterable[DimensionRecord], arrow_schema: pa.Schema) -> pa.RecordBatch:
        """Make a `pyarrow.RecordBatch` from an iterable of `DimensionRecord`.

        Parameters
        ----------
        records : `~collections.abc.Iterable` [ `DimensionRecord` ]
            Records to add.
        arrow_schema : `pyarrow.Schema`
            Arrow schema for the record batch.

        Returns
        -------
        batch : `pyarrow.RecordBatch`
            Record batch holding the records.
        """
        list_columns: list[list[Any]] = [list() for _ in self._converters]
        for record in records:
            for converter, column in zip(self._converters, list_columns):
                converter.append(getattr(record, converter.name), column)
        array_columns = [
            converter.finish(column) for converter, column in zip(self._converters, list_columns)
        ]
        return pa.record_batch(array_columns, arrow_schema)

    def _get_record_at(self, table: pa.Table | pa.RecordBatch, index: int) -> DimensionRecord:
        """Construct a `DimensionRecord` from a row in the table.

        Parameters
        ----------
        table : `pyarrow.Table` or `pyarrow.RecordBatch`
            Table or record batch to get values from.
        index : `int`
            Index of the row to extract.

        Returns
        -------
        record : `DimensionRecord`
            Dimension record representing a table row.
        """
        return self._element.RecordClass(
            **{k: table.column(j)[index].as_py() for j, k in enumerate(self._element.schema.all.names)}
        )
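

if __name__ == "__main__":
    # Minimal usage sketch, not part of the library API. It assumes the
    # default daf_butler dimension configuration is available, that
    # DimensionUniverse() constructs the default universe, and that an
    # "instrument" element exists in it; run with
    # ``python -m lsst.daf.butler.dimensions._record_table`` and adjust
    # to your environment.
    import io

    from lsst.daf.butler import DimensionUniverse

    universe = DimensionUniverse()
    table = DimensionRecordTable("instrument", records=(), universe=universe)
    print(f"{len(table)} rows; Arrow schema:")
    print(DimensionRecordTable.make_arrow_schema(table.element))

    # Round-trip through the Arrow IPC file format: the element name and
    # universe identifiers travel in the schema metadata, so only the
    # universe is needed to reconstruct the table.
    buffer = io.BytesIO()
    with pa.ipc.new_file(buffer, table.to_arrow().schema) as writer:
        writer.write_table(table.to_arrow())
    buffer.seek(0)
    reader = pa.ipc.open_file(buffer)
    restored = DimensionRecordTable(table=reader.read_all(), universe=universe)
    assert len(restored) == len(table)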