Coverage for python/lsst/daf/butler/dimensions/_record_table.py: 31% (69 statements)


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DimensionRecordTable",)

from collections.abc import Iterable, Iterator
from typing import TYPE_CHECKING, Any, final, overload

import pyarrow as pa
import pyarrow.compute as pc
from lsst.utils.iteration import chunk_iterable

if TYPE_CHECKING:
    from ._elements import DimensionElement
    from ._records import DimensionRecord
    from ._universe import DimensionUniverse


@final
class DimensionRecordTable:
    """A table-like container for `DimensionRecord` objects.

    Parameters
    ----------
    element : `DimensionElement` or `str`, optional
        The dimension element that defines the records held by this table.
        If not a `DimensionElement` instance, ``universe`` must be provided.
        If not provided, ``table`` must have an "element" entry in its
        metadata (as is the case for tables returned by the `to_arrow`
        method).
    records : `~collections.abc.Iterable` [ `DimensionRecord` ], optional
        Dimension records to add to the table.
    universe : `DimensionUniverse`, optional
        Object that defines all dimensions.  Ignored if ``element`` is a
        `DimensionElement` instance.
    table : `pyarrow.Table`, optional
        Arrow table to copy columns from.  Must have the schema returned by
        `make_arrow_schema` for this element.  This argument is primarily
        intended as the way to reconstruct a `DimensionRecordTable` that
        has been serialized to an Arrow-supported file or IPC format.
    batch_size : `int`, optional
        How many elements of ``records`` should be processed at a time,
        with each batch yielding a `pyarrow.RecordBatch` in the created
        table.  Smaller values will reduce peak memory usage for large
        iterables.  Ignored if ``records`` is empty.

    Notes
    -----
    `DimensionRecordTable` should generally have a smaller memory footprint
    than `DimensionRecordSet` if its rows are unique, and it provides fast
    column-oriented access and Arrow interoperability that
    `DimensionRecordSet` lacks entirely.  In other respects
    `DimensionRecordSet` is more featureful and simpler to use efficiently.
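
    Examples
    --------
    A minimal usage sketch; ``universe`` (a `DimensionUniverse`) and
    ``records`` (a list of "detector" `DimensionRecord` objects, e.g. from
    a registry query) are assumptions for illustration:

    >>> table = DimensionRecordTable(universe["detector"], records)  # doctest: +SKIP
    >>> len(table) == len(records)  # doctest: +SKIP
    True
    >>> table[:2].element.name  # slicing returns another table  # doctest: +SKIP
    'detector'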

79 """ 

80 

81 def __init__( 

82 self, 

83 element: DimensionElement | str | None = None, 

84 records: Iterable[DimensionRecord] = (), 

85 universe: DimensionUniverse | None = None, 

86 table: pa.Table | None = None, 

87 batch_size: int | None = None, 

88 ): 

89 if element is None: 

90 if table is not None and b"element" in table.schema.metadata: 

91 element = table.schema.metadata[b"element"].decode() 

92 else: 

93 raise TypeError("If 'element' is not provided it must be present in 'table.schema.metadata'.") 

94 if isinstance(element, str): 

95 if universe is None: 

96 raise TypeError("'universe' must be provided if 'element' is not a DimensionElement.") 

97 element = universe[element] 

98 else: 

99 universe = element.universe 

100 self._element = element 

101 self._converters = element.schema.to_arrow() 

102 arrow_schema = pa.schema( 

103 [converter.field for converter in self._converters], 

104 { 

105 b"element": element.name.encode(), 

106 # Since the Arrow table might be saved to a file on its own, we 

107 # include the dimension universe's identifiers in its metadata. 

108 b"namespace": element.universe.namespace.encode(), 

109 b"version": str(element.universe.version).encode(), 

110 }, 

111 ) 

112 self._required_value_fields = [pc.field(name) for name in self._element.schema.required.names] 

113 if batch_size is None: 

114 batches = [self._make_batch(records, arrow_schema)] 

115 else: 

116 batches = [ 

117 self._make_batch(record_chunk, arrow_schema) 

118 for record_chunk in chunk_iterable(records, chunk_size=batch_size) 

119 ] 

120 if table is not None: 

121 batches.extend(table.to_batches()) 

122 self._table: pa.Table = pa.Table.from_batches(batches, arrow_schema) 

123 

    @classmethod
    def make_arrow_schema(cls, element: DimensionElement) -> pa.Schema:
        """Return the Arrow schema of the table returned by `to_arrow` for
        the given dimension element.

        Parameters
        ----------
        element : `DimensionElement`
            Dimension element that defines the schema.

        Returns
        -------
        schema : `pyarrow.Schema`
            Arrow schema.
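
        Examples
        --------
        A sketch, assuming ``universe`` is a `DimensionUniverse` in scope:

        >>> schema = DimensionRecordTable.make_arrow_schema(universe["detector"])  # doctest: +SKIP
        >>> isinstance(schema, pa.Schema)  # doctest: +SKIP
        True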

138 """ 

139 return pa.schema([converter.field for converter in element.schema.to_arrow()]) 

140 

    @property
    def element(self) -> DimensionElement:
        """The dimension element that defines the records of this table."""
        return self._element

    def __len__(self) -> int:
        return self._table.num_rows

    def __iter__(self) -> Iterator[DimensionRecord]:
        for i in range(self._table.num_rows):
            yield self._get_record_at(self._table, i)

    @overload
    def __getitem__(self, index: int) -> DimensionRecord: ...

    @overload
    def __getitem__(self, index: slice) -> DimensionRecordTable: ...

    def __getitem__(self, index: int | slice) -> DimensionRecord | DimensionRecordTable:
        if isinstance(index, slice):
            # Build the sliced table directly, bypassing __init__, since all
            # derived state can be copied or sliced from this instance.
            result = object.__new__(DimensionRecordTable)
            result._element = self._element
            result._converters = self._converters
            result._required_value_fields = self._required_value_fields
            result._table = self._table[index]
            return result
        else:
            return self._get_record_at(self._table, index)

    def extend(self, records: Iterable[DimensionRecord]) -> None:
        """Add new rows to the end of the table.

        Parameters
        ----------
        records : `~collections.abc.Iterable` [ `DimensionRecord` ]
            Dimension records to add to the table.
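
        Examples
        --------
        A sketch, assuming ``table`` is a `DimensionRecordTable` and
        ``more_records`` is a list of matching `DimensionRecord` objects:

        >>> n = len(table)  # doctest: +SKIP
        >>> table.extend(more_records)  # doctest: +SKIP
        >>> len(table) == n + len(more_records)  # doctest: +SKIP
        True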

176 """ 

177 batches: list[pa.RecordBatch] = self._table.to_batches() 

178 batches.append(self._make_batch(records, self._table.schema)) 

179 self._table = pa.Table.from_batches(batches, self._table.schema) 

180 

    def column(self, name: str) -> pa.ChunkedArray:
        """Return a single column from the table as an array.

        Parameters
        ----------
        name : `str`
            Name of the column.  Valid options are given by
            `DimensionElement.schema.names`, and are the same as the
            attributes of the dimension records.

        Returns
        -------
        array : `pyarrow.ChunkedArray`
            An array view of the column.
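
        Examples
        --------
        A sketch, assuming ``table`` is a `DimensionRecordTable` whose
        element has an "id" column:

        >>> table.column("id").to_pylist()  # doctest: +SKIP
        [0, 1, 2]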

195 """ 

196 return self._table.column(name) 

197 

198 def to_arrow(self) -> pa.Table: 

199 """Return a Arrow table holding the same records.""" 

        """
        return self._table

    def _make_batch(self, records: Iterable[DimensionRecord], arrow_schema: pa.Schema) -> pa.RecordBatch:
        """Make a `pyarrow.RecordBatch` from an iterable of `DimensionRecord`.

        Parameters
        ----------
        records : `~collections.abc.Iterable` [ `DimensionRecord` ]
            Records to add.
        arrow_schema : `pyarrow.Schema`
            Arrow schema for the record batch.

        Returns
        -------
        batch : `pyarrow.RecordBatch`
            Record batch holding the records.
        """
        list_columns: list[list[Any]] = [list() for _ in self._converters]
        for record in records:
            for converter, column in zip(self._converters, list_columns):
                converter.append(getattr(record, converter.name), column)
        array_columns = [
            converter.finish(column) for converter, column in zip(self._converters, list_columns)
        ]
        return pa.record_batch(array_columns, arrow_schema)

    def _get_record_at(self, table: pa.Table | pa.RecordBatch, index: int) -> DimensionRecord:
        """Construct a `DimensionRecord` from a row in the table.

        Parameters
        ----------
        table : `pyarrow.Table` or `pyarrow.RecordBatch`
            Table or record batch to get values from.
        index : `int`
            Index of the row to extract.

        Returns
        -------
        record : `DimensionRecord`
            Dimension record representing a table row.
        """
        return self._element.RecordClass(
            **{k: table.column(j)[index].as_py() for j, k in enumerate(self._element.schema.all.names)}
        )