Coverage for python/lsst/daf/butler/datastore/record_data.py: 32%

71 statements  


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Support for generic data stores."""

from __future__ import annotations

__all__ = ("DatastoreRecordData", "SerializedDatastoreRecordData")

import dataclasses
import uuid
from collections.abc import Mapping
from typing import TYPE_CHECKING, TypeAlias

import pydantic

from .._dataset_ref import DatasetId
from ..dimensions import DimensionUniverse
from ..persistence_context import PersistenceContextVars
from .stored_file_info import StoredDatastoreItemInfo

if TYPE_CHECKING:
    from ..registry import Registry

# Pydantic requires the possible value types to be explicitly enumerated in
# order for `uuid.UUID` in particular to work. `typing.Any` does not work
# here.
_Record: TypeAlias = dict[str, int | str | uuid.UUID | None]
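
# For illustration only: a single ``_Record`` is a flat column/value mapping.
# The key names below are hypothetical (real keys come from whichever
# `StoredDatastoreItemInfo` subclass produced the record), but the value
# types match the alias above:
#
#     {"path": "run/dataset.fits", "component": None, "file_size": 1024}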


class SerializedDatastoreRecordData(pydantic.BaseModel):
    """Representation of a `DatastoreRecordData` suitable for serialization."""

    dataset_ids: list[uuid.UUID]
    """List of dataset IDs."""

    records: Mapping[str, Mapping[str, Mapping[str, list[_Record]]]]
    """Records indexed by record class name, dataset ID (encoded as `str`,
    because JSON keys must be strings), and opaque table name.
    """

    @classmethod
    def direct(
        cls,
        *,
        dataset_ids: list[str | uuid.UUID],
        records: dict[str, dict[str, dict[str, list[_Record]]]],
    ) -> SerializedDatastoreRecordData:
        """Construct a `SerializedDatastoreRecordData` directly without
        validators.

        Parameters
        ----------
        dataset_ids : `list` [`str` or `uuid.UUID`]
            The dataset UUIDs.
        records : `dict`
            The datastore records.

        Notes
        -----
        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        data = cls.model_construct(
            _fields_set={"dataset_ids", "records"},
            # JSON makes strings out of UUIDs; we need to convert them back.
            dataset_ids=[uuid.UUID(id) if isinstance(id, str) else id for id in dataset_ids],
            records=records,
        )

        return data
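
    # A minimal usage sketch (hypothetical UUID; ``direct`` skips pydantic
    # validation, so it should only be fed data we already trust, e.g. our
    # own serialized output):
    #
    #     serialized = SerializedDatastoreRecordData.direct(
    #         dataset_ids=["2fdc7396-7e43-4f38-b2bf-5b2183b4d297"],
    #         records={},
    #     )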


@dataclasses.dataclass
class DatastoreRecordData:
    """A struct that represents a tabular data export from a single
    datastore.
    """

    records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = dataclasses.field(
        default_factory=dict
    )
    """Opaque table data, indexed by dataset ID and grouped by opaque table
    name."""

    def update(self, other: DatastoreRecordData) -> None:
        """Update contents of this instance with data from another instance.

        Parameters
        ----------
        other : `DatastoreRecordData`
            Records to merge into this instance.

        Notes
        -----
        If a ``(dataset_id, table_name)`` combination has any records in
        ``self``, it is assumed that all records for that combination are
        already present. This allows duplicates of the same dataset to be
        handled gracefully.
        """
        for dataset_id, table_records in other.records.items():
            this_table_records = self.records.setdefault(dataset_id, {})
            for table_name, records in table_records.items():
                # If this (dataset_id, table_name) combination already has
                # records in `self`, we assume that means all of the records
                # for that combination; we require other code to ensure entire
                # (parent) datasets are exported to these data structures
                # (never components).
                if not (this_records := this_table_records.setdefault(table_name, [])):
                    this_records.extend(records)
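
    # A minimal sketch of the merge semantics above (``info1``/``info2``
    # stand in for `StoredDatastoreItemInfo` instances, whose construction
    # is datastore-specific and elided here):
    #
    #     a = DatastoreRecordData(records={dataset_id: {"table": [info1]}})
    #     b = DatastoreRecordData(records={dataset_id: {"table": [info2]}})
    #     a.update(b)
    #     # a.records[dataset_id]["table"] is still [info1]; the existing
    #     # records are assumed complete, so b's records are not appended.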

    def subset(self, dataset_ids: set[DatasetId]) -> DatastoreRecordData | None:
        """Extract a subset of the records that match given dataset IDs.

        Parameters
        ----------
        dataset_ids : `set` [ `DatasetId` ]
            Dataset IDs to match.

        Returns
        -------
        record_data : `DatastoreRecordData` or `None`
            `None` is returned if there are no matching refs.

        Notes
        -----
        Records in the returned instance are shared with this instance;
        clients should not update or extend records in the returned instance.
        """
        matching_records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {}
        for dataset_id in dataset_ids:
            if (id_records := self.records.get(dataset_id)) is not None:
                matching_records[dataset_id] = id_records
        if matching_records:
            return DatastoreRecordData(records=matching_records)
        else:
            return None
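
    # A minimal sketch (``data`` and ``some_id`` are hypothetical). The
    # returned instance shares its record lists with ``data``, so treat it
    # as read-only:
    #
    #     matched = data.subset({some_id})
    #     if matched is not None:
    #         for table_name, infos in matched.records[some_id].items():
    #             ...  # read infos; do not mutate them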

    def to_simple(self, minimal: bool = False) -> SerializedDatastoreRecordData:
        """Make representation of the object for serialization.

        Implements `~lsst.daf.butler.json.SupportsSimple` protocol.

        Parameters
        ----------
        minimal : `bool`, optional
            If `True`, produce a minimal representation. Not used by this
            method.

        Returns
        -------
        simple : `SerializedDatastoreRecordData`
            Representation of this instance suitable for serialization.
        """
        records: dict[str, dict[str, dict[str, list[_Record]]]] = {}
        for dataset_id, table_data in self.records.items():
            for table_name, table_records in table_data.items():
                class_name, infos = StoredDatastoreItemInfo.to_records(table_records)
                class_records = records.setdefault(class_name, {})
                dataset_records = class_records.setdefault(dataset_id.hex, {})
                dataset_records.setdefault(table_name, []).extend(dict(info) for info in infos)
        return SerializedDatastoreRecordData(dataset_ids=list(self.records.keys()), records=records)

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatastoreRecordData,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
    ) -> DatastoreRecordData:
        """Make an instance of this class from serialized data.

        Implements `~lsst.daf.butler.json.SupportsSimple` protocol.

        Parameters
        ----------
        simple : `SerializedDatastoreRecordData`
            Serialized representation returned from `to_simple` method.
        universe : `DimensionUniverse`, optional
            Dimension universe, not used by this method.
        registry : `Registry`, optional
            Registry instance, not used by this method.

        Returns
        -------
        record_data : `DatastoreRecordData`
            De-serialized instance of `DatastoreRecordData`.
        """
        cache = PersistenceContextVars.dataStoreRecords.get()
        key = frozenset(simple.dataset_ids)
        if cache is not None and (cachedRecord := cache.get(key)) is not None:
            return cachedRecord
        records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {}
        # Make sure that all dataset IDs appear in the dict even if they
        # don't have records.
        for dataset_id in simple.dataset_ids:
            records[dataset_id] = {}
        for class_name, class_data in simple.records.items():
            for dataset_id_str, dataset_data in class_data.items():
                for table_name, table_records in dataset_data.items():
                    try:
                        infos = StoredDatastoreItemInfo.from_records(class_name, table_records)
                    except TypeError as exc:
                        raise RuntimeError(
                            "The class specified in the SerializedDatastoreRecordData "
                            f"({class_name}) is not a StoredDatastoreItemInfo."
                        ) from exc
                    dataset_records = records.setdefault(uuid.UUID(dataset_id_str), {})
                    dataset_records.setdefault(table_name, []).extend(infos)
        newRecord = cls(records=records)
        if cache is not None:
            cache[key] = newRecord
        return newRecord
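
# A round-trip sketch (assuming ``data`` is a populated DatastoreRecordData):
# `to_simple` produces a pydantic model, which can travel as JSON;
# `from_simple` then restores the UUID keys that JSON turned into strings.
#
#     json_str = data.to_simple().model_dump_json()
#     restored = DatastoreRecordData.from_simple(
#         SerializedDatastoreRecordData.model_validate_json(json_str)
#     )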