# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively.  If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Support for generic data stores."""

from __future__ import annotations

__all__ = ("DatastoreRecordData", "SerializedDatastoreRecordData")

import dataclasses
import uuid
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, TypeAlias

from lsst.daf.butler._compat import PYDANTIC_V2, _BaseModelCompat
from lsst.utils import doImportType
from lsst.utils.introspection import get_full_type_name

from .datasets import DatasetId
from .dimensions import DimensionUniverse
from .persistenceContext import PersistenceContextVars
from .storedFileInfo import StoredDatastoreItemInfo

if TYPE_CHECKING:
    from ..registry import Registry

# Pydantic 2 requires that we be explicit about the types used in datastore
# records; without this, UUID values cannot be handled.  Pydantic v1 wants
# the opposite and does not work unless we use Any.
if PYDANTIC_V2:
    _Record: TypeAlias = dict[str, int | str | uuid.UUID | None]
else:
    _Record: TypeAlias = dict[str, Any]  # type: ignore
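# For illustration only (the column names here are hypothetical, not a schema
# guarantee): a single serialized record is a flat dict of column values, e.g.
#
#   {"dataset_id": uuid.UUID("6b1e2a39-5c3f-4d7a-9f2e-1c0d8a4b5e6f"),
#    "path": "a/b.fits", "checksum": None}
#
# which is why the Pydantic v2 alias above enumerates int, str, UUID, None.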

class SerializedDatastoreRecordData(_BaseModelCompat):
    """Representation of a `DatastoreRecordData` suitable for serialization."""

    dataset_ids: list[uuid.UUID]
    """List of dataset IDs."""

    records: Mapping[str, Mapping[str, list[_Record]]]
    """Lists of records, indexed by record class name and then table name."""

    @classmethod
    def direct(
        cls,
        *,
        dataset_ids: list[str | uuid.UUID],
        records: dict[str, dict[str, list[_Record]]],
    ) -> SerializedDatastoreRecordData:
        """Construct a `SerializedDatastoreRecordData` directly without
        validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        # See also comments in record_ids_to_uuid().
        for table_data in records.values():
            for table_records in table_data.values():
                for record in table_records:
                    # This only checks the dataset_id value; if there were
                    # any other columns containing UUIDs we would need a more
                    # generic approach.
                    if (id := record.get("dataset_id")) is not None:
                        record["dataset_id"] = uuid.UUID(id) if isinstance(id, str) else id

        data = cls.model_construct(
            _fields_set={"dataset_ids", "records"},
            # JSON makes strings out of UUIDs; we need to convert them back.
            dataset_ids=[uuid.UUID(id) if isinstance(id, str) else id for id in dataset_ids],
            records=records,
        )

        return data

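# A minimal usage sketch, not part of the API above.  The class path and
# opaque-table name are plausible assumptions and the UUID is made up:
# `direct` is meant for trusted, already-serialized data in which UUIDs
# arrive as strings.
#
#   serialized = SerializedDatastoreRecordData.direct(
#       dataset_ids=["6b1e2a39-5c3f-4d7a-9f2e-1c0d8a4b5e6f"],
#       records={
#           "lsst.daf.butler.core.storedFileInfo.StoredFileInfo": {
#               "file_datastore_records": [
#                   {"dataset_id": "6b1e2a39-5c3f-4d7a-9f2e-1c0d8a4b5e6f"}
#               ]
#           }
#       },
#   )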

@dataclasses.dataclass
class DatastoreRecordData:
    """A struct that represents a tabular data export from a single
    datastore.
    """

    records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = dataclasses.field(
        default_factory=dict
    )
    """Opaque table data, indexed by dataset ID and grouped by opaque table
    name."""
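    # Shape sketch (values hypothetical): records[dataset_id][table_name] is
    # a list of StoredDatastoreItemInfo instances, e.g.
    #   {UUID("6b1e..."): {"file_datastore_records": [info0, info1]}}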

    def update(self, other: DatastoreRecordData) -> None:
        """Update contents of this instance with data from another instance.

        Parameters
        ----------
        other : `DatastoreRecordData`
            Records to merge into this instance.

        Notes
        -----
        If a ``(dataset_id, table_name)`` combination has any records in
        ``self``, it is assumed that all records for that combination are
        already present.  This allows duplicates of the same dataset to be
        handled gracefully.
        """
        for dataset_id, table_records in other.records.items():
            this_table_records = self.records.setdefault(dataset_id, {})
            for table_name, records in table_records.items():
                # If this (dataset_id, table_name) combination already has
                # records in `self`, we assume that means all of the records
                # for that combination; we require other code to ensure entire
                # (parent) datasets are exported to these data structures
                # (never components).
                if not (this_records := this_table_records.setdefault(table_name, [])):
                    this_records.extend(records)
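    # Merge-semantics sketch (names hypothetical): `update` keeps self's list
    # for a (dataset_id, table_name) pair and only copies from `other` when
    # self has no records for that pair.
    #
    #   a = DatastoreRecordData(records={did: {"tbl": [info_a]}})
    #   b = DatastoreRecordData(records={did: {"tbl": [info_b]}})
    #   a.update(b)  # a.records[did]["tbl"] is still [info_a]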

    def subset(self, dataset_ids: set[DatasetId]) -> DatastoreRecordData | None:
        """Extract a subset of the records that match given dataset IDs.

        Parameters
        ----------
        dataset_ids : `set` [ `DatasetId` ]
            Dataset IDs to match.

        Returns
        -------
        record_data : `DatastoreRecordData` or `None`
            `None` is returned if there are no matching refs.

        Notes
        -----
        Records in the returned instance are shared with this instance;
        clients should not update or extend records in the returned instance.
        """
        matching_records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {}
        for dataset_id in dataset_ids:
            if (id_records := self.records.get(dataset_id)) is not None:
                matching_records[dataset_id] = id_records
        if matching_records:
            return DatastoreRecordData(records=matching_records)
        else:
            return None
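    # Sharing caveat, illustrated (assuming `data` holds records for `did`):
    #
    #   sub = data.subset({did})
    #   assert sub.records[did] is data.records[did]  # same underlying dict
    #
    # This is why the returned instance must be treated as read-only.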

    def to_simple(self, minimal: bool = False) -> SerializedDatastoreRecordData:
        """Make a representation of the object suitable for serialization.

        Implements the `~lsst.daf.butler.core.json.SupportsSimple` protocol.

        Parameters
        ----------
        minimal : `bool`, optional
            If `True`, produce a minimal representation; not used by this
            method.

        Returns
        -------
        simple : `SerializedDatastoreRecordData`
            Representation of this instance as a simple serializable object.
        """

        def _class_name(records: list[StoredDatastoreItemInfo]) -> str:
            """Get the fully qualified class name for the records.  An empty
            string is returned if the list is empty; an exception is raised
            if the records are of different classes.
            """
            if not records:
                return ""
            classes = {record.__class__ for record in records}
            assert len(classes) == 1, f"Records have to be of the same class: {classes}"
            return get_full_type_name(classes.pop())

        records: dict[str, dict[str, list[_Record]]] = {}
        for table_data in self.records.values():
            for table_name, table_records in table_data.items():
                class_name = _class_name(table_records)
                class_records = records.setdefault(class_name, {})
                class_records.setdefault(table_name, []).extend(
                    [record.to_record() for record in table_records]
                )
        return SerializedDatastoreRecordData(dataset_ids=list(self.records.keys()), records=records)

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatastoreRecordData,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
    ) -> DatastoreRecordData:
        """Make an instance of this class from serialized data.

        Implements the `~lsst.daf.butler.core.json.SupportsSimple` protocol.

        Parameters
        ----------
        simple : `SerializedDatastoreRecordData`
            Serialized representation returned from the `to_simple` method.
        universe : `DimensionUniverse`, optional
            Dimension universe; not used by this method.
        registry : `Registry`, optional
            Registry instance; not used by this method.

        Returns
        -------
        record_data : `DatastoreRecordData`
            De-serialized instance of `DatastoreRecordData`.
        """
        cache = PersistenceContextVars.dataStoreRecords.get()
        key = frozenset(simple.dataset_ids)
        if cache is not None and (cachedRecord := cache.get(key)) is not None:
            return cachedRecord
        records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {}
        # Make sure that all dataset IDs appear in the dict even if they
        # don't have records.
        for dataset_id in simple.dataset_ids:
            records[dataset_id] = {}
        for class_name, table_data in simple.records.items():
            klass = doImportType(class_name)
            if not issubclass(klass, StoredDatastoreItemInfo):
                raise RuntimeError(
                    "The class specified in the SerializedDatastoreRecordData "
                    f"({get_full_type_name(klass)}) is not a StoredDatastoreItemInfo."
                )
            for table_name, table_records in table_data.items():
                for record in table_records:
                    info = klass.from_record(record)
                    dataset_type_records = records.setdefault(info.dataset_id, {})
                    dataset_type_records.setdefault(table_name, []).append(info)
        newRecord = cls(records=records)
        if cache is not None:
            cache[key] = newRecord
        return newRecord
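# A round-trip sketch under stated assumptions: `data` is a populated
# `DatastoreRecordData`, and Pydantic v2's `model_dump_json` is available on
# the serialized model.
#
#   import json
#
#   simple = data.to_simple()
#   payload = json.loads(simple.model_dump_json())
#   restored = DatastoreRecordData.from_simple(
#       SerializedDatastoreRecordData.direct(**payload)
#   )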