Coverage for python/lsst/daf/butler/core/datastoreRecordData.py: 28% (85 statements)


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Support for generic data stores."""

from __future__ import annotations

__all__ = ("DatastoreRecordData", "SerializedDatastoreRecordData")

import dataclasses
import uuid
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, TypeAlias

from lsst.daf.butler._compat import PYDANTIC_V2, _BaseModelCompat
from lsst.utils import doImportType
from lsst.utils.introspection import get_full_type_name

from .datasets import DatasetId
from .dimensions import DimensionUniverse
from .persistenceContext import PersistenceContextVars
from .storedFileInfo import StoredDatastoreItemInfo

if TYPE_CHECKING:
    from ..registry import Registry


# Pydantic 2 requires us to be explicit about the types that are used in
# datastore records. Without this, UUID cannot be handled. Pydantic v1
# wants the opposite and does not work unless we use Any.
if PYDANTIC_V2:
    _Record: TypeAlias = dict[str, int | str | uuid.UUID | None]
else:
    _Record: TypeAlias = dict[str, Any]  # type: ignore
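
# For illustration: a single _Record is a plain mapping from column name to
# column value for one row of an opaque datastore table. The exact columns
# depend on the StoredDatastoreItemInfo subclass; the names below are
# hypothetical, e.g.
#
#     {"dataset_id": uuid.UUID(int=1), "path": "a/b/c.fits", "component": None}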



class SerializedDatastoreRecordData(_BaseModelCompat):
    """Representation of a `DatastoreRecordData` suitable for serialization."""

    dataset_ids: list[uuid.UUID]
    """List of dataset IDs."""

    records: Mapping[str, Mapping[str, list[_Record]]]
    """Lists of records indexed by record class name and table name."""

    @classmethod
    def direct(
        cls,
        *,
        dataset_ids: list[str | uuid.UUID],
        records: dict[str, dict[str, list[_Record]]],
    ) -> SerializedDatastoreRecordData:
        """Construct a `SerializedDatastoreRecordData` directly without
        validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        # See also comments in record_ids_to_uuid().
        for table_data in records.values():
            for table_records in table_data.values():
                for record in table_records:
                    # This only checks the dataset_id value; if there were any
                    # other columns holding UUIDs we would need a more generic
                    # approach.
                    if (id := record.get("dataset_id")) is not None:
                        record["dataset_id"] = uuid.UUID(id) if isinstance(id, str) else id

        data = cls.model_construct(
            _fields_set={"dataset_ids", "records"},
            # JSON makes strings out of UUIDs, so we need to convert them back.
            dataset_ids=[uuid.UUID(id) if isinstance(id, str) else id for id in dataset_ids],
            records=records,
        )

        return data
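
    # For illustration, a sketch of calling ``direct`` on trusted JSON input,
    # where UUIDs arrive as strings and are converted back (the class and
    # table names here are hypothetical):
    #
    #     serialized = SerializedDatastoreRecordData.direct(
    #         dataset_ids=["00000000-0000-0000-0000-000000000001"],
    #         records={
    #             "lsst.daf.butler.core.storedFileInfo.StoredFileInfo": {
    #                 "file_datastore_records": [
    #                     {"dataset_id": "00000000-0000-0000-0000-000000000001"}
    #                 ]
    #             }
    #         },
    #     )
    #     assert isinstance(serialized.dataset_ids[0], uuid.UUID)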



@dataclasses.dataclass
class DatastoreRecordData:
    """A struct that represents a tabular data export from a single
    datastore.
    """

    records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = dataclasses.field(
        default_factory=dict
    )
    """Opaque table data, indexed by dataset ID and grouped by opaque table
    name."""

    def update(self, other: DatastoreRecordData) -> None:
        """Update contents of this instance with data from another instance.

        Parameters
        ----------
        other : `DatastoreRecordData`
            Records to merge into this instance.

        Notes
        -----
        If a ``(dataset_id, table_name)`` combination has any records in
        ``self``, it is assumed that all records for that combination are
        already present. This allows duplicates of the same dataset to be
        handled gracefully.
        """
        for dataset_id, table_records in other.records.items():
            this_table_records = self.records.setdefault(dataset_id, {})
            for table_name, records in table_records.items():
                # If this (dataset_id, table_name) combination already has
                # records in `self`, we assume that means all of the records
                # for that combination; we require other code to ensure entire
                # (parent) datasets are exported to these data structures
                # (never components).
                if not (this_records := this_table_records.setdefault(table_name, [])):
                    this_records.extend(records)
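
    # For illustration: ``update`` keeps the existing records for any
    # ``(dataset_id, table_name)`` pair that is already populated in ``self``,
    # so merging a duplicate export is a no-op for that pair. A sketch, with
    # hypothetical names:
    #
    #     a = DatastoreRecordData(records={dataset_id: {"infos": [info1]}})
    #     b = DatastoreRecordData(records={dataset_id: {"infos": [info2]}})
    #     a.update(b)
    #     assert a.records[dataset_id]["infos"] == [info1]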


    def subset(self, dataset_ids: set[DatasetId]) -> DatastoreRecordData | None:
        """Extract a subset of the records that match the given dataset IDs.

        Parameters
        ----------
        dataset_ids : `set` [ `DatasetId` ]
            Dataset IDs to match.

        Returns
        -------
        record_data : `DatastoreRecordData` or `None`
            `None` is returned if there are no matching refs.

        Notes
        -----
        Records in the returned instance are shared with this instance;
        clients should not update or extend records in the returned instance.
        """
        matching_records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {}
        for dataset_id in dataset_ids:
            if (id_records := self.records.get(dataset_id)) is not None:
                matching_records[dataset_id] = id_records
        if matching_records:
            return DatastoreRecordData(records=matching_records)
        else:
            return None
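
    # For illustration: ``subset`` shares the per-dataset record dicts with
    # this instance instead of copying them, hence the warning above. A
    # sketch, with hypothetical names:
    #
    #     selected = data.subset({dataset_id})
    #     if selected is not None:
    #         assert selected.records[dataset_id] is data.records[dataset_id]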


    def to_simple(self, minimal: bool = False) -> SerializedDatastoreRecordData:
        """Make a representation of the object suitable for serialization.

        Implements the `~lsst.daf.butler.core.json.SupportsSimple` protocol.

        Parameters
        ----------
        minimal : `bool`, optional
            If `True`, produce a minimal representation. Not used by this
            method.

        Returns
        -------
        simple : `SerializedDatastoreRecordData`
            Representation of this instance suitable for serialization.
        """

        def _class_name(records: list[StoredDatastoreItemInfo]) -> str:
            """Get the fully qualified class name for the records. An empty
            string is returned if the list is empty; an exception is raised
            if the records are of different classes.
            """
            if not records:
                return ""
            classes = {record.__class__ for record in records}
            assert len(classes) == 1, f"Records have to be of the same class: {classes}"
            return get_full_type_name(classes.pop())

        records: dict[str, dict[str, list[_Record]]] = {}
        for table_data in self.records.values():
            for table_name, table_records in table_data.items():
                class_name = _class_name(table_records)
                class_records = records.setdefault(class_name, {})
                class_records.setdefault(table_name, []).extend(
                    [record.to_record() for record in table_records]
                )
        return SerializedDatastoreRecordData(dataset_ids=list(self.records.keys()), records=records)
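
    # For illustration: the serialized form regroups the per-dataset records
    # by record class name and then by table name. A sketch, with
    # hypothetical names:
    #
    #     simple = data.to_simple()
    #     # simple.records might look like:
    #     #     {"lsst.daf.butler.core.storedFileInfo.StoredFileInfo":
    #     #         {"file_datastore_records": [{"dataset_id": ..., ...}]}}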


    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatastoreRecordData,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
    ) -> DatastoreRecordData:
        """Make an instance of this class from serialized data.

        Implements the `~lsst.daf.butler.core.json.SupportsSimple` protocol.

        Parameters
        ----------
        simple : `SerializedDatastoreRecordData`
            Serialized representation returned from the `to_simple` method.
        universe : `DimensionUniverse`, optional
            Dimension universe, not used by this method.
        registry : `Registry`, optional
            Registry instance, not used by this method.

        Returns
        -------
        record_data : `DatastoreRecordData`
            De-serialized instance of `DatastoreRecordData`.
        """
        cache = PersistenceContextVars.dataStoreRecords.get()
        key = frozenset(simple.dataset_ids)
        if cache is not None and (cachedRecord := cache.get(key)) is not None:
            return cachedRecord
        records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {}
        # Make sure that all dataset IDs appear in the dict even if they don't
        # have records.
        for dataset_id in simple.dataset_ids:
            records[dataset_id] = {}
        for class_name, table_data in simple.records.items():
            klass = doImportType(class_name)
            if not issubclass(klass, StoredDatastoreItemInfo):
                raise RuntimeError(
                    "The class specified in the SerializedDatastoreRecordData "
                    f"({get_full_type_name(klass)}) is not a StoredDatastoreItemInfo."
                )
            for table_name, table_records in table_data.items():
                for record in table_records:
                    info = klass.from_record(record)
                    dataset_type_records = records.setdefault(info.dataset_id, {})
                    dataset_type_records.setdefault(table_name, []).append(info)
        newRecord = cls(records=records)
        if cache is not None:
            cache[key] = newRecord
        return newRecord
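

# For illustration: ``to_simple`` and ``from_simple`` are intended to round
# trip, and ``from_simple`` consults the PersistenceContextVars cache keyed on
# the frozen set of dataset IDs, so deserializing the same payload twice in
# one context returns the cached instance. A sketch:
#
#     simple = record_data.to_simple()
#     restored = DatastoreRecordData.from_simple(simple)
#     assert set(restored.records) == set(record_data.records)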