Coverage for python/lsst/daf/butler/core/datastoreRecordData.py: 29%

88 statements  

coverage.py v7.2.7, created at 2023-07-14 19:21 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Support for generic data stores."""

from __future__ import annotations

__all__ = ("DatastoreRecordData", "SerializedDatastoreRecordData")

import dataclasses
import uuid
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any

from lsst.utils import doImportType
from lsst.utils.introspection import get_full_type_name

try:
    from pydantic.v1 import BaseModel
except ModuleNotFoundError:
    from pydantic import BaseModel  # type: ignore

from .datasets import DatasetId
from .dimensions import DimensionUniverse
from .persistenceContext import PersistenceContextVars
from .storedFileInfo import StoredDatastoreItemInfo

if TYPE_CHECKING:
    from ..registry import Registry

_Record = dict[str, Any]


class SerializedDatastoreRecordData(BaseModel):
    """Representation of a `DatastoreRecordData` suitable for serialization."""

    dataset_ids: list[uuid.UUID]
    """List of dataset IDs."""

    records: Mapping[str, Mapping[str, list[_Record]]]
    """Lists of records, indexed by record class name and then table name."""

    @classmethod
    def direct(
        cls,
        *,
        dataset_ids: list[str | uuid.UUID],
        records: dict[str, dict[str, list[_Record]]],
    ) -> SerializedDatastoreRecordData:
        """Construct a `SerializedDatastoreRecordData` directly without
        validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        data = SerializedDatastoreRecordData.__new__(cls)
        setter = object.__setattr__
        # JSON turns UUIDs into strings; convert them back.
        setter(data, "dataset_ids", [uuid.UUID(id) if isinstance(id, str) else id for id in dataset_ids])
        # See also comments in record_ids_to_uuid()
        for table_data in records.values():
            for table_records in table_data.values():
                for record in table_records:
                    # This only checks the dataset_id value; if there are any
                    # other columns that are UUIDs we would need a more
                    # generic approach.
                    if (id := record.get("dataset_id")) is not None:
                        record["dataset_id"] = uuid.UUID(id) if isinstance(id, str) else id
        setter(data, "records", records)
        return data
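
    # A minimal usage sketch (``ids`` and ``recs`` are hypothetical
    # stand-ins, not part of this module; the values must already be valid,
    # since ``direct`` skips pydantic validation):
    #
    #     serialized = SerializedDatastoreRecordData.direct(
    #         dataset_ids=ids,  # list of str or uuid.UUID
    #         records=recs,  # dict: class name -> table name -> list of records
    #     )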


@dataclasses.dataclass
class DatastoreRecordData:
    """A struct that represents a tabular data export from a single
    datastore.
    """

    records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = dataclasses.field(
        default_factory=dict
    )
    """Opaque table data, indexed by dataset ID and grouped by opaque table
    name."""

    def update(self, other: DatastoreRecordData) -> None:
        """Update contents of this instance with data from another instance.

        Parameters
        ----------
        other : `DatastoreRecordData`
            Records to merge into this instance.

        Notes
        -----
        Merged instances must not have identical records; duplicates are
        neither detected nor removed.
        """
        for dataset_id, table_records in other.records.items():
            this_table_records = self.records.setdefault(dataset_id, {})
            for table_name, records in table_records.items():
                this_table_records.setdefault(table_name, []).extend(records)
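
    # A minimal usage sketch (``data_a`` and ``data_b`` are hypothetical
    # instances that share no records, as required by the note above):
    #
    #     merged = DatastoreRecordData()
    #     merged.update(data_a)
    #     merged.update(data_b)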

    def subset(self, dataset_ids: set[DatasetId]) -> DatastoreRecordData | None:
        """Extract a subset of the records that match given dataset IDs.

        Parameters
        ----------
        dataset_ids : `set` [ `DatasetId` ]
            Dataset IDs to match.

        Returns
        -------
        record_data : `DatastoreRecordData` or `None`
            `None` is returned if there are no records matching the given
            dataset IDs.

        Notes
        -----
        Records in the returned instance are shared with this instance;
        clients should not update or extend records in the returned instance.
        """
        matching_records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {}
        for dataset_id in dataset_ids:
            if (id_records := self.records.get(dataset_id)) is not None:
                matching_records[dataset_id] = id_records
        if matching_records:
            return DatastoreRecordData(records=matching_records)
        else:
            return None
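
    # A minimal usage sketch (``data`` and ``wanted_ids`` are hypothetical):
    #
    #     subset_data = data.subset(wanted_ids)
    #     if subset_data is not None:
    #         ...  # treat as read-only; records are shared with ``data``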

    def to_simple(self, minimal: bool = False) -> SerializedDatastoreRecordData:
        """Make a representation of the object for serialization.

        Implements the `~lsst.daf.butler.core.json.SupportsSimple` protocol.

        Parameters
        ----------
        minimal : `bool`, optional
            If `True` produce a minimal representation; not used by this
            method.

        Returns
        -------
        simple : `SerializedDatastoreRecordData`
            Representation of this instance as a simple serializable model.
        """

        def _class_name(records: list[StoredDatastoreItemInfo]) -> str:
            """Get the fully qualified class name for the records. An empty
            string is returned if the list is empty; an exception is raised
            if the records are of different classes.
            """
            if not records:
                return ""
            classes = {record.__class__ for record in records}
            assert len(classes) == 1, f"Records have to be of the same class: {classes}"
            return get_full_type_name(classes.pop())

        records: dict[str, dict[str, list[_Record]]] = {}
        for table_data in self.records.values():
            for table_name, table_records in table_data.items():
                class_name = _class_name(table_records)
                class_records = records.setdefault(class_name, {})
                class_records.setdefault(table_name, []).extend(
                    [record.to_record() for record in table_records]
                )
        return SerializedDatastoreRecordData(dataset_ids=list(self.records.keys()), records=records)
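
    # A minimal serialization sketch (``data`` is a hypothetical populated
    # instance; ``json()`` is the standard pydantic v1 model method):
    #
    #     simple = data.to_simple()
    #     blob = simple.json()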

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatastoreRecordData,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
    ) -> DatastoreRecordData:
        """Make an instance of this class from serialized data.

        Implements the `~lsst.daf.butler.core.json.SupportsSimple` protocol.

        Parameters
        ----------
        simple : `SerializedDatastoreRecordData`
            Serialized representation returned from the `to_simple` method.
        universe : `DimensionUniverse`, optional
            Dimension universe, not used by this method.
        registry : `Registry`, optional
            Registry instance, not used by this method.

        Returns
        -------
        record_data : `DatastoreRecordData`
            De-serialized instance of `DatastoreRecordData`.
        """
        cache = PersistenceContextVars.dataStoreRecords.get()
        key = frozenset(simple.dataset_ids)
        if cache is not None and (cachedRecord := cache.get(key)) is not None:
            return cachedRecord
        records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {}
        # Make sure that all dataset IDs appear in the dict even if they
        # don't have records.
        for dataset_id in simple.dataset_ids:
            records[dataset_id] = {}
        for class_name, table_data in simple.records.items():
            klass = doImportType(class_name)
            if not issubclass(klass, StoredDatastoreItemInfo):
                raise RuntimeError(
                    "The class specified in the SerializedDatastoreRecordData "
                    f"({get_full_type_name(klass)}) is not a StoredDatastoreItemInfo."
                )
            for table_name, table_records in table_data.items():
                for record in table_records:
                    info = klass.from_record(record)
                    dataset_type_records = records.setdefault(info.dataset_id, {})
                    dataset_type_records.setdefault(table_name, []).append(info)
        newRecord = cls(records=records)
        if cache is not None:
            cache[key] = newRecord
        return newRecord
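
    # A minimal in-memory round-trip sketch (``data`` is a hypothetical
    # populated instance):
    #
    #     simple = data.to_simple()
    #     restored = DatastoreRecordData.from_simple(simple)
    #
    # Repeated calls with the same set of dataset IDs may return a cached
    # instance when a PersistenceContextVars cache is active.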