# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

import os
from datetime import datetime
from typing import (
    Any,
    Dict,
    IO,
    Iterable,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Type,
)
from collections import defaultdict

import yaml
import astropy.time

from lsst.utils import doImport
from ..core import (
    DatasetAssociation,
    DatasetRef,
    DatasetType,
    DataCoordinate,
    Datastore,
    DimensionElement,
    DimensionRecord,
    FileDataset,
    Timespan,
)
from ..core.utils import iterable
from ..core.named import NamedValueSet
from ..registry import CollectionType, Registry
from ..registry.wildcards import DatasetTypeRestriction
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord, RunRecord
from ._interfaces import RepoExportBackend, RepoImportBackend


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
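
    Examples
    --------
    A minimal, illustrative sketch of driving the backend by hand; in
    practice it is normally driven by the repository export machinery,
    which calls the various ``save*`` methods before ``finish``::

        import io

        stream = io.StringIO()
        backend = YamlRepoExportBackend(stream)
        # Calls to saveDimensionData, saveCollection, saveDatasets, and
        # saveDatasetAssociations would add real content here; with none,
        # only the top-level header is written.
        backend.finish()
        yaml_text = stream.getvalue()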

    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data: List[Dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append({
            "type": "dimension",
            "element": element.name,
            "records": data_dicts,
        })

    def saveCollection(self, record: CollectionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveCollection.
        data: Dict[str, Any] = {
            "type": "collection",
            "collection_type": record.type.name,
            "name": record.name,
        }
        if isinstance(record, RunRecord):
            data["host"] = record.host
            data["timespan_begin"] = record.timespan.begin
            data["timespan_end"] = record.timespan.end
        elif isinstance(record, ChainedCollectionRecord):
            data["children"] = [
                [name, list(restriction.names) if restriction.names is not ... else None]  # type: ignore
                for name, restriction in record.children
            ]
        self.data.append(data)

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append({
            "type": "dataset_type",
            "name": datasetType.name,
            "dimensions": [d.name for d in datasetType.dimensions],
            "storage_class": datasetType.storageClass.name,
            "is_calibration": datasetType.isCalibration(),
        })
        self.data.append({
            "type": "dataset",
            "dataset_type": datasetType.name,
            "run": run,
            "records": [
                {
                    "dataset_id": [ref.id for ref in sorted(dataset.refs)],
                    "data_id": [ref.dataId.byName() for ref in sorted(dataset.refs)],
                    "path": dataset.path,
                    "formatter": dataset.formatter,
                    # TODO: look up and save other collections
                }
                for dataset in datasets
            ]
        })

    def saveDatasetAssociations(self, collection: str, collectionType: CollectionType,
                                associations: Iterable[DatasetAssociation]) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
        if collectionType is CollectionType.TAGGED:
            self.data.append({
                "type": "associations",
                "collection": collection,
                "collection_type": collectionType.name,
                "dataset_ids": [assoc.ref.id for assoc in associations],
            })
        elif collectionType is CollectionType.CALIBRATION:
            idsByTimespan: Dict[Timespan, List[int]] = defaultdict(list)
            for association in associations:
                assert association.timespan is not None
                assert association.ref.id is not None
                idsByTimespan[association.timespan].append(association.ref.id)
            self.data.append({
                "type": "associations",
                "collection": collection,
                "collection_type": collectionType.name,
                "validity_ranges": [
                    {
                        "begin": timespan.begin,
                        "end": timespan.end,
                        "dataset_ids": dataset_ids,
                    }
                    for timespan, dataset_ids in idsByTimespan.items()
                ]
            })

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": 0,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
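
# For reference, a sketch of the document layout that ``finish`` above writes
# and that ``YamlRepoImportBackend`` below reads back.  This is illustrative
# only: the element names, IDs, and paths are hypothetical placeholders, only
# the entry types actually exported will appear, and per-type fields not shown
# here are elided:
#
#   description: Butler Data Repository Export
#   version: 0
#   data:
#     - type: dimension
#       element: ...
#       records: [...]
#     - type: collection
#       collection_type: RUN
#       name: ...
#     - type: dataset_type
#       name: ...
#       dimensions: [...]
#       storage_class: ...
#     - type: dataset
#       dataset_type: ...
#       run: ...
#       records: [{dataset_id: [...], data_id: [...], path: ..., formatter: ...}]
#     - type: associations
#       collection: ...
#       collection_type: TAGGED
#       dataset_ids: [...]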


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
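
    Examples
    --------
    A minimal, illustrative sketch; ``registry`` and ``datastore`` here are
    assumed to be an already-configured `Registry` and `Datastore` for the
    target repository, and ``export.yaml`` and the directory path are
    hypothetical names for an existing export file and its data files::

        with open("export.yaml") as stream:
            backend = YamlRepoImportBackend(stream, registry)
        backend.register()
        backend.load(datastore, directory="/some/export/dir", transfer="copy")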

    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        # TODO: When version numbers become meaningful, check here that we can
        # read the version in the file.
        self.runs: Dict[str, Tuple[Optional[str], Timespan]] = {}
        self.chains: Dict[str, List[Tuple[str, DatasetTypeRestriction]]] = {}
        self.collections: Dict[str, CollectionType] = {}
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.tagAssociations: Dict[str, List[int]] = defaultdict(list)
        self.calibAssociations: Dict[str, Dict[Timespan, List[int]]] = defaultdict(dict)
        self.refsByFileId: Dict[int, DatasetRef] = {}
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # convert all datetime values to astropy
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back. Newer conversion uses _AstropyTimeToYAML
                        # class with special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: Type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(
                    RecordClass(**r) for r in data["records"]
                )
            elif data["type"] == "collection":
                collectionType = CollectionType.__members__[data["collection_type"].upper()]
                if collectionType is CollectionType.RUN:
                    self.runs[data["name"]] = (
                        data["host"],
                        Timespan(begin=data["timespan_begin"], end=data["timespan_end"])
                    )
                elif collectionType is CollectionType.CHAINED:
                    children = []
                    for name, restriction_data in data["children"]:
                        if restriction_data is None:
                            restriction = DatasetTypeRestriction.any
                        else:
                            restriction = DatasetTypeRestriction.fromExpression(restriction_data)
                        children.append((name, restriction))
                    self.chains[data["name"]] = children
                else:
                    self.collections[data["name"]] = collectionType
            elif data["type"] == "run":
                # Also support old form of saving a run with no extra info.
                self.runs[data["name"]] = (None, Timespan(None, None))
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(data["name"], dimensions=data["dimensions"],
                                storageClass=data["storage_class"], universe=self.registry.dimensions,
                                isCalibration=data.get("is_calibration", False))
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            elif data["type"] == "associations":
                collectionType = CollectionType.__members__[data["collection_type"].upper()]
                if collectionType is CollectionType.TAGGED:
                    self.tagAssociations[data["collection"]].extend(data["dataset_ids"])
                elif collectionType is CollectionType.CALIBRATION:
                    assocsByTimespan = self.calibAssociations[data["collection"]]
                    for d in data["validity_ranges"]:
                        assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = d["dataset_ids"]
                else:
                    raise ValueError(f"Unexpected collection type for association: {collectionType.name}.")
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # key is (dataset type name, run)
        self.datasets: Mapping[Tuple[str, str], List[FileDataset]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                FileDataset(
                    d.get("path"),
                    [DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                     for dataId, refid in zip(iterable(d["data_id"]), iterable(d["dataset_id"]))],
                    formatter=doImport(d.get("formatter")) if "formatter" in d else None
                )
                for d in data["records"]
            )

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)
        for run in self.runs:
            self.registry.registerRun(run)
            # No way to add extra run info to registry yet.
        for collection, collection_type in self.collections.items():
            self.registry.registerCollection(collection, collection_type)
        for chain, children in self.chains.items():
            self.registry.registerCollection(chain, CollectionType.CHAINED)
            self.registry.setCollectionChain(chain, children)

    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None,
             skip_dimensions: Optional[Set] = None) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            self.registry.insertDimensionData(element, *dimensionRecords)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            datasetType = self.registry.getDatasetType(datasetTypeName)
            # Make a big flattened list of all data IDs and dataset_ids, while
            # remembering slices that associate them with the FileDataset
            # instances they came from.
            dataIds: List[DataCoordinate] = []
            dataset_ids: List[int] = []
            slices = []
            for fileDataset in records:
                start = len(dataIds)
                dataIds.extend(ref.dataId for ref in fileDataset.refs)
                dataset_ids.extend(ref.id for ref in fileDataset.refs)  # type: ignore
                stop = len(dataIds)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry.insertDatasets(
                datasetType,
                dataIds=dataIds,
                run=run,
            )
            # Populate our dictionary that maps int dataset_id values from the
            # export file to the new DatasetRefs.
            for fileId, ref in zip(dataset_ids, resolvedRefs):
                self.refsByFileId[fileId] = ref
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, fileDataset in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = os.path.join(directory, fileDataset.path)
                fileDatasets.append(fileDataset)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate datasets with tagged collections.
        for collection, dataset_ids in self.tagAssociations.items():
            self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
        # Associate datasets with calibration collections.
        for collection, idsByTimespan in self.calibAssociations.items():
            for timespan, dataset_ids in idsByTimespan.items():
                self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)