Coverage for python/lsst/daf/butler/transfers/_yaml.py: 13%

171 statements  

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

import uuid
import warnings
from collections import defaultdict
from datetime import datetime
from typing import IO, Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Type

import astropy.time
import yaml
from lsst.resources import ResourcePath
from lsst.utils import doImportType
from lsst.utils.iteration import ensure_iterable

from ..core import (
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DimensionElement,
    DimensionRecord,
    FileDataset,
    Timespan,
)
from ..core.named import NamedValueSet
from ..registry import CollectionType, Registry
from ..registry.interfaces import (
    ChainedCollectionRecord,
    CollectionRecord,
    DatasetIdGenEnum,
    RunRecord,
    VersionTuple,
)
from ..registry.versions import IncompatibleVersionError
from ._interfaces import RepoExportBackend, RepoImportBackend

EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 1)
"""Export format version.

Files with a different major version or a newer minor version cannot be read by
this version of the code.
"""


def _uuid_representer(dumper: yaml.Dumper, data: uuid.UUID) -> yaml.Node:
    """Generate YAML representation for UUID.

    This produces a scalar node with a tag "!uuid" and value being a regular
    string representation of UUID.
    """
    return dumper.represent_scalar("!uuid", str(data))


def _uuid_constructor(loader: yaml.Loader, node: yaml.Node) -> Optional[uuid.UUID]:
    if node.value is not None:
        return uuid.UUID(hex=node.value)
    return None


yaml.Dumper.add_representer(uuid.UUID, _uuid_representer)
yaml.SafeLoader.add_constructor("!uuid", _uuid_constructor)
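
# A minimal sketch (not part of the module) of what the registrations above
# provide: UUIDs survive a dump/safe_load round trip via the "!uuid" tag.
#
#     import io
#
#     buf = io.StringIO()
#     yaml.dump({"dataset_id": uuid.uuid4()}, stream=buf)
#     restored = yaml.safe_load(buf.getvalue())
#     assert isinstance(restored["dataset_id"], uuid.UUID)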


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data: List[Dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append(
            {
                "type": "dimension",
                "element": element.name,
                "records": data_dicts,
            }
        )

    def saveCollection(self, record: CollectionRecord, doc: Optional[str]) -> None:
        # Docstring inherited from RepoExportBackend.saveCollection.
        data: Dict[str, Any] = {
            "type": "collection",
            "collection_type": record.type.name,
            "name": record.name,
        }
        if doc is not None:
            data["doc"] = doc
        if isinstance(record, RunRecord):
            data["host"] = record.host
            data["timespan_begin"] = record.timespan.begin
            data["timespan_end"] = record.timespan.end
        elif isinstance(record, ChainedCollectionRecord):
            data["children"] = list(record.children)
        self.data.append(data)

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append(
            {
                "type": "dataset_type",
                "name": datasetType.name,
                "dimensions": [d.name for d in datasetType.dimensions],
                "storage_class": datasetType.storageClass.name,
                "is_calibration": datasetType.isCalibration(),
            }
        )
        self.data.append(
            {
                "type": "dataset",
                "dataset_type": datasetType.name,
                "run": run,
                "records": [
                    {
                        "dataset_id": [ref.id for ref in sorted(dataset.refs)],
                        "data_id": [ref.dataId.byName() for ref in sorted(dataset.refs)],
                        "path": dataset.path,
                        "formatter": dataset.formatter,
                        # TODO: look up and save other collections
                    }
                    for dataset in datasets
                ],
            }
        )

    def saveDatasetAssociations(
        self, collection: str, collectionType: CollectionType, associations: Iterable[DatasetAssociation]
    ) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
        if collectionType is CollectionType.TAGGED:
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "dataset_ids": [assoc.ref.id for assoc in associations],
                }
            )
        elif collectionType is CollectionType.CALIBRATION:
            idsByTimespan: Dict[Timespan, List[DatasetId]] = defaultdict(list)
            for association in associations:
                assert association.timespan is not None
                assert association.ref.id is not None
                idsByTimespan[association.timespan].append(association.ref.id)
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "validity_ranges": [
                        {
                            "timespan": timespan,
                            "dataset_ids": dataset_ids,
                        }
                        for timespan, dataset_ids in idsByTimespan.items()
                    ],
                }
            )

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": str(EXPORT_FORMAT_VERSION),
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
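
# A hedged usage sketch (not part of the module): writing an export file with
# this backend.  `run_record`, `dataset_type`, and `file_datasets` are
# placeholders for objects obtained from a Registry elsewhere.
#
#     with open("export.yaml", "w") as stream:
#         backend = YamlRepoExportBackend(stream)
#         backend.saveCollection(run_record, doc=None)
#         backend.saveDatasets(dataset_type, "my_run", *file_datasets)
#         backend.finish()
#
# finish() writes a single document with the top-level layout:
#
#     description: Butler Data Repository Export
#     version: 1.0.1
#     data:
#       - type: collection
#         ...
#       - type: dataset_type
#         ...
#       - type: dataset
#         ...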


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into.  Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        if wrapper["version"] == 0:
            # Grandfather-in 'version: 0' -> 1.0.0, which is what we wrote
            # before we really tried to do versioning here.
            fileVersion = VersionTuple(1, 0, 0)
        else:
            fileVersion = VersionTuple.fromString(wrapper["version"])
        if fileVersion.major != EXPORT_FORMAT_VERSION.major:
            raise IncompatibleVersionError(
                f"Cannot read repository export file with version={fileVersion} "
                f"({EXPORT_FORMAT_VERSION.major}.x.x required)."
            )
        if fileVersion.minor > EXPORT_FORMAT_VERSION.minor:
            raise IncompatibleVersionError(
                f"Cannot read repository export file with version={fileVersion} "
                f"< {EXPORT_FORMAT_VERSION.major}.{EXPORT_FORMAT_VERSION.minor}.x required."
            )
        self.runs: Dict[str, Tuple[Optional[str], Timespan]] = {}
        self.chains: Dict[str, List[str]] = {}
        self.collections: Dict[str, CollectionType] = {}
        self.collectionDocs: Dict[str, str] = {}
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.tagAssociations: Dict[str, List[DatasetId]] = defaultdict(list)
        self.calibAssociations: Dict[str, Dict[Timespan, List[DatasetId]]] = defaultdict(dict)
        self.refsByFileId: Dict[DatasetId, DatasetRef] = {}
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy.
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back.  Newer conversion uses _AstropyTimeToYAML
                        # class with special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: Type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(RecordClass(**r) for r in data["records"])
            elif data["type"] == "collection":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.RUN:
                    self.runs[data["name"]] = (
                        data["host"],
                        Timespan(begin=data["timespan_begin"], end=data["timespan_end"]),
                    )
                elif collectionType is CollectionType.CHAINED:
                    children = []
                    for child in data["children"]:
                        if not isinstance(child, str):
                            warnings.warn(
                                f"CHAINED collection {data['name']} includes restrictions on child "
                                "collection searches, which are no longer supported and will be ignored."
                            )
                            # Old form with dataset type restrictions only,
                            # supported for backwards compatibility.
                            child, _ = child
                        children.append(child)
                    self.chains[data["name"]] = children
                else:
                    self.collections[data["name"]] = collectionType
                doc = data.get("doc")
                if doc is not None:
                    self.collectionDocs[data["name"]] = doc
            elif data["type"] == "run":
                # Also support old form of saving a run with no extra info.
                self.runs[data["name"]] = (None, Timespan(None, None))
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(
                        data["name"],
                        dimensions=data["dimensions"],
                        storageClass=data["storage_class"],
                        universe=self.registry.dimensions,
                        isCalibration=data.get("is_calibration", False),
                    )
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            elif data["type"] == "associations":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.TAGGED:
                    self.tagAssociations[data["collection"]].extend(data["dataset_ids"])
                elif collectionType is CollectionType.CALIBRATION:
                    assocsByTimespan = self.calibAssociations[data["collection"]]
                    for d in data["validity_ranges"]:
                        if "timespan" in d:
                            assocsByTimespan[d["timespan"]] = d["dataset_ids"]
                        else:
                            # TODO: this is for backward compatibility, should
                            # be removed at some point.
                            assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = d["dataset_ids"]
                else:
                    raise ValueError(f"Unexpected collection type for association: {collectionType.name}.")
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # key is (dataset type name, run)
        self.datasets: Mapping[Tuple[str, str], List[FileDataset]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                FileDataset(
                    d.get("path"),
                    [
                        DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                        for dataId, refid in zip(
                            ensure_iterable(d["data_id"]), ensure_iterable(d["dataset_id"])
                        )
                    ],
                    formatter=doImportType(d.get("formatter")) if "formatter" in d else None,
                )
                for d in data["records"]
            )

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)
        for run in self.runs:
            self.registry.registerRun(run, doc=self.collectionDocs.get(run))
            # No way to add extra run info to registry yet.
        for collection, collection_type in self.collections.items():
            self.registry.registerCollection(
                collection, collection_type, doc=self.collectionDocs.get(collection)
            )
        for chain, children in self.chains.items():
            self.registry.registerCollection(
                chain, CollectionType.CHAINED, doc=self.collectionDocs.get(chain)
            )
            self.registry.setCollectionChain(chain, children)

    def load(
        self,
        datastore: Optional[Datastore],
        *,
        directory: Optional[str] = None,
        transfer: Optional[str] = None,
        skip_dimensions: Optional[Set] = None,
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            self.registry.insertDimensionData(element, *dimensionRecords)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            # Make a big flattened list of all data IDs and dataset_ids, while
            # remembering slices that associate them with the FileDataset
            # instances they came from.
            datasets: List[DatasetRef] = []
            dataset_ids: List[DatasetId] = []
            slices = []
            for fileDataset in records:
                start = len(datasets)
                datasets.extend(fileDataset.refs)
                dataset_ids.extend(ref.id for ref in fileDataset.refs)  # type: ignore
                stop = len(datasets)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry._importDatasets(
                datasets, idGenerationMode=idGenerationMode, reuseIds=reuseIds
            )
            # Populate our dictionary that maps int dataset_id values from the
            # export file to the new DatasetRefs.
            for fileId, ref in zip(dataset_ids, resolvedRefs):
                self.refsByFileId[fileId] = ref
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, fileDataset in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = ResourcePath(directory, forceDirectory=True).join(fileDataset.path)
                fileDatasets.append(fileDataset)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate datasets with tagged collections.
        for collection, dataset_ids in self.tagAssociations.items():
            self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
        # Associate datasets with calibration collections.
        for collection, idsByTimespan in self.calibAssociations.items():
            for timespan, dataset_ids in idsByTimespan.items():
                self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)
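
# A hedged end-to-end sketch (not part of the module): importing a previously
# exported repository.  `registry` and `datastore` are assumed to come from an
# existing Butler, and "export.yaml" / "datasets/" are placeholder paths.
#
#     with open("export.yaml", "r") as stream:
#         backend = YamlRepoImportBackend(stream, registry)
#     backend.register()  # create dataset types, runs, and other collections
#     backend.load(datastore, directory="datasets/", transfer="copy")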