Coverage for python/lsst/daf/butler/transfers/_yaml.py: 11%

185 statements  

coverage.py v6.5.0, created at 2022-10-07 09:47 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

import uuid
import warnings
from collections import defaultdict
from datetime import datetime
from typing import IO, Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Type

import astropy.time
import yaml
from lsst.resources import ResourcePath
from lsst.utils import doImportType
from lsst.utils.iteration import ensure_iterable

from ..core import (
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DimensionElement,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Timespan,
)
from ..core.named import NamedValueSet
from ..registry import CollectionType, Registry
from ..registry.interfaces import (
    ChainedCollectionRecord,
    CollectionRecord,
    DatasetIdGenEnum,
    RunRecord,
    VersionTuple,
)
from ..registry.versions import IncompatibleVersionError
from ._interfaces import RepoExportBackend, RepoImportBackend

EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 2)
"""Export format version.

Files with a different major version or a newer minor version cannot be read
by this version of the code.
"""
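
# Illustrative sketch (not part of the original module): the compatibility rule
# enforced when reading an export file is that the file's major version must
# match EXPORT_FORMAT_VERSION.major and its minor version must not be newer
# than EXPORT_FORMAT_VERSION.minor. With EXPORT_FORMAT_VERSION = 1.0.2:
#
#     >>> for s in ("1.0.0", "1.0.2", "1.1.0", "2.0.0"):
#     ...     v = VersionTuple.fromString(s)
#     ...     ok = v.major == EXPORT_FORMAT_VERSION.major and v.minor <= EXPORT_FORMAT_VERSION.minor
#     ...     print(s, "readable" if ok else "rejected")
#     1.0.0 readable
#     1.0.2 readable
#     1.1.0 rejected
#     2.0.0 rejected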


def _uuid_representer(dumper: yaml.Dumper, data: uuid.UUID) -> yaml.Node:
    """Generate YAML representation for UUID.

    This produces a scalar node with a tag "!uuid" and value being a regular
    string representation of UUID.
    """
    return dumper.represent_scalar("!uuid", str(data))


def _uuid_constructor(loader: yaml.Loader, node: yaml.Node) -> Optional[uuid.UUID]:
    if node.value is not None:
        return uuid.UUID(hex=node.value)
    return None


yaml.Dumper.add_representer(uuid.UUID, _uuid_representer)
yaml.SafeLoader.add_constructor("!uuid", _uuid_constructor)
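
# Illustrative sketch (not part of the original module): with the handlers
# registered above, yaml.dump writes a UUID as a "!uuid"-tagged scalar and
# yaml.safe_load turns it back into a uuid.UUID:
#
#     >>> value = uuid.uuid4()
#     >>> yaml.safe_load(yaml.dump(value)) == value
#     True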


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    universe : `DimensionUniverse`
        The dimension universe of the repository being exported.
    """

    def __init__(self, stream: IO, universe: DimensionUniverse):
        self.stream = stream
        self.universe = universe
        self.data: List[Dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append(
            {
                "type": "dimension",
                "element": element.name,
                "records": data_dicts,
            }
        )

    def saveCollection(self, record: CollectionRecord, doc: Optional[str]) -> None:
        # Docstring inherited from RepoExportBackend.saveCollections.
        data: Dict[str, Any] = {
            "type": "collection",
            "collection_type": record.type.name,
            "name": record.name,
        }
        if doc is not None:
            data["doc"] = doc
        if isinstance(record, RunRecord):
            data["host"] = record.host
            data["timespan_begin"] = record.timespan.begin
            data["timespan_end"] = record.timespan.end
        elif isinstance(record, ChainedCollectionRecord):
            data["children"] = list(record.children)
        self.data.append(data)

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append(
            {
                "type": "dataset_type",
                "name": datasetType.name,
                "dimensions": [d.name for d in datasetType.dimensions],
                "storage_class": datasetType.storageClass_name,
                "is_calibration": datasetType.isCalibration(),
            }
        )
        self.data.append(
            {
                "type": "dataset",
                "dataset_type": datasetType.name,
                "run": run,
                "records": [
                    {
                        "dataset_id": [ref.id for ref in sorted(dataset.refs)],
                        "data_id": [ref.dataId.byName() for ref in sorted(dataset.refs)],
                        "path": dataset.path,
                        "formatter": dataset.formatter,
                        # TODO: look up and save other collections
                    }
                    for dataset in datasets
                ],
            }
        )

    def saveDatasetAssociations(
        self, collection: str, collectionType: CollectionType, associations: Iterable[DatasetAssociation]
    ) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
        if collectionType is CollectionType.TAGGED:
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "dataset_ids": [assoc.ref.id for assoc in associations],
                }
            )
        elif collectionType is CollectionType.CALIBRATION:
            idsByTimespan: Dict[Timespan, List[DatasetId]] = defaultdict(list)
            for association in associations:
                assert association.timespan is not None
                assert association.ref.id is not None
                idsByTimespan[association.timespan].append(association.ref.id)
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "validity_ranges": [
                        {
                            "timespan": timespan,
                            "dataset_ids": dataset_ids,
                        }
                        for timespan, dataset_ids in idsByTimespan.items()
                    ],
                }
            )

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": str(EXPORT_FORMAT_VERSION),
                "universe_version": self.universe.version,
                "universe_namespace": self.universe.namespace,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
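
# Illustrative sketch (not part of the original module): driving the export
# backend directly with an in-memory stream. `DimensionUniverse()` below is
# assumed to construct the default daf_butler dimension universe.
#
#     >>> import io
#     >>> stream = io.StringIO()
#     >>> backend = YamlRepoExportBackend(stream, DimensionUniverse())
#     >>> backend.finish()
#     >>> print(stream.getvalue().splitlines()[0])
#     description: Butler Data Repository Export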


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        if wrapper["version"] == 0:
            # Grandfather-in 'version: 0' -> 1.0.0, which is what we wrote
            # before we really tried to do versioning here.
            fileVersion = VersionTuple(1, 0, 0)
        else:
            fileVersion = VersionTuple.fromString(wrapper["version"])
        if fileVersion.major != EXPORT_FORMAT_VERSION.major:
            raise IncompatibleVersionError(
                f"Cannot read repository export file with version={fileVersion} "
                f"({EXPORT_FORMAT_VERSION.major}.x.x required)."
            )
        if fileVersion.minor > EXPORT_FORMAT_VERSION.minor:
            raise IncompatibleVersionError(
                f"Cannot read repository export file with version={fileVersion} "
                f"({EXPORT_FORMAT_VERSION.major}.{EXPORT_FORMAT_VERSION.minor}.x or earlier required)."
            )
        self.runs: Dict[str, Tuple[Optional[str], Timespan]] = {}
        self.chains: Dict[str, List[str]] = {}
        self.collections: Dict[str, CollectionType] = {}
        self.collectionDocs: Dict[str, str] = {}
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.tagAssociations: Dict[str, List[DatasetId]] = defaultdict(list)
        self.calibAssociations: Dict[str, Dict[Timespan, List[DatasetId]]] = defaultdict(dict)
        self.refsByFileId: Dict[DatasetId, DatasetRef] = {}
        self.registry: Registry = registry

        universe_version = wrapper.get("universe_version", 0)
        universe_namespace = wrapper.get("universe_namespace", "daf_butler")

        # If this is data exported before the reorganization of visits
        # and visit systems and that new schema is in use, some filtering
        # will be needed. The entry in the visit dimension record will be
        # silently dropped when the visit is created, but the
        # visit_system_membership records must be constructed.
        migrate_visit_system = False
        if (
            universe_version < 2
            and universe_namespace == "daf_butler"
            and "visit_system_membership" in self.registry.dimensions
        ):
            migrate_visit_system = True

        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy times.
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back. Newer conversion uses _AstropyTimeToYAML
                        # class with special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: Type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(RecordClass(**r) for r in data["records"])

                if data["element"] == "visit" and migrate_visit_system:
                    # Must create the visit_system_membership records.
                    element = self.registry.dimensions["visit_system_membership"]
                    RecordClass = element.RecordClass
                    self.dimensions[element].extend(
                        RecordClass(instrument=r["instrument"], visit_system=r["visit_system"], visit=r["id"])
                        for r in data["records"]
                    )

            elif data["type"] == "collection":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.RUN:
                    self.runs[data["name"]] = (
                        data["host"],
                        Timespan(begin=data["timespan_begin"], end=data["timespan_end"]),
                    )
                elif collectionType is CollectionType.CHAINED:
                    children = []
                    for child in data["children"]:
                        if not isinstance(child, str):
                            warnings.warn(
                                f"CHAINED collection {data['name']} includes restrictions on child "
                                "collection searches, which are no longer supported and will be ignored."
                            )
                            # Old form with dataset type restrictions only,
                            # supported for backwards compatibility.
                            child, _ = child
                        children.append(child)
                    self.chains[data["name"]] = children
                else:
                    self.collections[data["name"]] = collectionType
                    doc = data.get("doc")
                    if doc is not None:
                        self.collectionDocs[data["name"]] = doc
            elif data["type"] == "run":
                # Also support old form of saving a run with no extra info.
                self.runs[data["name"]] = (None, Timespan(None, None))
            elif data["type"] == "dataset_type":
                dimensions = data["dimensions"]
                if migrate_visit_system and "visit" in dimensions and "visit_system" in dimensions:
                    dimensions.remove("visit_system")
                self.datasetTypes.add(
                    DatasetType(
                        data["name"],
                        dimensions=dimensions,
                        storageClass=data["storage_class"],
                        universe=self.registry.dimensions,
                        isCalibration=data.get("is_calibration", False),
                    )
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            elif data["type"] == "associations":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.TAGGED:
                    self.tagAssociations[data["collection"]].extend(data["dataset_ids"])
                elif collectionType is CollectionType.CALIBRATION:
                    assocsByTimespan = self.calibAssociations[data["collection"]]
                    for d in data["validity_ranges"]:
                        if "timespan" in d:
                            assocsByTimespan[d["timespan"]] = d["dataset_ids"]
                        else:
                            # TODO: this is for backward compatibility, should
                            # be removed at some point.
                            assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = d["dataset_ids"]
                else:
                    raise ValueError(f"Unexpected collection type for association: {collectionType.name}.")
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # Key is (dataset type name, run).
        self.datasets: Mapping[Tuple[str, str], List[FileDataset]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                FileDataset(
                    d.get("path"),
                    [
                        DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                        for dataId, refid in zip(
                            ensure_iterable(d["data_id"]), ensure_iterable(d["dataset_id"])
                        )
                    ],
                    formatter=doImportType(d.get("formatter")) if "formatter" in d else None,
                )
                for d in data["records"]
            )

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)
        for run in self.runs:
            self.registry.registerRun(run, doc=self.collectionDocs.get(run))
            # No way to add extra run info to registry yet.
        for collection, collection_type in self.collections.items():
            self.registry.registerCollection(
                collection, collection_type, doc=self.collectionDocs.get(collection)
            )
        for chain, children in self.chains.items():
            self.registry.registerCollection(
                chain, CollectionType.CHAINED, doc=self.collectionDocs.get(chain)
            )
            self.registry.setCollectionChain(chain, children)

    def load(
        self,
        datastore: Optional[Datastore],
        *,
        directory: Optional[str] = None,
        transfer: Optional[str] = None,
        skip_dimensions: Optional[Set] = None,
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            # Using skip_existing=True here assumes that the records in the
            # database are either equivalent to or at least preferable to the
            # ones being imported. It'd be ideal to check that, but that would
            # mean using syncDimensionData, which is not vectorized and is
            # hence unacceptably slow.
            self.registry.insertDimensionData(element, *dimensionRecords, skip_existing=True)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            # Make a big flattened list of all data IDs and dataset_ids, while
            # remembering slices that associate them with the FileDataset
            # instances they came from.
            datasets: List[DatasetRef] = []
            dataset_ids: List[DatasetId] = []
            slices = []
            for fileDataset in records:
                start = len(datasets)
                datasets.extend(fileDataset.refs)
                dataset_ids.extend(ref.id for ref in fileDataset.refs)  # type: ignore
                stop = len(datasets)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry._importDatasets(
                datasets, idGenerationMode=idGenerationMode, reuseIds=reuseIds
            )
            # Populate our dictionary that maps int dataset_id values from the
            # export file to the new DatasetRefs.
            for fileId, ref in zip(dataset_ids, resolvedRefs):
                self.refsByFileId[fileId] = ref
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, fileDataset in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = ResourcePath(directory, forceDirectory=True).join(fileDataset.path)
                fileDatasets.append(fileDataset)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate datasets with tagged collections.
        for collection, dataset_ids in self.tagAssociations.items():
            self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
        # Associate datasets with calibration collections.
        for collection, idsByTimespan in self.calibAssociations.items():
            for timespan, dataset_ids in idsByTimespan.items():
                self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)
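
# Illustrative sketch (not part of the original module): the typical call
# sequence for an import. `butler` is assumed to be an existing `Butler`
# instance and "export.yaml" a file written by YamlRepoExportBackend.
#
#     >>> with open("export.yaml") as stream:
#     ...     backend = YamlRepoImportBackend(stream, butler.registry)
#     >>> backend.register()
#     >>> backend.load(butler.datastore, directory="/path/to/exported/files", transfer="copy")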