Coverage for python/lsst/daf/butler/transfers/_yaml.py: 11%

185 statements  

coverage.py v6.5.0, created at 2023-04-13 02:34 -0700

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

import uuid
import warnings
from collections import defaultdict
from datetime import datetime
from typing import IO, TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Type

import astropy.time
import yaml
from lsst.resources import ResourcePath
from lsst.utils import doImportType
from lsst.utils.iteration import ensure_iterable

from ..core import (
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DimensionElement,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Timespan,
)
from ..core.named import NamedValueSet
from ..registry import CollectionType, Registry
from ..registry.interfaces import (
    ChainedCollectionRecord,
    CollectionRecord,
    DatasetIdGenEnum,
    RunRecord,
    VersionTuple,
)
from ..registry.versions import IncompatibleVersionError
from ._interfaces import RepoExportBackend, RepoImportBackend

if TYPE_CHECKING:
    from lsst.resources import ResourcePathExpression

EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 2)
"""Export format version.

Files with a different major version or a newer minor version cannot be read
by this version of the code.
"""

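
# Illustrative sketch (not part of the original module): the compatibility
# rule enforced by `YamlRepoImportBackend` below is "same major version, and
# a file minor version no newer than the code's".  The helper name is
# hypothetical.
def _is_export_version_compatible(file_version: VersionTuple) -> bool:
    return (
        file_version.major == EXPORT_FORMAT_VERSION.major
        and file_version.minor <= EXPORT_FORMAT_VERSION.minor
    )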


def _uuid_representer(dumper: yaml.Dumper, data: uuid.UUID) -> yaml.Node:
    """Generate YAML representation for UUID.

    This produces a scalar node with a tag "!uuid" and value being a regular
    string representation of UUID.
    """
    return dumper.represent_scalar("!uuid", str(data))


def _uuid_constructor(loader: yaml.Loader, node: yaml.Node) -> Optional[uuid.UUID]:
    if node.value is not None:
        return uuid.UUID(hex=node.value)
    return None


yaml.Dumper.add_representer(uuid.UUID, _uuid_representer)
yaml.SafeLoader.add_constructor("!uuid", _uuid_constructor)

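
# Illustrative sketch (not part of the original module): with the handlers
# registered above, a `uuid.UUID` round-trips through YAML as a "!uuid"-tagged
# scalar.  The function name is hypothetical.
def _demo_uuid_yaml_roundtrip() -> None:
    original = uuid.uuid4()
    text = yaml.dump({"dataset_id": original})  # yaml.Dumper applies the "!uuid" representer
    restored = yaml.safe_load(text)["dataset_id"]  # SafeLoader constructor rebuilds the UUID
    assert restored == original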



class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    universe : `DimensionUniverse`
        The dimension universe of the repository being exported; its version
        and namespace are recorded in the export file.
    """


    def __init__(self, stream: IO, universe: DimensionUniverse):
        self.stream = stream
        self.universe = universe
        self.data: List[Dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append(
            {
                "type": "dimension",
                "element": element.name,
                "records": data_dicts,
            }
        )

    def saveCollection(self, record: CollectionRecord, doc: Optional[str]) -> None:
        # Docstring inherited from RepoExportBackend.saveCollection.
        data: Dict[str, Any] = {
            "type": "collection",
            "collection_type": record.type.name,
            "name": record.name,
        }
        if doc is not None:
            data["doc"] = doc
        if isinstance(record, RunRecord):
            data["host"] = record.host
            data["timespan_begin"] = record.timespan.begin
            data["timespan_end"] = record.timespan.end
        elif isinstance(record, ChainedCollectionRecord):
            data["children"] = list(record.children)
        self.data.append(data)

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append(
            {
                "type": "dataset_type",
                "name": datasetType.name,
                "dimensions": [d.name for d in datasetType.dimensions],
                "storage_class": datasetType.storageClass_name,
                "is_calibration": datasetType.isCalibration(),
            }
        )
        self.data.append(
            {
                "type": "dataset",
                "dataset_type": datasetType.name,
                "run": run,
                "records": [
                    {
                        "dataset_id": [ref.id for ref in sorted(dataset.refs)],
                        "data_id": [ref.dataId.byName() for ref in sorted(dataset.refs)],
                        "path": dataset.path,
                        "formatter": dataset.formatter,
                        # TODO: look up and save other collections
                    }
                    for dataset in datasets
                ],
            }
        )

    def saveDatasetAssociations(
        self, collection: str, collectionType: CollectionType, associations: Iterable[DatasetAssociation]
    ) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
        if collectionType is CollectionType.TAGGED:
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "dataset_ids": [assoc.ref.id for assoc in associations],
                }
            )
        elif collectionType is CollectionType.CALIBRATION:
            idsByTimespan: Dict[Timespan, List[DatasetId]] = defaultdict(list)
            for association in associations:
                assert association.timespan is not None
                assert association.ref.id is not None
                idsByTimespan[association.timespan].append(association.ref.id)
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "validity_ranges": [
                        {
                            "timespan": timespan,
                            "dataset_ids": dataset_ids,
                        }
                        for timespan, dataset_ids in idsByTimespan.items()
                    ],
                }
            )

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": str(EXPORT_FORMAT_VERSION),
                "universe_version": self.universe.version,
                "universe_namespace": self.universe.namespace,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )

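# Illustrative sketch (not part of the original module): the YAML document
# written by `YamlRepoExportBackend.finish` has the top-level shape below.
# The concrete values shown are hypothetical; only the keys come from the
# code above.
#
#   description: Butler Data Repository Export
#   version: 1.0.2
#   universe_version: 2
#   universe_namespace: daf_butler
#   data:
#     - type: dimension
#       element: instrument
#       records: [...]
#     - type: collection
#       collection_type: RUN
#       name: example/run
#       ...
#     - type: dataset_type
#       name: example_dataset_type
#       dimensions: [instrument, visit, detector]
#       storage_class: ExposureF
#       is_calibration: false
#     - type: dataset
#       dataset_type: example_dataset_type
#       run: example/run
#       records: [...]
#     - type: associations
#       collection: example/tagged
#       collection_type: TAGGED
#       dataset_ids: [...]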



class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """



    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        if wrapper["version"] == 0:
            # Grandfather-in 'version: 0' -> 1.0.0, which is what we wrote
            # before we really tried to do versioning here.
            fileVersion = VersionTuple(1, 0, 0)
        else:
            fileVersion = VersionTuple.fromString(wrapper["version"])
            if fileVersion.major != EXPORT_FORMAT_VERSION.major:
                raise IncompatibleVersionError(
                    f"Cannot read repository export file with version={fileVersion} "
                    f"({EXPORT_FORMAT_VERSION.major}.x.x required)."
                )
            if fileVersion.minor > EXPORT_FORMAT_VERSION.minor:
                raise IncompatibleVersionError(
                    f"Cannot read repository export file with version={fileVersion} "
                    f"< {EXPORT_FORMAT_VERSION.major}.{EXPORT_FORMAT_VERSION.minor}.x required."
                )
        self.runs: Dict[str, Tuple[Optional[str], Timespan]] = {}
        self.chains: Dict[str, List[str]] = {}
        self.collections: Dict[str, CollectionType] = {}
        self.collectionDocs: Dict[str, str] = {}
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.tagAssociations: Dict[str, List[DatasetId]] = defaultdict(list)
        self.calibAssociations: Dict[str, Dict[Timespan, List[DatasetId]]] = defaultdict(dict)
        self.refsByFileId: Dict[DatasetId, DatasetRef] = {}
        self.registry: Registry = registry

        universe_version = wrapper.get("universe_version", 0)
        universe_namespace = wrapper.get("universe_namespace", "daf_butler")

        # If this is data exported before the reorganization of visits
        # and visit systems and that new schema is in use, some filtering
        # will be needed. The entry in the visit dimension record will be
        # silently dropped when visit is created but the
        # visit_system_membership must be constructed.
        migrate_visit_system = False
        if (
            universe_version < 2
            and universe_namespace == "daf_butler"
            and "visit_system_membership" in self.registry.dimensions
        ):
            migrate_visit_system = True

        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # convert all datetime values to astropy
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back. Newer conversion uses _AstropyTimeToYAML
                        # class with special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: Type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(RecordClass(**r) for r in data["records"])

                if data["element"] == "visit" and migrate_visit_system:
                    # Must create the visit_system_membership records.
                    element = self.registry.dimensions["visit_system_membership"]
                    RecordClass = element.RecordClass
                    self.dimensions[element].extend(
                        RecordClass(instrument=r["instrument"], visit_system=r["visit_system"], visit=r["id"])
                        for r in data["records"]
                    )

            elif data["type"] == "collection":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.RUN:
                    self.runs[data["name"]] = (
                        data["host"],
                        Timespan(begin=data["timespan_begin"], end=data["timespan_end"]),
                    )
                elif collectionType is CollectionType.CHAINED:
                    children = []
                    for child in data["children"]:
                        if not isinstance(child, str):
                            warnings.warn(
                                f"CHAINED collection {data['name']} includes restrictions on child "
                                "collection searches, which are no longer supported and will be ignored."
                            )
                            # Old form with dataset type restrictions only,
                            # supported for backwards compatibility.
                            child, _ = child
                        children.append(child)
                    self.chains[data["name"]] = children
                else:
                    self.collections[data["name"]] = collectionType
                    doc = data.get("doc")
                    if doc is not None:
                        self.collectionDocs[data["name"]] = doc
            elif data["type"] == "run":
                # Also support old form of saving a run with no extra info.
                self.runs[data["name"]] = (None, Timespan(None, None))
            elif data["type"] == "dataset_type":
                dimensions = data["dimensions"]
                if migrate_visit_system and "visit" in dimensions and "visit_system" in dimensions:
                    dimensions.remove("visit_system")
                self.datasetTypes.add(
                    DatasetType(
                        data["name"],
                        dimensions=dimensions,
                        storageClass=data["storage_class"],
                        universe=self.registry.dimensions,
                        isCalibration=data.get("is_calibration", False),
                    )
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            elif data["type"] == "associations":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.TAGGED:
                    self.tagAssociations[data["collection"]].extend(data["dataset_ids"])
                elif collectionType is CollectionType.CALIBRATION:
                    assocsByTimespan = self.calibAssociations[data["collection"]]
                    for d in data["validity_ranges"]:
                        if "timespan" in d:
                            assocsByTimespan[d["timespan"]] = d["dataset_ids"]
                        else:
                            # TODO: this is for backward compatibility, should
                            # be removed at some point.
                            assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = d["dataset_ids"]
                else:
                    raise ValueError(f"Unexpected collection type for association: {collectionType.name}.")
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # key is (dataset type name, run)
        self.datasets: Mapping[Tuple[str, str], List[FileDataset]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                FileDataset(
                    d.get("path"),
                    [
                        DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                        for dataId, refid in zip(
                            ensure_iterable(d["data_id"]), ensure_iterable(d["dataset_id"])
                        )
                    ],
                    formatter=doImportType(d.get("formatter")) if "formatter" in d else None,
                )
                for d in data["records"]
            )


    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)
        for run in self.runs:
            self.registry.registerRun(run, doc=self.collectionDocs.get(run))
            # No way to add extra run info to registry yet.
        for collection, collection_type in self.collections.items():
            self.registry.registerCollection(
                collection, collection_type, doc=self.collectionDocs.get(collection)
            )
        for chain, children in self.chains.items():
            self.registry.registerCollection(
                chain, CollectionType.CHAINED, doc=self.collectionDocs.get(chain)
            )
            self.registry.setCollectionChain(chain, children)


    def load(
        self,
        datastore: Optional[Datastore],
        *,
        directory: ResourcePathExpression | None = None,
        transfer: Optional[str] = None,
        skip_dimensions: Optional[Set] = None,
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            # Using skip_existing=True here assumes that the records in the
            # database are either equivalent or at least preferable to the ones
            # being imported. It'd be ideal to check that, but that would mean
            # using syncDimensionData, which is not vectorized and is hence
            # unacceptably slow.
            self.registry.insertDimensionData(element, *dimensionRecords, skip_existing=True)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            # Make a big flattened list of all data IDs and dataset_ids, while
            # remembering slices that associate them with the FileDataset
            # instances they came from.
            datasets: List[DatasetRef] = []
            dataset_ids: List[DatasetId] = []
            slices = []
            for fileDataset in records:
                start = len(datasets)
                datasets.extend(fileDataset.refs)
                dataset_ids.extend(ref.id for ref in fileDataset.refs)  # type: ignore
                stop = len(datasets)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry._importDatasets(
                datasets, idGenerationMode=idGenerationMode, reuseIds=reuseIds
            )
            # Populate our dictionary that maps int dataset_id values from the
            # export file to the new DatasetRefs.
            for fileId, ref in zip(dataset_ids, resolvedRefs):
                self.refsByFileId[fileId] = ref
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, fileDataset in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = ResourcePath(directory, forceDirectory=True).join(fileDataset.path)
                fileDatasets.append(fileDataset)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate datasets with tagged collections.
        for collection, dataset_ids in self.tagAssociations.items():
            self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
        # Associate datasets with calibration collections.
        for collection, idsByTimespan in self.calibAssociations.items():
            for timespan, dataset_ids in idsByTimespan.items():
                self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)
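

# Illustrative usage sketch (not part of the original module): a typical
# import sequence, assuming already-constructed `registry` and `datastore`
# objects and an open export file.  The function name is hypothetical.
def _demo_import(stream: IO, registry: Registry, datastore: Datastore) -> None:
    backend = YamlRepoImportBackend(stream, registry)
    backend.register()  # create dataset types, runs, and other collections first
    # The transfer mode here is just an example; None would mean the files are
    # already where the datastore expects them.
    backend.load(datastore, transfer="copy")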