Coverage for python/lsst/daf/butler/transfers/_yaml.py: 14%

208 statements  

coverage.py v7.4.0, created at 2024-01-16 10:44 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

import uuid
import warnings
from collections import UserDict, defaultdict
from collections.abc import Iterable, Mapping
from datetime import datetime
from typing import IO, TYPE_CHECKING, Any

import astropy.time
import yaml
from lsst.resources import ResourcePath
from lsst.utils import doImportType
from lsst.utils.introspection import find_outside_stacklevel
from lsst.utils.iteration import ensure_iterable

from .._dataset_association import DatasetAssociation
from .._dataset_ref import DatasetId, DatasetRef
from .._dataset_type import DatasetType
from .._file_dataset import FileDataset
from .._named import NamedValueSet
from .._timespan import Timespan
from ..datastore import Datastore
from ..dimensions import DimensionElement, DimensionRecord, DimensionUniverse
from ..registry import CollectionType
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord, RunRecord, VersionTuple
from ..registry.sql_registry import SqlRegistry
from ..registry.versions import IncompatibleVersionError
from ._interfaces import RepoExportBackend, RepoImportBackend

if TYPE_CHECKING:
    from lsst.resources import ResourcePathExpression

EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 2)
"""Export format version.

Files with a different major version or a newer minor version cannot be read by
this version of the code.
"""
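# For example: with EXPORT_FORMAT_VERSION = 1.0.2, export files written as
# 1.0.0, 1.0.1, or 1.0.2 can be read, while 1.1.0 (newer minor version) and
# 2.0.0 (different major version) cannot.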


class _RefMapper(UserDict[int, uuid.UUID]):
    """A dict subclass that creates a new deterministic UUID for each
    missing key.
    """

    _namespace = uuid.UUID("4d4851f4-2890-4d41-8779-5f38a3f5062b")

    def __missing__(self, key: int) -> uuid.UUID:
        newUUID = uuid.uuid3(namespace=self._namespace, name=str(key))
        self[key] = newUUID
        return newUUID


_refIntId2UUID = _RefMapper()
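
# Illustrative sketch (doctest-style, not executed here): lookups in the mapper
# are deterministic, so a given legacy integer dataset ID always maps to the
# same UUID, within and across processes.
#
#     >>> _refIntId2UUID[42] == _refIntId2UUID[42]
#     True
#     >>> isinstance(_refIntId2UUID[42], uuid.UUID)
#     True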


def _uuid_representer(dumper: yaml.Dumper, data: uuid.UUID) -> yaml.Node:
    """Generate YAML representation for UUID.

    This produces a scalar node with the tag "!uuid" whose value is the
    regular string representation of the UUID.
    """
    return dumper.represent_scalar("!uuid", str(data))


def _uuid_constructor(loader: yaml.Loader, node: yaml.Node) -> uuid.UUID | None:
    if node.value is not None:
        return uuid.UUID(hex=node.value)
    return None


yaml.Dumper.add_representer(uuid.UUID, _uuid_representer)
yaml.SafeLoader.add_constructor("!uuid", _uuid_constructor)
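
# Illustrative sketch: with the representer and constructor registered above,
# UUID values round-trip through YAML via the "!uuid" tag (output shown is
# approximate).
#
#     >>> value = uuid.uuid4()
#     >>> text = yaml.dump(value)        # e.g. "!uuid '1b4e28ba-2fa1-...'"
#     >>> yaml.safe_load(text) == value
#     True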


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream : `io.IO`
        A writeable file-like object.
    universe : `DimensionUniverse`
        The dimension universe to use for the export.
    """

    def __init__(self, stream: IO, universe: DimensionUniverse):
        self.stream = stream
        self.universe = universe
        self.data: list[dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append(
            {
                "type": "dimension",
                "element": element.name,
                "records": data_dicts,
            }
        )

    def saveCollection(self, record: CollectionRecord, doc: str | None) -> None:
        # Docstring inherited from RepoExportBackend.saveCollections.
        data: dict[str, Any] = {
            "type": "collection",
            "collection_type": record.type.name,
            "name": record.name,
        }
        if doc is not None:
            data["doc"] = doc
        if isinstance(record, RunRecord):
            data["host"] = record.host
            data["timespan_begin"] = record.timespan.begin
            data["timespan_end"] = record.timespan.end
        elif isinstance(record, ChainedCollectionRecord):
            data["children"] = list(record.children)
        self.data.append(data)

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append(
            {
                "type": "dataset_type",
                "name": datasetType.name,
                "dimensions": list(datasetType.dimensions.names),
                "storage_class": datasetType.storageClass_name,
                "is_calibration": datasetType.isCalibration(),
            }
        )
        self.data.append(
            {
                "type": "dataset",
                "dataset_type": datasetType.name,
                "run": run,
                "records": [
                    {
                        "dataset_id": [ref.id for ref in sorted(dataset.refs)],
                        "data_id": [dict(ref.dataId.required) for ref in sorted(dataset.refs)],
                        "path": dataset.path,
                        "formatter": dataset.formatter,
                        # TODO: look up and save other collections
                    }
                    for dataset in datasets
                ],
            }
        )

    def saveDatasetAssociations(
        self, collection: str, collectionType: CollectionType, associations: Iterable[DatasetAssociation]
    ) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
        if collectionType is CollectionType.TAGGED:
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "dataset_ids": [assoc.ref.id for assoc in associations],
                }
            )
        elif collectionType is CollectionType.CALIBRATION:
            idsByTimespan: dict[Timespan, list[DatasetId]] = defaultdict(list)
            for association in associations:
                assert association.timespan is not None
                idsByTimespan[association.timespan].append(association.ref.id)
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "validity_ranges": [
                        {
                            "timespan": timespan,
                            "dataset_ids": dataset_ids,
                        }
                        for timespan, dataset_ids in idsByTimespan.items()
                    ],
                }
            )

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": str(EXPORT_FORMAT_VERSION),
                "universe_version": self.universe.version,
                "universe_namespace": self.universe.namespace,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
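
# Illustrative usage sketch (an assumption for illustration, not part of the
# original module; assumes DimensionUniverse() builds the default universe).
#
#     >>> import io
#     >>> buffer = io.StringIO()
#     >>> backend = YamlRepoExportBackend(buffer, DimensionUniverse())
#     >>> backend.finish()
#     >>> buffer.getvalue().startswith("description: Butler Data Repository Export")
#     True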


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream : `io.IO`
        A readable file-like object.
    registry : `SqlRegistry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: SqlRegistry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        if wrapper["version"] == 0:
            # Grandfather-in 'version: 0' -> 1.0.0, which is what we wrote
            # before we really tried to do versioning here.
            fileVersion = VersionTuple(1, 0, 0)
        else:
            fileVersion = VersionTuple.fromString(wrapper["version"])
        if fileVersion.major != EXPORT_FORMAT_VERSION.major:
            raise IncompatibleVersionError(
                f"Cannot read repository export file with version={fileVersion} "
                f"({EXPORT_FORMAT_VERSION.major}.x.x required)."
            )
        if fileVersion.minor > EXPORT_FORMAT_VERSION.minor:
            raise IncompatibleVersionError(
                f"Cannot read repository export file with version={fileVersion} "
                f"({EXPORT_FORMAT_VERSION.major}.{EXPORT_FORMAT_VERSION.minor}.x or earlier required)."
            )
        self.runs: dict[str, tuple[str | None, Timespan]] = {}
        self.chains: dict[str, list[str]] = {}
        self.collections: dict[str, CollectionType] = {}
        self.collectionDocs: dict[str, str] = {}
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, list[DimensionRecord]] = defaultdict(list)
        self.tagAssociations: dict[str, list[DatasetId]] = defaultdict(list)
        self.calibAssociations: dict[str, dict[Timespan, list[DatasetId]]] = defaultdict(dict)
        self.refsByFileId: dict[DatasetId, DatasetRef] = {}
        self.registry: SqlRegistry = registry

        universe_version = wrapper.get("universe_version", 0)
        universe_namespace = wrapper.get("universe_namespace", "daf_butler")

        # If this is data exported before the reorganization of visits and
        # visit systems and that new schema is in use, some filtering will be
        # needed. The entry in the visit dimension record will be silently
        # dropped when the visit is created, but the visit_system_membership
        # records must be constructed.
        migrate_visit_system = False
        if (
            universe_version < 2
            and universe_namespace == "daf_butler"
            and "visit_system_membership" in self.registry.dimensions
        ):
            migrate_visit_system = True

        # Drop "seeing" from visits in files older than version 1.
        migrate_visit_seeing = False
        if (
            universe_version < 1
            and universe_namespace == "daf_butler"
            and "visit" in self.registry.dimensions
            and "seeing" not in self.registry.dimensions["visit"].metadata
        ):
            migrate_visit_seeing = True

        datasetData = []
        RecordClass: type[DimensionRecord]
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy.
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back. Newer conversions use the
                        # _AstropyTimeToYAML class with a special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")

                if data["element"] == "visit":
                    if migrate_visit_system:
                        # Must create the visit_system_membership records.
                        # But first create an empty list for visits, since
                        # other logic in this file depends on self.dimensions
                        # being populated in an order consistent with primary
                        # keys.
                        self.dimensions[self.registry.dimensions["visit"]] = []
                        element = self.registry.dimensions["visit_system_membership"]
                        RecordClass = element.RecordClass
                        self.dimensions[element].extend(
                            RecordClass(
                                instrument=r["instrument"], visit_system=r.pop("visit_system"), visit=r["id"]
                            )
                            for r in data["records"]
                        )
                    if migrate_visit_seeing:
                        for record in data["records"]:
                            record.pop("seeing", None)

                element = self.registry.dimensions[data["element"]]
                RecordClass = element.RecordClass
                self.dimensions[element].extend(RecordClass(**r) for r in data["records"])

            elif data["type"] == "collection":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.RUN:
                    self.runs[data["name"]] = (
                        data["host"],
                        Timespan(begin=data["timespan_begin"], end=data["timespan_end"]),
                    )
                elif collectionType is CollectionType.CHAINED:
                    children = []
                    for child in data["children"]:
                        if not isinstance(child, str):
                            warnings.warn(
                                f"CHAINED collection {data['name']} includes restrictions on child "
                                "collection searches, which are no longer supported and will be ignored.",
                                stacklevel=find_outside_stacklevel("lsst.daf.butler"),
                            )
                            # Old form with dataset type restrictions only,
                            # supported for backwards compatibility.
                            child, _ = child
                        children.append(child)
                    self.chains[data["name"]] = children
                else:
                    self.collections[data["name"]] = collectionType
                doc = data.get("doc")
                if doc is not None:
                    self.collectionDocs[data["name"]] = doc
            elif data["type"] == "run":
                # Also support old form of saving a run with no extra info.
                self.runs[data["name"]] = (None, Timespan(None, None))
            elif data["type"] == "dataset_type":
                dimensions = data["dimensions"]
                if migrate_visit_system and "visit" in dimensions and "visit_system" in dimensions:
                    dimensions.remove("visit_system")
                self.datasetTypes.add(
                    DatasetType(
                        data["name"],
                        dimensions=dimensions,
                        storageClass=data["storage_class"],
                        universe=self.registry.dimensions,
                        isCalibration=data.get("is_calibration", False),
                    )
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            elif data["type"] == "associations":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.TAGGED:
                    self.tagAssociations[data["collection"]].extend(
                        [x if not isinstance(x, int) else _refIntId2UUID[x] for x in data["dataset_ids"]]
                    )
                elif collectionType is CollectionType.CALIBRATION:
                    assocsByTimespan = self.calibAssociations[data["collection"]]
                    for d in data["validity_ranges"]:
                        if "timespan" in d:
                            assocsByTimespan[d["timespan"]] = [
                                x if not isinstance(x, int) else _refIntId2UUID[x] for x in d["dataset_ids"]
                            ]
                        else:
                            # TODO: this is for backward compatibility, should
                            # be removed at some point.
                            assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = [
                                x if not isinstance(x, int) else _refIntId2UUID[x] for x in d["dataset_ids"]
                            ]
                else:
                    raise ValueError(f"Unexpected collection type for association: {collectionType.name}.")
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # Key is (dataset type name, run).
        self.datasets: Mapping[tuple[str, str], list[FileDataset]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                FileDataset(
                    d.get("path"),
                    [
                        DatasetRef(
                            datasetType,
                            dataId,
                            run=data["run"],
                            id=refid if not isinstance(refid, int) else _refIntId2UUID[refid],
                        )
                        for dataId, refid in zip(
                            ensure_iterable(d["data_id"]), ensure_iterable(d["dataset_id"]), strict=True
                        )
                    ],
                    formatter=doImportType(d.get("formatter")) if "formatter" in d else None,
                )
                for d in data["records"]
            )
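
    # Illustrative shape of a "dataset" entry consumed by the loop above
    # (values are hypothetical; the structure mirrors what
    # YamlRepoExportBackend.saveDatasets writes):
    #
    #     - type: dataset
    #       dataset_type: calexp
    #       run: HSC/runs/example
    #       records:
    #       - dataset_id: [!uuid '3f28...']
    #         data_id: [{instrument: HSC, visit: 903334, detector: 16}]
    #         path: calexp/v903334_d16.fits
    #         formatter: some.package.SomeFormatter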

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)
        for run in self.runs:
            self.registry.registerRun(run, doc=self.collectionDocs.get(run))
            # No way to add extra run info to registry yet.
        for collection, collection_type in self.collections.items():
            self.registry.registerCollection(
                collection, collection_type, doc=self.collectionDocs.get(collection)
            )
        for chain, children in self.chains.items():
            self.registry.registerCollection(
                chain, CollectionType.CHAINED, doc=self.collectionDocs.get(chain)
            )
            self.registry.setCollectionChain(chain, children)

    def load(
        self,
        datastore: Datastore | None,
        *,
        directory: ResourcePathExpression | None = None,
        transfer: str | None = None,
        skip_dimensions: set | None = None,
    ) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            # Using skip_existing=True here assumes that the records in the
            # database are either equivalent or at least preferable to the ones
            # being imported. It'd be ideal to check that, but that would mean
            # using syncDimensionData, which is not vectorized and is hence
            # unacceptably slow.
            self.registry.insertDimensionData(element, *dimensionRecords, skip_existing=True)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for records in self.datasets.values():
            # Make a big flattened list of all data IDs and dataset_ids, while
            # remembering slices that associate them with the FileDataset
            # instances they came from.
            datasets: list[DatasetRef] = []
            dataset_ids: list[DatasetId] = []
            slices = []
            for fileDataset in records:
                start = len(datasets)
                datasets.extend(fileDataset.refs)
                dataset_ids.extend(ref.id for ref in fileDataset.refs)
                stop = len(datasets)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry._importDatasets(datasets)
            # Populate our dictionary that maps int dataset_id values from the
            # export file to the new DatasetRefs.
            for fileId, ref in zip(dataset_ids, resolvedRefs, strict=True):
                self.refsByFileId[fileId] = ref
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, fileDataset in zip(slices, records, strict=True):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = ResourcePath(directory, forceDirectory=True).join(fileDataset.path)
                fileDatasets.append(fileDataset)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate datasets with tagged collections.
        for collection, dataset_ids in self.tagAssociations.items():
            self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
        # Associate datasets with calibration collections.
        for collection, idsByTimespan in self.calibAssociations.items():
            for timespan, dataset_ids in idsByTimespan.items():
                self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)
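
# Illustrative end-to-end sketch (``registry``, ``datastore``, and the file name
# are assumptions, not defined in this module):
#
#     >>> with open("export.yaml") as stream:
#     ...     backend = YamlRepoImportBackend(stream, registry)   # registry: SqlRegistry
#     >>> backend.register()                 # create dataset types and collections
#     >>> backend.load(datastore, transfer="auto")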