Coverage for python/lsst/daf/butler/transfers/_yaml.py: 13%


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

import uuid
import warnings
from collections import UserDict, defaultdict
from collections.abc import Iterable, Mapping
from datetime import datetime
from typing import IO, TYPE_CHECKING, Any

import astropy.time
import yaml
from lsst.resources import ResourcePath
from lsst.utils import doImportType
from lsst.utils.iteration import ensure_iterable

from ..core import (
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DimensionElement,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Timespan,
)
from ..core.named import NamedValueSet
from ..registry import CollectionType, Registry
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord, RunRecord, VersionTuple
from ..registry.versions import IncompatibleVersionError
from ._interfaces import RepoExportBackend, RepoImportBackend

if TYPE_CHECKING:
    from lsst.resources import ResourcePathExpression

EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 2)
"""Export format version.

Files with a different major version or a newer minor version cannot be read by
this version of the code.
"""


class _RefMapper(UserDict[int, uuid.UUID]):
    """A `dict` subclass that creates a new deterministic UUID for each
    missing key.
    """

    _namespace = uuid.UUID("4d4851f4-2890-4d41-8779-5f38a3f5062b")

    def __missing__(self, key: int) -> uuid.UUID:
        newUUID = uuid.uuid3(namespace=self._namespace, name=str(key))
        self[key] = newUUID
        return newUUID


_refIntId2UUID = _RefMapper()
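# A sketch of the deterministic mapping (key value illustrative): repeated
# lookups of the same legacy integer dataset ID always return the same UUID,
# so cross-references within an export file stay consistent after conversion:
#
#     >>> _refIntId2UUID[42] == _refIntId2UUID[42]
#     True
#     >>> _refIntId2UUID[42] == uuid.uuid3(_RefMapper._namespace, "42")
#     True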


def _uuid_representer(dumper: yaml.Dumper, data: uuid.UUID) -> yaml.Node:
    """Generate YAML representation for UUID.

    This produces a scalar node with the tag "!uuid" whose value is the
    regular string representation of the UUID.
    """
    return dumper.represent_scalar("!uuid", str(data))


def _uuid_constructor(loader: yaml.Loader, node: yaml.Node) -> uuid.UUID | None:
    if node.value is not None:
        return uuid.UUID(hex=node.value)
    return None


yaml.Dumper.add_representer(uuid.UUID, _uuid_representer)
yaml.SafeLoader.add_constructor("!uuid", _uuid_constructor)
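# With the representer and constructor registered above, UUID values
# round-trip through YAML as "!uuid"-tagged scalars, e.g. (illustrative):
#
#     !uuid 00000000-0000-0000-0000-000000000001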


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    universe : `DimensionUniverse`
        Dimension universe whose version and namespace are recorded in the
        exported file.
    """

    def __init__(self, stream: IO, universe: DimensionUniverse):
        self.stream = stream
        self.universe = universe
        self.data: list[dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append(
            {
                "type": "dimension",
                "element": element.name,
                "records": data_dicts,
            }
        )

    def saveCollection(self, record: CollectionRecord, doc: str | None) -> None:
        # Docstring inherited from RepoExportBackend.saveCollection.
        data: dict[str, Any] = {
            "type": "collection",
            "collection_type": record.type.name,
            "name": record.name,
        }
        if doc is not None:
            data["doc"] = doc
        if isinstance(record, RunRecord):
            data["host"] = record.host
            data["timespan_begin"] = record.timespan.begin
            data["timespan_end"] = record.timespan.end
        elif isinstance(record, ChainedCollectionRecord):
            data["children"] = list(record.children)
        self.data.append(data)

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append(
            {
                "type": "dataset_type",
                "name": datasetType.name,
                "dimensions": [d.name for d in datasetType.dimensions],
                "storage_class": datasetType.storageClass_name,
                "is_calibration": datasetType.isCalibration(),
            }
        )
        self.data.append(
            {
                "type": "dataset",
                "dataset_type": datasetType.name,
                "run": run,
                "records": [
                    {
                        "dataset_id": [ref.id for ref in sorted(dataset.refs)],
                        "data_id": [ref.dataId.byName() for ref in sorted(dataset.refs)],
                        "path": dataset.path,
                        "formatter": dataset.formatter,
                        # TODO: look up and save other collections
                    }
                    for dataset in datasets
                ],
            }
        )

    def saveDatasetAssociations(
        self, collection: str, collectionType: CollectionType, associations: Iterable[DatasetAssociation]
    ) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
        if collectionType is CollectionType.TAGGED:
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "dataset_ids": [assoc.ref.id for assoc in associations],
                }
            )
        elif collectionType is CollectionType.CALIBRATION:
            idsByTimespan: dict[Timespan, list[DatasetId]] = defaultdict(list)
            for association in associations:
                assert association.timespan is not None
                idsByTimespan[association.timespan].append(association.ref.id)
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "validity_ranges": [
                        {
                            "timespan": timespan,
                            "dataset_ids": dataset_ids,
                        }
                        for timespan, dataset_ids in idsByTimespan.items()
                    ],
                }
            )

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": str(EXPORT_FORMAT_VERSION),
                "universe_version": self.universe.version,
                "universe_namespace": self.universe.namespace,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        if wrapper["version"] == 0:
            # Grandfather-in 'version: 0' -> 1.0.0, which is what we wrote
            # before we really tried to do versioning here.
            fileVersion = VersionTuple(1, 0, 0)
        else:
            fileVersion = VersionTuple.fromString(wrapper["version"])
        if fileVersion.major != EXPORT_FORMAT_VERSION.major:
            raise IncompatibleVersionError(
                f"Cannot read repository export file with version={fileVersion} "
                f"({EXPORT_FORMAT_VERSION.major}.x.x required)."
            )
        if fileVersion.minor > EXPORT_FORMAT_VERSION.minor:
            raise IncompatibleVersionError(
                f"Cannot read repository export file with version={fileVersion} "
                f"({EXPORT_FORMAT_VERSION.major}.{EXPORT_FORMAT_VERSION.minor}.x or earlier required)."
            )
        self.runs: dict[str, tuple[str | None, Timespan]] = {}
        self.chains: dict[str, list[str]] = {}
        self.collections: dict[str, CollectionType] = {}
        self.collectionDocs: dict[str, str] = {}
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, list[DimensionRecord]] = defaultdict(list)
        self.tagAssociations: dict[str, list[DatasetId]] = defaultdict(list)
        self.calibAssociations: dict[str, dict[Timespan, list[DatasetId]]] = defaultdict(dict)
        self.refsByFileId: dict[DatasetId, DatasetRef] = {}
        self.registry: Registry = registry

        universe_version = wrapper.get("universe_version", 0)
        universe_namespace = wrapper.get("universe_namespace", "daf_butler")

        # If this data was exported before the reorganization of visits and
        # visit systems, and the new schema is in use, some filtering will be
        # needed. The visit_system entry in each visit dimension record will
        # be silently dropped when the visit record is created, but the
        # corresponding visit_system_membership records must be constructed
        # explicitly.
        migrate_visit_system = False
        if (
            universe_version < 2
            and universe_namespace == "daf_butler"
            and "visit_system_membership" in self.registry.dimensions
        ):
            migrate_visit_system = True

        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy.
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back. Newer conversion uses the
                        # _AstropyTimeToYAML class with a special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(RecordClass(**r) for r in data["records"])

                if data["element"] == "visit" and migrate_visit_system:
                    # Must create the visit_system_membership records.
                    element = self.registry.dimensions["visit_system_membership"]
                    RecordClass = element.RecordClass
                    self.dimensions[element].extend(
                        RecordClass(instrument=r["instrument"], visit_system=r["visit_system"], visit=r["id"])
                        for r in data["records"]
                    )

            elif data["type"] == "collection":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.RUN:
                    self.runs[data["name"]] = (
                        data["host"],
                        Timespan(begin=data["timespan_begin"], end=data["timespan_end"]),
                    )
                elif collectionType is CollectionType.CHAINED:
                    children = []
                    for child in data["children"]:
                        if not isinstance(child, str):
                            warnings.warn(
                                f"CHAINED collection {data['name']} includes restrictions on child "
                                "collection searches, which are no longer supported and will be ignored."
                            )
                            # Old form with dataset type restrictions only,
                            # supported for backwards compatibility.
                            child, _ = child
                        children.append(child)
                    self.chains[data["name"]] = children
                else:
                    self.collections[data["name"]] = collectionType
                    doc = data.get("doc")
                    if doc is not None:
                        self.collectionDocs[data["name"]] = doc
            elif data["type"] == "run":
                # Also support old form of saving a run with no extra info.
                self.runs[data["name"]] = (None, Timespan(None, None))
            elif data["type"] == "dataset_type":
                dimensions = data["dimensions"]
                if migrate_visit_system and "visit" in dimensions and "visit_system" in dimensions:
                    dimensions.remove("visit_system")
                self.datasetTypes.add(
                    DatasetType(
                        data["name"],
                        dimensions=dimensions,
                        storageClass=data["storage_class"],
                        universe=self.registry.dimensions,
                        isCalibration=data.get("is_calibration", False),
                    )
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            elif data["type"] == "associations":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.TAGGED:
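                    # Dataset IDs in pre-UUID exports are plain integers;
                    # convert those to deterministic UUIDs via the
                    # module-level _refIntId2UUID mapper (newer exports
                    # already store UUIDs).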

                    self.tagAssociations[data["collection"]].extend(
                        [x if not isinstance(x, int) else _refIntId2UUID[x] for x in data["dataset_ids"]]
                    )
                elif collectionType is CollectionType.CALIBRATION:
                    assocsByTimespan = self.calibAssociations[data["collection"]]
                    for d in data["validity_ranges"]:
                        if "timespan" in d:
                            assocsByTimespan[d["timespan"]] = [
                                x if not isinstance(x, int) else _refIntId2UUID[x] for x in d["dataset_ids"]
                            ]
                        else:
                            # TODO: this is for backward compatibility, should
                            # be removed at some point.
                            assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = [
                                x if not isinstance(x, int) else _refIntId2UUID[x] for x in d["dataset_ids"]
                            ]
                else:
                    raise ValueError(f"Unexpected collection type for association: {collectionType.name}.")
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
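        # Each deferred "dataset" entry has the shape written by
        # YamlRepoExportBackend.saveDatasets, roughly (illustrative):
        #
        #     type: dataset
        #     dataset_type: <dataset type name>
        #     run: <run collection name>
        #     records:
        #       - dataset_id: [<uuid or legacy int>, ...]
        #         data_id: [{<dimension>: <value>, ...}, ...]
        #         path: <path to the file>
        #         formatter: <fully qualified formatter class, optional>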

        # key is (dataset type name, run)
        self.datasets: Mapping[tuple[str, str], list[FileDataset]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                FileDataset(
                    d.get("path"),
                    [
                        DatasetRef(
                            datasetType,
                            dataId,
                            run=data["run"],
                            id=refid if not isinstance(refid, int) else _refIntId2UUID[refid],
                        )
                        for dataId, refid in zip(
                            ensure_iterable(d["data_id"]), ensure_iterable(d["dataset_id"])
                        )
                    ],
                    formatter=doImportType(d.get("formatter")) if "formatter" in d else None,
                )
                for d in data["records"]
            )

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)
        for run in self.runs:
            self.registry.registerRun(run, doc=self.collectionDocs.get(run))
            # No way to add extra run info to registry yet.
        for collection, collection_type in self.collections.items():
            self.registry.registerCollection(
                collection, collection_type, doc=self.collectionDocs.get(collection)
            )
        for chain, children in self.chains.items():
            self.registry.registerCollection(
                chain, CollectionType.CHAINED, doc=self.collectionDocs.get(chain)
            )
            self.registry.setCollectionChain(chain, children)

    def load(
        self,
        datastore: Datastore | None,
        *,
        directory: ResourcePathExpression | None = None,
        transfer: str | None = None,
        skip_dimensions: set | None = None,
    ) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            # Using skip_existing=True here assumes that the records in the
            # database are either equivalent or at least preferable to the ones
            # being imported. It'd be ideal to check that, but that would mean
            # using syncDimensionData, which is not vectorized and is hence
            # unacceptably slow.
            self.registry.insertDimensionData(element, *dimensionRecords, skip_existing=True)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            # Make a big flattened list of all data IDs and dataset_ids, while
            # remembering slices that associate them with the FileDataset
            # instances they came from.
            datasets: list[DatasetRef] = []
            dataset_ids: list[DatasetId] = []
            slices = []
            for fileDataset in records:
                start = len(datasets)
                datasets.extend(fileDataset.refs)
                dataset_ids.extend(ref.id for ref in fileDataset.refs)
                stop = len(datasets)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry._importDatasets(datasets)
            # Populate our dictionary that maps int dataset_id values from the
            # export file to the new DatasetRefs
            for fileId, ref in zip(dataset_ids, resolvedRefs):
                self.refsByFileId[fileId] = ref
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, fileDataset in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = ResourcePath(directory, forceDirectory=True).join(fileDataset.path)
                fileDatasets.append(fileDataset)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate datasets with tagged collections.
        for collection, dataset_ids in self.tagAssociations.items():
            self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
        # Associate datasets with calibration collections.
        for collection, idsByTimespan in self.calibAssociations.items():
            for timespan, dataset_ids in idsByTimespan.items():
                self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)