Coverage for python/lsst/daf/butler/transfers/_yaml.py: 12%


171 statements  

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

from datetime import datetime
from typing import (
    Any,
    Dict,
    IO,
    Iterable,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Type,
)
import uuid
import warnings
from collections import defaultdict

import yaml
import astropy.time

from lsst.utils import doImportType
from lsst.utils.iteration import ensure_iterable
from ..core import (
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DimensionElement,
    DimensionRecord,
    FileDataset,
    Timespan,
)
from ..core._butlerUri import ButlerURI
from ..core.named import NamedValueSet
from ..registry import CollectionType, Registry
from ..registry.interfaces import (
    ChainedCollectionRecord,
    CollectionRecord,
    DatasetIdGenEnum,
    RunRecord,
    VersionTuple,
)
from ..registry.versions import IncompatibleVersionError
from ._interfaces import RepoExportBackend, RepoImportBackend


EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 1)
"""Export format version.

Files with a different major version or a newer minor version cannot be read by
this version of the code.
"""
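# Illustrative note (not part of the original module): with
# EXPORT_FORMAT_VERSION = 1.0.1, an export file declaring version "1.0.0" is
# still readable (same major version, no newer minor version), while files
# declaring "1.1.0" or "2.0.0" are rejected by the checks in
# YamlRepoImportBackend.__init__ below.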


def _uuid_representer(dumper: yaml.Dumper, data: uuid.UUID) -> yaml.Node:
    """Generate YAML representation for UUID.

    This produces a scalar node with a tag "!uuid" and value being a regular
    string representation of UUID.
    """
    return dumper.represent_scalar("!uuid", str(data))


def _uuid_constructor(loader: yaml.Loader, node: yaml.Node) -> Optional[uuid.UUID]:
    if node.value is not None:
        return uuid.UUID(hex=node.value)
    return None


yaml.Dumper.add_representer(uuid.UUID, _uuid_representer)
yaml.SafeLoader.add_constructor("!uuid", _uuid_constructor)
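
# A minimal round-trip sketch (illustrative, not part of the original module;
# it assumes PyYAML's default Dumper, which is what YamlRepoExportBackend.finish
# below relies on via yaml.dump):
#
#     original = uuid.uuid4()
#     text = yaml.dump(original)        # emits a scalar tagged "!uuid"
#     assert yaml.safe_load(text) == original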


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data: List[Dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append({
            "type": "dimension",
            "element": element.name,
            "records": data_dicts,
        })

    def saveCollection(self, record: CollectionRecord, doc: Optional[str]) -> None:
        # Docstring inherited from RepoExportBackend.saveCollections.
        data: Dict[str, Any] = {
            "type": "collection",
            "collection_type": record.type.name,
            "name": record.name,
        }
        if doc is not None:
            data["doc"] = doc
        if isinstance(record, RunRecord):
            data["host"] = record.host
            data["timespan_begin"] = record.timespan.begin
            data["timespan_end"] = record.timespan.end
        elif isinstance(record, ChainedCollectionRecord):
            data["children"] = list(record.children)
        self.data.append(data)

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append({
            "type": "dataset_type",
            "name": datasetType.name,
            "dimensions": [d.name for d in datasetType.dimensions],
            "storage_class": datasetType.storageClass.name,
            "is_calibration": datasetType.isCalibration(),
        })
        self.data.append({
            "type": "dataset",
            "dataset_type": datasetType.name,
            "run": run,
            "records": [
                {
                    "dataset_id": [ref.id for ref in sorted(dataset.refs)],
                    "data_id": [ref.dataId.byName() for ref in sorted(dataset.refs)],
                    "path": dataset.path,
                    "formatter": dataset.formatter,
                    # TODO: look up and save other collections
                }
                for dataset in datasets
            ]
        })

    def saveDatasetAssociations(self, collection: str, collectionType: CollectionType,
                                associations: Iterable[DatasetAssociation]) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
        if collectionType is CollectionType.TAGGED:
            self.data.append({
                "type": "associations",
                "collection": collection,
                "collection_type": collectionType.name,
                "dataset_ids": [assoc.ref.id for assoc in associations],
            })
        elif collectionType is CollectionType.CALIBRATION:
            idsByTimespan: Dict[Timespan, List[DatasetId]] = defaultdict(list)
            for association in associations:
                assert association.timespan is not None
                assert association.ref.id is not None
                idsByTimespan[association.timespan].append(association.ref.id)
            self.data.append({
                "type": "associations",
                "collection": collection,
                "collection_type": collectionType.name,
                "validity_ranges": [
                    {
                        "timespan": timespan,
                        "dataset_ids": dataset_ids,
                    }
                    for timespan, dataset_ids in idsByTimespan.items()
                ]
            })

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": str(EXPORT_FORMAT_VERSION),
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
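
# A minimal usage sketch for the export backend (illustrative only, not part of
# the original module; in practice this class is driven by the higher-level
# export machinery, which supplies the records passed to the save* methods):
#
#     with open("export.yaml", "w") as stream:
#         backend = YamlRepoExportBackend(stream)
#         # ... backend.saveDimensionData(...), backend.saveCollection(...),
#         # backend.saveDatasets(...) calls made by the export machinery ...
#         backend.finish()  # nothing is written to the stream until finish()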


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into.  Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        if wrapper["version"] == 0:
            # Grandfather-in 'version: 0' -> 1.0.0, which is what we wrote
            # before we really tried to do versioning here.
            fileVersion = VersionTuple(1, 0, 0)
        else:
            fileVersion = VersionTuple.fromString(wrapper["version"])
            if fileVersion.major != EXPORT_FORMAT_VERSION.major:
                raise IncompatibleVersionError(
                    f"Cannot read repository export file with version={fileVersion} "
                    f"({EXPORT_FORMAT_VERSION.major}.x.x required)."
                )
            if fileVersion.minor > EXPORT_FORMAT_VERSION.minor:
                raise IncompatibleVersionError(
                    f"Cannot read repository export file with version={fileVersion} "
                    f"({EXPORT_FORMAT_VERSION.major}.{EXPORT_FORMAT_VERSION.minor}.x or older required)."
                )

        self.runs: Dict[str, Tuple[Optional[str], Timespan]] = {}
        self.chains: Dict[str, List[str]] = {}
        self.collections: Dict[str, CollectionType] = {}
        self.collectionDocs: Dict[str, str] = {}
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.tagAssociations: Dict[str, List[DatasetId]] = defaultdict(list)
        self.calibAssociations: Dict[str, Dict[Timespan, List[DatasetId]]] = defaultdict(dict)
        self.refsByFileId: Dict[DatasetId, DatasetRef] = {}
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy.
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back.  Newer conversion uses the
                        # _AstropyTimeToYAML class with a special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: Type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(
                    RecordClass(**r) for r in data["records"]
                )
            elif data["type"] == "collection":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.RUN:
                    self.runs[data["name"]] = (
                        data["host"],
                        Timespan(begin=data["timespan_begin"], end=data["timespan_end"])
                    )
                elif collectionType is CollectionType.CHAINED:
                    children = []
                    for child in data["children"]:
                        if not isinstance(child, str):
                            warnings.warn(
                                f"CHAINED collection {data['name']} includes restrictions on child "
                                "collection searches, which are no longer supported and will be ignored."
                            )
                            # Old form with dataset type restrictions only,
                            # supported for backwards compatibility.
                            child, _ = child
                        children.append(child)
                    self.chains[data["name"]] = children
                else:
                    self.collections[data["name"]] = collectionType
                doc = data.get("doc")
                if doc is not None:
                    self.collectionDocs[data["name"]] = doc
            elif data["type"] == "run":
                # Also support old form of saving a run with no extra info.
                self.runs[data["name"]] = (None, Timespan(None, None))
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(data["name"], dimensions=data["dimensions"],
                                storageClass=data["storage_class"], universe=self.registry.dimensions,
                                isCalibration=data.get("is_calibration", False))
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            elif data["type"] == "associations":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.TAGGED:
                    self.tagAssociations[data["collection"]].extend(data["dataset_ids"])
                elif collectionType is CollectionType.CALIBRATION:
                    assocsByTimespan = self.calibAssociations[data["collection"]]
                    for d in data["validity_ranges"]:
                        if "timespan" in d:
                            assocsByTimespan[d["timespan"]] = d["dataset_ids"]
                        else:
                            # TODO: this is for backward compatibility, should
                            # be removed at some point.
                            assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = d["dataset_ids"]
                else:
                    raise ValueError(f"Unexpected collection type for association: {collectionType.name}.")
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # Key is (dataset type name, run).
        self.datasets: Mapping[Tuple[str, str], List[FileDataset]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                FileDataset(
                    d.get("path"),
                    [DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                     for dataId, refid in zip(ensure_iterable(d["data_id"]),
                                              ensure_iterable(d["dataset_id"]))],
                    formatter=doImportType(d.get("formatter")) if "formatter" in d else None
                )
                for d in data["records"]
            )

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)
        for run in self.runs:
            self.registry.registerRun(run, doc=self.collectionDocs.get(run))
            # No way to add extra run info to registry yet.
        for collection, collection_type in self.collections.items():
            self.registry.registerCollection(collection, collection_type,
                                             doc=self.collectionDocs.get(collection))
        for chain, children in self.chains.items():
            self.registry.registerCollection(chain, CollectionType.CHAINED,
                                             doc=self.collectionDocs.get(chain))
            self.registry.setCollectionChain(chain, children)

    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None,
             skip_dimensions: Optional[Set] = None,
             idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
             reuseIds: bool = False) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            self.registry.insertDimensionData(element, *dimensionRecords)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            # Make a big flattened list of all data IDs and dataset_ids, while
            # remembering slices that associate them with the FileDataset
            # instances they came from.
            datasets: List[DatasetRef] = []
            dataset_ids: List[DatasetId] = []
            slices = []
            for fileDataset in records:
                start = len(datasets)
                datasets.extend(fileDataset.refs)
                dataset_ids.extend(ref.id for ref in fileDataset.refs)  # type: ignore
                stop = len(datasets)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry._importDatasets(datasets, idGenerationMode=idGenerationMode,
                                                         reuseIds=reuseIds)
            # Populate our dictionary that maps int dataset_id values from the
            # export file to the new DatasetRefs.
            for fileId, ref in zip(dataset_ids, resolvedRefs):
                self.refsByFileId[fileId] = ref
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, fileDataset in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = ButlerURI(directory, forceDirectory=True).join(fileDataset.path)
                fileDatasets.append(fileDataset)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate datasets with tagged collections.
        for collection, dataset_ids in self.tagAssociations.items():
            self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
        # Associate datasets with calibration collections.
        for collection, idsByTimespan in self.calibAssociations.items():
            for timespan, dataset_ids in idsByTimespan.items():
                self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)
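
# A minimal usage sketch for the import backend (illustrative only, not part of
# the original module; ``registry`` and ``datastore`` are assumed to come from
# an existing Butler, and the file is one previously written by
# YamlRepoExportBackend):
#
#     with open("export.yaml", "r") as stream:
#         backend = YamlRepoImportBackend(stream, registry)  # file fully parsed here
#     backend.register()                        # create dataset types and collections
#     backend.load(datastore, transfer="copy")  # insert dimension data, datasets, associations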