# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

from datetime import datetime
from typing import (
    Any,
    Dict,
    IO,
    Iterable,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Type,
)
import warnings
from collections import defaultdict

import yaml
import astropy.time

from lsst.utils import doImport
from ..core import (
    DatasetAssociation,
    DatasetRef,
    DatasetType,
    DataCoordinate,
    Datastore,
    DimensionElement,
    DimensionRecord,
    FileDataset,
    Timespan,
)
from ..core._butlerUri import ButlerURI
from ..core.utils import iterable
from ..core.named import NamedValueSet
from ..registry import CollectionType, Registry
from ..registry.interfaces import (
    ChainedCollectionRecord,
    CollectionRecord,
    RunRecord,
    VersionTuple,
)
from ..registry.versions import IncompatibleVersionError
from ._interfaces import RepoExportBackend, RepoImportBackend


EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 1)
"""Export format version.

Files with a different major version or a newer minor version cannot be read by
this version of the code.
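For example, a file written as version ``1.0.0`` can still be read, while
files written as ``1.1.0`` or ``2.0.0`` cannot.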

"""


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
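
    Examples
    --------
    A minimal sketch of using the backend directly (exports are normally
    driven through `Butler.export` rather than by constructing this class
    by hand; the in-memory buffer below simply stands in for a real file)::

        import io

        stream = io.StringIO()
        backend = YamlRepoExportBackend(stream)
        # ... saveDimensionData / saveCollection / saveDatasets calls ...
        backend.finish()  # nothing is written to the stream until finish()
        print(stream.getvalue())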

    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data: List[Dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append({
            "type": "dimension",
            "element": element.name,
            "records": data_dicts,
        })

    def saveCollection(self, record: CollectionRecord, doc: Optional[str]) -> None:
        # Docstring inherited from RepoExportBackend.saveCollection.
        data: Dict[str, Any] = {
            "type": "collection",
            "collection_type": record.type.name,
            "name": record.name,
        }
        if doc is not None:
            data["doc"] = doc
        if isinstance(record, RunRecord):
            data["host"] = record.host
            data["timespan_begin"] = record.timespan.begin
            data["timespan_end"] = record.timespan.end
        elif isinstance(record, ChainedCollectionRecord):
            data["children"] = list(record.children)
        self.data.append(data)

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append({
            "type": "dataset_type",
            "name": datasetType.name,
            "dimensions": [d.name for d in datasetType.dimensions],
            "storage_class": datasetType.storageClass.name,
            "is_calibration": datasetType.isCalibration(),
        })
        self.data.append({
            "type": "dataset",
            "dataset_type": datasetType.name,
            "run": run,
            "records": [
                {
                    "dataset_id": [ref.id for ref in sorted(dataset.refs)],
                    "data_id": [ref.dataId.byName() for ref in sorted(dataset.refs)],
                    "path": dataset.path,
                    "formatter": dataset.formatter,
                    # TODO: look up and save other collections
                }
                for dataset in datasets
            ]
        })

    def saveDatasetAssociations(self, collection: str, collectionType: CollectionType,
                                associations: Iterable[DatasetAssociation]) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
        if collectionType is CollectionType.TAGGED:
            self.data.append({
                "type": "associations",
                "collection": collection,
                "collection_type": collectionType.name,
                "dataset_ids": [assoc.ref.id for assoc in associations],
            })
        elif collectionType is CollectionType.CALIBRATION:
            idsByTimespan: Dict[Timespan, List[int]] = defaultdict(list)
            for association in associations:
                assert association.timespan is not None
                assert association.ref.id is not None
                idsByTimespan[association.timespan].append(association.ref.id)
            self.data.append({
                "type": "associations",
                "collection": collection,
                "collection_type": collectionType.name,
                "validity_ranges": [
                    {
                        "begin": timespan.begin,
                        "end": timespan.end,
                        "dataset_ids": dataset_ids,
                    }
                    for timespan, dataset_ids in idsByTimespan.items()
                ]
            })

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": str(EXPORT_FORMAT_VERSION),
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry that datasets will be imported into. Only used to
        retrieve dataset types during construction; all writes happen in
        `register` and `load`.
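
    Examples
    --------
    A typical sequence (sketch only: ``registry`` and ``datastore`` are
    assumed to come from an existing `Butler`, and the paths are
    placeholders)::

        with open("export.yaml", "r") as stream:
            backend = YamlRepoImportBackend(stream, registry)
        backend.register()
        backend.load(datastore, directory="/path/to/exported/files",
                     transfer="copy")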

    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        if wrapper["version"] == 0:
            # Grandfather-in 'version: 0' -> 1.0.0, which is what we wrote
            # before we really tried to do versioning here.
            fileVersion = VersionTuple(1, 0, 0)
        else:
            fileVersion = VersionTuple.fromString(wrapper["version"])
            if fileVersion.major != EXPORT_FORMAT_VERSION.major:
                raise IncompatibleVersionError(
                    f"Cannot read repository export file with version={fileVersion} "
                    f"({EXPORT_FORMAT_VERSION.major}.x.x required)."
                )
            if fileVersion.minor > EXPORT_FORMAT_VERSION.minor:
                raise IncompatibleVersionError(
                    f"Cannot read repository export file with version={fileVersion} "
                    f"< {EXPORT_FORMAT_VERSION.major}.{EXPORT_FORMAT_VERSION.minor}.x required."
                )
        self.runs: Dict[str, Tuple[Optional[str], Timespan]] = {}
        self.chains: Dict[str, List[str]] = {}
        self.collections: Dict[str, CollectionType] = {}
        self.collectionDocs: Dict[str, str] = {}
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.tagAssociations: Dict[str, List[int]] = defaultdict(list)
        self.calibAssociations: Dict[str, Dict[Timespan, List[int]]] = defaultdict(dict)
        self.refsByFileId: Dict[int, DatasetRef] = {}
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # convert all datetime values to astropy
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back. Newer conversions use the
                        # _AstropyTimeToYAML class with a special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: Type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(
                    RecordClass(**r) for r in data["records"]
                )
            elif data["type"] == "collection":
                collectionType = CollectionType.__members__[data["collection_type"].upper()]
                if collectionType is CollectionType.RUN:
                    self.runs[data["name"]] = (
                        data["host"],
                        Timespan(begin=data["timespan_begin"], end=data["timespan_end"])
                    )
                elif collectionType is CollectionType.CHAINED:
                    children = []
                    for child in data["children"]:
                        if not isinstance(child, str):
                            warnings.warn(
                                f"CHAINED collection {data['name']} includes restrictions on child "
                                "collection searches, which are no longer supported and will be ignored."
                            )
                            # Old form with dataset type restrictions only,
                            # supported for backwards compatibility.
                            child, _ = child
                        children.append(child)
                    self.chains[data["name"]] = children
                else:
                    self.collections[data["name"]] = collectionType
                doc = data.get("doc")
                if doc is not None:
                    self.collectionDocs[data["name"]] = doc
            elif data["type"] == "run":
                # Also support old form of saving a run with no extra info.
                self.runs[data["name"]] = (None, Timespan(None, None))
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(data["name"], dimensions=data["dimensions"],
                                storageClass=data["storage_class"], universe=self.registry.dimensions,
                                isCalibration=data.get("is_calibration", False))
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            elif data["type"] == "associations":
                collectionType = CollectionType.__members__[data["collection_type"].upper()]
                if collectionType is CollectionType.TAGGED:
                    self.tagAssociations[data["collection"]].extend(data["dataset_ids"])
                elif collectionType is CollectionType.CALIBRATION:
                    assocsByTimespan = self.calibAssociations[data["collection"]]
                    for d in data["validity_ranges"]:
                        assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = d["dataset_ids"]
                else:
                    raise ValueError(f"Unexpected collection type for association: {collectionType.name}.")
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # key is (dataset type name, run)
        self.datasets: Mapping[Tuple[str, str], List[FileDataset]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                FileDataset(
                    d.get("path"),
                    [DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                     for dataId, refid in zip(iterable(d["data_id"]), iterable(d["dataset_id"]))],
                    formatter=doImport(d.get("formatter")) if "formatter" in d else None
                )
                for d in data["records"]
            )

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)
        for run in self.runs:
            self.registry.registerRun(run, doc=self.collectionDocs.get(run))
            # No way to add extra run info to registry yet.
        for collection, collection_type in self.collections.items():
            self.registry.registerCollection(collection, collection_type,
                                             doc=self.collectionDocs.get(collection))
        for chain, children in self.chains.items():
            self.registry.registerCollection(chain, CollectionType.CHAINED,
                                             doc=self.collectionDocs.get(chain))
            self.registry.setCollectionChain(chain, children)

    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None,
             skip_dimensions: Optional[Set] = None) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            self.registry.insertDimensionData(element, *dimensionRecords)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            datasetType = self.registry.getDatasetType(datasetTypeName)
            # Make a big flattened list of all data IDs and dataset_ids, while
            # remembering slices that associate them with the FileDataset
            # instances they came from.
            dataIds: List[DataCoordinate] = []
            dataset_ids: List[int] = []
            slices = []
            for fileDataset in records:
                start = len(dataIds)
                dataIds.extend(ref.dataId for ref in fileDataset.refs)
                dataset_ids.extend(ref.id for ref in fileDataset.refs)  # type: ignore
                stop = len(dataIds)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry.insertDatasets(
                datasetType,
                dataIds=dataIds,
                run=run,
            )
            # Populate our dictionary that maps int dataset_id values from the
            # export file to the new DatasetRefs
            for fileId, ref in zip(dataset_ids, resolvedRefs):
                self.refsByFileId[fileId] = ref
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, fileDataset in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = ButlerURI(directory, forceDirectory=True).join(fileDataset.path)
                fileDatasets.append(fileDataset)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate datasets with tagged collections.
        for collection, dataset_ids in self.tagAssociations.items():
            self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
        # Associate datasets with calibration collections.
        for collection, idsByTimespan in self.calibAssociations.items():
            for timespan, dataset_ids in idsByTimespan.items():
                self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)