# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["FileDataset", "RepoExport",
           "RepoExportBackend", "RepoImportBackend", "RepoTransferFormatConfig",
           "YamlRepoExportBackend", "YamlRepoImportBackend"]

import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from typing import (
    Any,
    Callable,
    Dict,
    IO,
    Iterable,
    List,
    Mapping,
    MutableMapping,
    Optional,
    Set,
    Tuple,
    Type,
    TYPE_CHECKING,
    Union,
)
from collections import defaultdict

import yaml
import astropy.time

from lsst.utils import doImport
from .config import ConfigSubset
from .datasets import DatasetType, DatasetRef
from .utils import iterable
from .named import NamedValueSet

if TYPE_CHECKING:
    from .dimensions import DataCoordinate, DimensionElement, DimensionRecord
    from ..registry import Registry
    from .datastore import Datastore
    from .formatter import FormatterParameter


class RepoTransferFormatConfig(ConfigSubset):
    """The section of butler configuration that associates repo import/export
    backends with file formats.
    """
    component = "repo_transfer_formats"
    defaultConfigFile = "repo_transfer_formats.yaml"


@dataclass
class FileDataset:
    """A struct that represents a dataset exported to a file.
    """
    __slots__ = ("refs", "path", "formatter")

    refs: List[DatasetRef]
    """Registry information about the dataset (`list` of `DatasetRef`).
    """

    path: str
    """Path to the dataset (`str`).

    If the dataset was exported with ``transfer=None`` (i.e. in-place),
    this is relative to the datastore root (only datastores that have a
    well-defined root in the local filesystem can be expected to support
    in-place exports). Otherwise this is relative to the directory passed
    to `Datastore.export`.
    """

    formatter: Optional[FormatterParameter]
    """A `Formatter` class or fully-qualified name.
    """

    def __init__(self, path: str, refs: Union[DatasetRef, List[DatasetRef]], *,
                 formatter: Optional[FormatterParameter] = None):
        self.path = path
        if isinstance(refs, DatasetRef):
            refs = [refs]
        self.refs = refs
        self.formatter = formatter

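# A minimal construction sketch (illustrative only, not part of the module):
# ``datasetRef`` stands for a resolved `DatasetRef` obtained from a
# `Registry`, and the path and formatter values here are hypothetical.
#
#     dataset = FileDataset(
#         path="calibs/bias/bias_20200101.fits",
#         refs=datasetRef,
#         formatter="mypackage.formatters.MyFitsFormatter",
#     )
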

class RepoExport:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `str`, optional
        Directory to pass to `Datastore.export`.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(self, registry: Registry, datastore: Datastore, backend: RepoExportBackend, *,
                 directory: Optional[str] = None, transfer: Optional[str] = None):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._dataset_ids: Set[int] = set()

    def saveDataIds(self, dataIds: Iterable[DataCoordinate], *,
                    elements: Optional[Iterable[DimensionElement]] = None) -> None:
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `DataCoordinate`
            Fully-expanded data IDs to export.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported. If `None`,
            records for all dimensions will be exported.
        """
        if elements is None:
            elements = frozenset(element for element in self._registry.dimensions.getStaticElements()
                                 if element.hasTable() and element.viewOf is None)
        else:
            elements = frozenset(elements)
        records: MutableMapping[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
        for dataId in dataIds:
            for record in dataId.records.values():
                if record is not None and record.definition in elements:
                    records[record.definition].setdefault(record.dataId, record)
        for element in self._registry.dimensions.sorted(records.keys()):
            self._backend.saveDimensionData(element, *records[element].values())

    def saveDatasets(self, refs: Iterable[DatasetRef], *,
                     elements: Optional[Iterable[DimensionElement]] = None,
                     rewrite: Optional[Callable[[FileDataset], FileDataset]] = None) -> None:
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `Run`, and dimension
        records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export. Their `DatasetRef.id`
            attributes must not be `None`. Duplicates are automatically
            ignored. Nested data IDs must have `DataCoordinate.hasRecords`
            return `True`.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`. This is typically used to rewrite the
            path generated by the datastore. If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with the collection that
        matches their run name. Other collections will be included in the
        export in the future (once `Registry` provides a way to look up that
        information).
        """
        dataIds = set()
        datasets: Mapping[Tuple[DatasetType, str], List[FileDataset]] = defaultdict(list)
        for ref in refs:
            # The query interfaces that are often used to generate the refs
            # passed here don't always remove duplicates, so do that here for
            # convenience.
            if ref.id in self._dataset_ids:
                continue
            dataIds.add(self._registry.expandDataId(ref.dataId))
            # `exports` is a single-element list here, because we anticipate
            # a future where more than just Datastore.export has a vectorized
            # API and we can pull this out of the loop.
            exports = self._datastore.export([ref], directory=self._directory, transfer=self._transfer)
            if rewrite is not None:
                exports = [rewrite(export) for export in exports]
            self._dataset_ids.add(ref.getCheckedId())
            assert ref.run is not None
            datasets[ref.datasetType, ref.run].extend(exports)
        self.saveDataIds(dataIds, elements=elements)
        for (datasetType, run), records in datasets.items():
            self._backend.saveDatasets(datasetType, run, *records)

    def _finish(self) -> None:
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        self._backend.finish()

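# Hedged usage sketch for `RepoExport.saveDatasets` with a ``rewrite``
# callable that flattens the exported paths. ``butler`` and the dataset
# query are assumed to exist; the dataset type and collection names are
# placeholders.
#
#     def flatten(dataset: FileDataset) -> FileDataset:
#         dataset.path = os.path.basename(dataset.path)
#         return dataset
#
#     with butler.export(filename="export.yaml", directory="exports",
#                        transfer="copy") as export:
#         refs = butler.registry.queryDatasets("calexp", collections="my_run")
#         export.saveDatasets(refs, rewrite=flatten)
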

class RepoExportBackend(ABC):
    """An abstract interface for data repository export implementations.
    """

    @abstractmethod
    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        """Export one or more dimension element records.

        Parameters
        ----------
        element : `DimensionElement`
            The `DimensionElement` whose records are being exported.
        data : `DimensionRecord` (variadic)
            One or more records to export.
        """
        raise NotImplementedError()

    @abstractmethod
    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        """Export one or more datasets, including their associated DatasetType
        and run information (but not including associated dimension
        information).

        Parameters
        ----------
        datasetType : `DatasetType`
            Type of all datasets being exported with this call.
        run : `str`
            Run associated with all datasets being exported with this call.
        datasets : `FileDataset`, variadic
            Per-dataset information to be exported. `FileDataset.formatter`
            attributes should be strings, not `Formatter` instances or classes.
        """
        raise NotImplementedError()

    @abstractmethod
    def finish(self) -> None:
        """Complete the export process.
        """
        raise NotImplementedError()

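# A minimal in-memory sketch of the `RepoExportBackend` interface (purely
# illustrative; the real backends below persist their records to a file):
#
#     class ListRepoExportBackend(RepoExportBackend):
#
#         def __init__(self) -> None:
#             self.records: List[Any] = []
#
#         def saveDimensionData(self, element: DimensionElement,
#                               *data: DimensionRecord) -> None:
#             self.records.append(("dimension", element.name, data))
#
#         def saveDatasets(self, datasetType: DatasetType, run: str,
#                          *datasets: FileDataset) -> None:
#             self.records.append(("dataset", datasetType.name, run, datasets))
#
#         def finish(self) -> None:
#             pass  # nothing to flush for an in-memory backend
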

class RepoImportBackend(ABC):
    """An abstract interface for data repository import implementations.

    Import backends are expected to be constructed with a description of
    the objects that need to be imported (from, e.g., a file written by the
    corresponding export backend), along with a `Registry`.
    """

    @abstractmethod
    def register(self) -> None:
        """Register all runs and dataset types associated with the backend
        with the `Registry` the backend was constructed with.

        These operations cannot be performed inside transactions, unlike those
        performed by `load`, and must in general be performed before `load`.
        """

    @abstractmethod
    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None,
             skip_dimensions: Optional[Set] = None) -> None:
        """Import information associated with the backend into the given
        registry and datastore.

        This must be run after `register`, and may be performed inside a
        transaction.

        Parameters
        ----------
        datastore : `Datastore`
            Datastore to import into. If `None`, datasets will only be
            inserted into the `Registry` (primarily intended for tests).
        directory : `str`, optional
            Directory all dataset paths are relative to.
        transfer : `str`, optional
            Transfer mode forwarded to `Datastore.ingest`.
        skip_dimensions : `set`, optional
            Dimensions that should be skipped and not imported. This can
            be useful when importing into a registry that already knows
            about a specific instrument.
        """
        raise NotImplementedError()

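# Hedged sketch of the calling contract for any `RepoImportBackend`:
# `register` runs outside a transaction, then `load` may run inside one
# (assuming the `Registry.transaction` context manager; ``backend``,
# ``registry``, and ``datastore`` are assumed to exist).
#
#     backend.register()
#     with registry.transaction():
#         backend.load(datastore, directory="exports", transfer="copy")
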

class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data: List[Dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append({
            "type": "dimension",
            "element": element.name,
            "records": data_dicts,
        })

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append({
            "type": "dataset_type",
            "name": datasetType.name,
            "dimensions": [d.name for d in datasetType.dimensions],
            "storage_class": datasetType.storageClass.name,
        })
        self.data.append({
            "type": "run",
            "name": run,
        })
        self.data.append({
            "type": "dataset",
            "dataset_type": datasetType.name,
            "run": run,
            "records": [
                {
                    "dataset_id": [ref.id for ref in dataset.refs],
                    "data_id": [ref.dataId.byName() for ref in dataset.refs],
                    "path": dataset.path,
                    "formatter": dataset.formatter,
                    # TODO: look up and save other collections
                }
                for dataset in datasets
            ]
        })

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": 0,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )

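# Hedged sketch of driving the YAML export backend directly (normally
# `Butler.export` does this for you; ``element``, ``element_records``,
# ``datasetType``, and ``file_datasets`` are placeholders):
#
#     with open("export.yaml", "w") as stream:
#         backend = YamlRepoExportBackend(stream)
#         backend.saveDimensionData(element, *element_records)
#         backend.saveDatasets(datasetType, "my_run", *file_datasets)
#         backend.finish()
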

class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather
        # not run that at all if there's going to be a problem later in
        # `load`.
        wrapper = yaml.safe_load(stream)
        # TODO: When version numbers become meaningful, check here that we can
        # read the version in the file.
        self.runs: List[str] = []
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy.
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back. Newer conversion uses the
                        # _AstropyTimeToYAML class with a special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: Type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(
                    RecordClass(**r) for r in data["records"]
                )
            elif data["type"] == "run":
                self.runs.append(data["name"])
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(data["name"], dimensions=data["dimensions"],
                                storageClass=data["storage_class"], universe=self.registry.dimensions)
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # Key is (dataset type name, run); the innermost list is collections.
        self.datasets: Mapping[Tuple[str, str], List[Tuple[FileDataset, List[str]]]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                (
                    FileDataset(
                        d.get("path"),
                        [DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                         for dataId, refid in zip(iterable(d["data_id"]), iterable(d["dataset_id"]))],
                        formatter=doImport(d.get("formatter")) if "formatter" in d else None
                    ),
                    d.get("collections", [])
                )
                for d in data["records"]
            )

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for run in self.runs:
            self.registry.registerRun(run)
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)

    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None,
             skip_dimensions: Optional[Set] = None) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            self.registry.insertDimensionData(element, *dimensionRecords)
        # Mapping from collection name to list of DatasetRefs to associate.
        collections = defaultdict(list)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            datasetType = self.registry.getDatasetType(datasetTypeName)
            # Make a big flattened list of all data IDs, while remembering
            # slices that associate them with the FileDataset instances they
            # came from.
            dataIds: List[DataCoordinate] = []
            slices = []
            for fileDataset, _ in records:
                start = len(dataIds)
                dataIds.extend(ref.dataId for ref in fileDataset.refs)
                stop = len(dataIds)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry.insertDatasets(
                datasetType,
                dataIds=dataIds,
                run=run,
            )
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, (fileDataset, collectionsForDataset) in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = os.path.join(directory, fileDataset.path)
                fileDatasets.append(fileDataset)
                for collection in collectionsForDataset:
                    collections[collection].extend(fileDataset.refs)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate with collections, one collection at a time.
        for collection, refs in collections.items():
            self.registry.associate(collection, refs)

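# Hedged sketch of reading an export file back in with this backend
# (``registry`` and ``datastore`` are assumed to exist; the whole file is
# parsed in the constructor, so the stream can be closed before loading):
#
#     with open("export.yaml", "r") as stream:
#         backend = YamlRepoImportBackend(stream, registry)
#     backend.register()
#     backend.load(datastore, directory="exports", transfer="copy")
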

class _AstropyTimeToYAML:
    """Handle conversion of astropy Time to/from YAML representation.

    This class defines methods that convert astropy Time instances to or from
    YAML representation. On output it converts time to a string in ISO format
    in the TAI scale with maximum precision, defining a special YAML tag for
    it. On input it does the inverse transformation. The methods need to be
    registered with the YAML dumper and loader classes.

    Notes
    -----
    The Python ``yaml`` module defines a special helper base class
    ``YAMLObject`` that provides similar functionality, but its use is
    complicated by the need to convert ``Time`` instances to instances of a
    ``YAMLObject`` subclass before saving them to YAML. This class avoids
    that intermediate step, but it requires a separate registration step.
    """

    yaml_tag = "!butler_time/tai/iso"  # YAML tag name for Time class

    @classmethod
    def to_yaml(cls, dumper: yaml.Dumper, data: astropy.time.Time) -> Any:
        """Convert an astropy Time object into YAML format.

        Parameters
        ----------
        dumper : `yaml.Dumper`
            YAML dumper instance.
        data : `astropy.time.Time`
            Data to be converted.
        """
        if data is not None:
            # We store time in ISO format, but we need full nanosecond
            # precision, so we have to construct an intermediate instance to
            # make sure its precision is set correctly.
            data = astropy.time.Time(data.tai, precision=9)
            data = data.to_value("iso")
        return dumper.represent_scalar(cls.yaml_tag, data)

    @classmethod
    def from_yaml(cls, loader: yaml.SafeLoader, node: yaml.ScalarNode) -> astropy.time.Time:
        """Convert a YAML node into an astropy Time.

        Parameters
        ----------
        loader : `yaml.SafeLoader`
            Instance of YAML loader class.
        node : `yaml.ScalarNode`
            YAML node.

        Returns
        -------
        time : `astropy.time.Time`
            Time instance, can be ``None``.
        """
        if node.value is not None:
            return astropy.time.Time(node.value, format="iso", scale="tai")


# Register Time -> YAML conversion method with the Dumper class.
yaml.Dumper.add_representer(astropy.time.Time, _AstropyTimeToYAML.to_yaml)

# Register YAML -> Time conversion method with the Loader; for our use case we
# only need SafeLoader.
yaml.SafeLoader.add_constructor(_AstropyTimeToYAML.yaml_tag, _AstropyTimeToYAML.from_yaml)
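
# Hedged round-trip sketch for the conversions registered above: dumping an
# `astropy.time.Time` emits a scalar tagged ``!butler_time/tai/iso``, and
# safe-loading it reconstructs an equivalent TAI `Time`:
#
#     t = astropy.time.Time("2020-01-01 00:00:00", scale="tai")
#     text = yaml.dump({"timestamp": t})
#     restored = yaml.safe_load(text)["timestamp"]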