# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["FileDataset", "RepoExport",
           "RepoExportBackend", "RepoImportBackend", "RepoTransferFormatConfig",
           "YamlRepoExportBackend", "YamlRepoImportBackend"]

import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from typing import (
    TYPE_CHECKING,
    Callable,
    IO,
    Iterable,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Union,
)
from collections import defaultdict

import yaml
import astropy.time

from lsst.utils import doImport
from .config import ConfigSubset
from .datasets import DatasetType, DatasetRef
from .utils import NamedValueSet, iterable

if TYPE_CHECKING:
    from .dimensions import DimensionElement, DimensionRecord, ExpandedDataCoordinate
    from ..registry import Registry
    from .datastore import Datastore
    from .formatter import FormatterParameter


class RepoTransferFormatConfig(ConfigSubset):
    """The section of butler configuration that associates repo import/export
    backends with file formats.
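
    For example, a configuration of this form might associate the ``yaml``
    format with the backends defined in this module (a purely illustrative
    sketch; see ``repo_transfer_formats.yaml`` for the defaults actually
    shipped with the package)::

        repo_transfer_formats:
          yaml:
            export: lsst.daf.butler.core.repoTransfers.YamlRepoExportBackend
            import: lsst.daf.butler.core.repoTransfers.YamlRepoImportBackend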

    """
    component = "repo_transfer_formats"
    defaultConfigFile = "repo_transfer_formats.yaml"


@dataclass
class FileDataset:
    """A struct that represents a dataset exported to a file.
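
    For example, a single file holding one dataset could be described as
    follows, where the path, `DatasetRef`, and formatter name are all
    illustrative::

        FileDataset("files/flat_r.fits", ref,
                    formatter="mypackage.formatters.FlatFormatter")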

    """
    __slots__ = ("refs", "path", "formatter")

    refs: List[DatasetRef]
    """Registry information about the dataset (`list` of `DatasetRef`).
    """

    path: str
    """Path to the dataset (`str`).

    If the dataset was exported with ``transfer=None`` (i.e. in-place),
    this is relative to the datastore root (only datastores that have a
    well-defined root in the local filesystem can be expected to support
    in-place exports).  Otherwise this is relative to the directory passed
    to `Datastore.export`.
    """

    formatter: Optional[FormatterParameter]
    """A `Formatter` class or fully-qualified name.
    """

    def __init__(self, path: str, refs: Union[DatasetRef, List[DatasetRef]], *,
                 formatter: Optional[FormatterParameter] = None):
        self.path = path
        if isinstance(refs, DatasetRef):
            refs = [refs]
        self.refs = refs
        self.formatter = formatter


class RepoExport:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `str`, optional
        Directory to pass to `Datastore.export`.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(self, registry: Registry, datastore: Datastore, backend: RepoExportBackend, *,
                 directory: Optional[str] = None, transfer: Optional[str] = None):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._dataset_ids: Set[int] = set()

    def saveDataIds(self, dataIds: Iterable[ExpandedDataCoordinate], *,
                    elements: Optional[Iterable[DimensionElement]] = None) -> None:
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `ExpandedDataCoordinate`
            Fully-expanded data IDs to export.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported.  If `None`,
            records for all dimensions will be exported.
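
            For example, to export only ``visit`` and ``exposure`` records
            for the given data IDs (a sketch; the element names assume the
            default dimension universe)::

                elements = [registry.dimensions["visit"],
                            registry.dimensions["exposure"]]
                export.saveDataIds(dataIds, elements=elements)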

        """
        if elements is None:
            elements = frozenset(element for element in self._registry.dimensions.elements
                                 if element.hasTable() and element.viewOf is None)
        else:
            elements = frozenset(elements)
        records = defaultdict(dict)
        for dataId in dataIds:
            for record in dataId.records.values():
                if record.definition in elements:
                    records[record.definition].setdefault(record.dataId, record)
        for element in self._registry.dimensions.sorted(records.keys()):
            self._backend.saveDimensionData(element, *records[element].values())

    def saveDatasets(self, refs: Iterable[DatasetRef], *,
                     elements: Optional[Iterable[DimensionElement]] = None,
                     rewrite: Optional[Callable[[FileDataset], FileDataset]] = None) -> None:
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `Run`, and dimension
        records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export.  Their `DatasetRef.id`
            attributes must not be `None`.  Duplicates are automatically
            ignored.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`.  This is typically used to rewrite the
            path generated by the datastore.  If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.
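
            For example, a ``rewrite`` callable that redirects every exported
            file into a flat ``datasets/`` subdirectory might look like this
            sketch (not part of the API)::

                def rewrite(dataset: FileDataset) -> FileDataset:
                    dataset.path = os.path.join("datasets",
                                                os.path.basename(dataset.path))
                    return dataset

                export.saveDatasets(refs, rewrite=rewrite)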

        Notes
        -----
        At present, this only associates datasets with the collection that
        matches their run name.  Other collections will be included in the
        export in the future (once `Registry` provides a way to look up that
        information).
        """
        dataIds = set()
        datasets: Mapping[Tuple[DatasetType, str], List[FileDataset]] = defaultdict(list)
        for ref in refs:
            # The query interfaces often used to generate the refs passed
            # here do not necessarily remove duplicates, so do that here for
            # convenience.
            if ref.id in self._dataset_ids:
                continue
            dataIds.add(ref.dataId)
            # `exports` is a single-element list here, because we anticipate
            # a future where more than just Datastore.export has a vectorized
            # API and we can pull this out of the loop.
            exports = self._datastore.export([ref], directory=self._directory, transfer=self._transfer)
            if rewrite is not None:
                exports = [rewrite(export) for export in exports]
            datasets[ref.datasetType, ref.run].extend(exports)
            self._dataset_ids.add(ref.id)
        self.saveDataIds(dataIds, elements=elements)
        for (datasetType, run), records in datasets.items():
            self._backend.saveDatasets(datasetType, run, *records)

    def _finish(self) -> None:
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        self._backend.finish()


class RepoExportBackend(ABC):
    """An abstract interface for data repository export implementations.
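
    `RepoExport` drives a backend by calling `saveDimensionData` and
    `saveDatasets` as it walks the content being exported, and then calls
    `finish` exactly once to write out any accumulated state; a sketch of
    that sequence::

        backend.saveDimensionData(element, *records)
        backend.saveDatasets(datasetType, run, *fileDatasets)
        backend.finish()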

    """

    @abstractmethod
    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        """Export one or more dimension element records.

        Parameters
        ----------
        element : `DimensionElement`
            The `DimensionElement` whose records are being exported.
        data : `DimensionRecord` (variadic)
            One or more records to export.
        """
        raise NotImplementedError()

    @abstractmethod
    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset,
                     collections: Iterable[str] = ()) -> None:
        """Export one or more datasets, including their associated DatasetType
        and run information (but not including associated dimension
        information).

        Parameters
        ----------
        datasetType : `DatasetType`
            Type of all datasets being exported with this call.
        run : `str`
            Run associated with all datasets being exported with this call.
        datasets : `FileDataset`, variadic
            Per-dataset information to be exported.  `FileDataset.formatter`
            attributes should be strings, not `Formatter` instances or classes.
        collections : iterable of `str`
            Extra collections (in addition to ``run``) the dataset
            should be associated with.
        """
        raise NotImplementedError()

    @abstractmethod
    def finish(self) -> None:
        """Complete the export process.
        """
        raise NotImplementedError()


class RepoImportBackend(ABC):
    """An abstract interface for data repository import implementations.

    Import backends are expected to be constructed with a description of
    the objects that need to be imported (from, e.g., a file written by the
    corresponding export backend), along with a `Registry`.
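
    The expected calling sequence is to construct the backend, call `register`
    outside any transaction to create runs and dataset types, and then call
    `load` (possibly inside a transaction) to insert the actual content.  A
    sketch, assuming ``stream``, ``registry``, and ``datastore`` already
    exist::

        backend = YamlRepoImportBackend(stream, registry)
        backend.register()
        backend.load(datastore, directory="/path/to/exported/files",
                     transfer="copy")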

    """

    @abstractmethod
    def register(self) -> None:
        """Register all runs and dataset types associated with the backend
        with the `Registry` the backend was constructed with.

        These operations cannot be performed inside transactions, unlike those
        performed by `load`, and must in general be performed before `load`.
        """

    @abstractmethod
    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None) -> None:
        """Import information associated with the backend into the given
        registry and datastore.

        This must be run after `register`, and may be performed inside a
        transaction.

        Parameters
        ----------
        datastore : `Datastore`
            Datastore to import into.  If `None`, datasets will only be
            inserted into the `Registry` (primarily intended for tests).
        directory : `str`, optional
            Directory that all dataset paths are relative to.
        transfer : `str`, optional
            Transfer mode forwarded to `Datastore.ingest`.
        """
        raise NotImplementedError()


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.
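
    Records are accumulated in memory as the ``save*`` methods are called and
    written out as a single document when `finish` is called.  The resulting
    YAML has roughly this shape (abbreviated sketch)::

        description: Butler Data Repository Export
        version: 0
        data:
          - type: dimension
            element: ...
            records: [...]
          - type: dataset_type
            name: ...
            dimensions: [...]
            storage_class: ...
          - type: run
            name: ...
          - type: dataset
            dataset_type: ...
            run: ...
            records: [...]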

    Parameters
    ----------
    stream
        A writeable file-like object.
    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        # Convert astropy time in TAI to datetime in UTC for YAML.
        data_dicts = []
        for record in data:
            rec_dict = record.toDict()
            for key in rec_dict:
                if isinstance(rec_dict[key], astropy.time.Time):
                    rec_dict[key] = rec_dict[key].utc.to_datetime()
            data_dicts += [rec_dict]
        self.data.append({
            "type": "dimension",
            "element": element.name,
            "records": data_dicts,  # TODO: encode regions
        })

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append({
            "type": "dataset_type",
            "name": datasetType.name,
            "dimensions": [d.name for d in datasetType.dimensions],
            "storage_class": datasetType.storageClass.name,
        })
        self.data.append({
            "type": "run",
            "name": run,
        })
        self.data.append({
            "type": "dataset",
            "dataset_type": datasetType.name,
            "run": run,
            "records": [
                {
                    "dataset_id": [ref.id for ref in dataset.refs],
                    "data_id": [ref.dataId.byName() for ref in dataset.refs],
                    "path": dataset.path,
                    "formatter": dataset.formatter,
                    # TODO: look up and save other collections
                }
                for dataset in datasets
            ]
        })

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": 0,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into.  Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        # TODO: When version numbers become meaningful, check here that we can
        # read the version in the file.
        self.runs: List[str] = []
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy time.
                for record in data["records"]:
                    for key in record:
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                self.dimensions[element].extend(element.RecordClass.fromDict(r) for r in data["records"])
            elif data["type"] == "run":
                self.runs.append(data["name"])
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(data["name"], dimensions=data["dimensions"],
                                storageClass=data["storage_class"], universe=self.registry.dimensions)
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # Key is (dataset type name, run); the innermost list is collections.
        self.datasets: Mapping[Tuple[str, str], List[Tuple[FileDataset, List[str]]]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                (
                    FileDataset(
                        d.get("path"),
                        [DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                         for dataId, refid in zip(iterable(d["data_id"]), iterable(d["dataset_id"]))],
                        formatter=doImport(d.get("formatter")) if "formatter" in d else None
                    ),
                    d.get("collections", [])
                )
                for d in data["records"]
            )

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for run in self.runs:
            self.registry.registerRun(run)
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)

    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, records in self.dimensions.items():
            self.registry.insertDimensionData(element, *records)
        # Mapping from collection name to list of DatasetRefs to associate.
        collections = defaultdict(list)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            datasetType = self.registry.getDatasetType(datasetTypeName)
            # Make a big flattened list of all data IDs, while remembering
            # slices that associate them with the FileDataset instances they
            # came from.
            dataIds = []
            slices = []
            for fileDataset, _ in records:
                start = len(dataIds)
                dataIds.extend(ref.dataId for ref in fileDataset.refs)
                stop = len(dataIds)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry.insertDatasets(
                datasetType,
                dataIds=dataIds,
                run=run,
                recursive=True
            )
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, (fileDataset, collectionsForDataset) in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = os.path.join(directory, fileDataset.path)
                fileDatasets.append(fileDataset)
                for collection in collectionsForDataset:
                    collections[collection].extend(fileDataset.refs)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate with collections, one collection at a time.
        for collection, refs in collections.items():
            self.registry.associate(collection, refs)