
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["FileDataset", "RepoExport",
           "RepoExportBackend", "RepoImportBackend", "RepoTransferFormatConfig",
           "YamlRepoExportBackend", "YamlRepoImportBackend"]

import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from typing import TYPE_CHECKING, Iterable, Optional, IO, List, Mapping, Tuple, Callable, Union
from collections import defaultdict

import yaml
import astropy.time

from lsst.utils import doImport
from .config import ConfigSubset
from .datasets import DatasetType, DatasetRef
from .utils import NamedValueSet, iterable

if TYPE_CHECKING:
    from .dimensions import DimensionElement, DimensionRecord, ExpandedDataCoordinate
    from ..registry import Registry
    from .datastore import Datastore
    from .formatter import FormatterParameter



class RepoTransferFormatConfig(ConfigSubset):
    """The section of butler configuration that associates repo import/export
    backends with file formats.
    """
    component = "repo_transfer_formats"
    defaultConfigFile = "repo_transfer_formats.yaml"


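# Purely illustrative sketch of what this configuration section might look
# like; the backend class paths below are assumptions for the example, not
# necessarily the shipped contents of ``repo_transfer_formats.yaml``:
#
#     repo_transfer_formats:
#       yaml:
#         export: lsst.daf.butler.core.repoTransfers.YamlRepoExportBackend
#         import: lsst.daf.butler.core.repoTransfers.YamlRepoImportBackend

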

@dataclass
class FileDataset:
    """A struct that represents a dataset exported to a file.
    """
    __slots__ = ("refs", "path", "formatter")

    refs: List[DatasetRef]
    """Registry information about the dataset (`list` of `DatasetRef`).
    """

    path: str
    """Path to the dataset (`str`).

    If the dataset was exported with ``transfer=None`` (i.e. in-place),
    this is relative to the datastore root (only datastores that have a
    well-defined root in the local filesystem can be expected to support
    in-place exports). Otherwise this is relative to the directory passed
    to `Datastore.export`.
    """

    formatter: FormatterParameter
    """A `Formatter` class or fully-qualified name.
    """

    def __init__(self, path: str, refs: Union[DatasetRef, List[DatasetRef]], *,
                 formatter: FormatterParameter = None):
        self.path = path
        if isinstance(refs, DatasetRef):
            refs = [refs]
        self.refs = refs
        self.formatter = formatter


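# Hedged illustration of how export code typically populates the struct above:
# one file path, one or more resolved `DatasetRef` objects that point at it,
# and an optional formatter given by fully-qualified name. The path and
# formatter strings here are made up for the example.
def _exampleFileDataset(refs: List[DatasetRef]) -> FileDataset:
    """Return a `FileDataset` describing a hypothetical exported file."""
    return FileDataset(
        "exports/exposure-000001.fits",
        refs,
        formatter="mypackage.formatters.MyFitsFormatter",
    )

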

class RepoExport:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `str`, optional
        Directory to pass to `Datastore.export`.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(self, registry: Registry, datastore: Datastore, backend: RepoExportBackend, *,
                 directory: Optional[str] = None, transfer: Optional[str] = None):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._dataset_ids = set()

    def saveDataIds(self, dataIds: Iterable[ExpandedDataCoordinate], *,
                    elements: Optional[Iterable[DimensionElement]] = None):
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `ExpandedDataCoordinate`
            Fully-expanded data IDs to export.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported. If `None`,
            records for all dimensions will be exported.
        """
        if elements is None:
            elements = frozenset(element for element in self._registry.dimensions.elements
                                 if element.hasTable() and element.viewOf is None)
        else:
            elements = frozenset(elements)
        records = defaultdict(dict)
        for dataId in dataIds:
            for record in dataId.records.values():
                if record.definition in elements:
                    records[record.definition].setdefault(record.dataId, record)
        for element in self._registry.dimensions.sorted(records.keys()):
            self._backend.saveDimensionData(element, *records[element].values())

    def saveDatasets(self, refs: Iterable[DatasetRef], *,
                     elements: Optional[Iterable[DimensionElement]] = None,
                     rewrite: Optional[Callable[[FileDataset], FileDataset]] = None):
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `Run`, and dimension
        records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export. Their `DatasetRef.id`
            attributes must not be `None`. Duplicates are automatically
            ignored.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`. This is typically used to rewrite the
            path generated by the datastore. If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with the collection that
        matches their run name. Other collections will be included in the
        export in the future (once `Registry` provides a way to look up that
        information).
        """
        dataIds = set()
        datasets: Mapping[Tuple[DatasetType, str], List[FileDataset]] = defaultdict(list)
        for ref in refs:
            # The query interfaces often used to generate the refs passed here
            # do not necessarily remove duplicates, so do that here for
            # convenience.
            if ref.id in self._dataset_ids:
                continue
            dataIds.add(ref.dataId)
            # TODO: we need to call getDataset here because most ways of
            # obtaining a DatasetRef (including queryDataset) don't populate
            # the run attribute. We should address that upstream in the
            # future.
            ref = self._registry.getDataset(ref.id, dataId=ref.dataId, datasetType=ref.datasetType)
            # `exports` is a single-element list here, because we anticipate
            # a future where more than just Datastore.export has a vectorized
            # API and we can pull this out of the loop.
            exports = self._datastore.export([ref], directory=self._directory, transfer=self._transfer)
            if rewrite is not None:
                exports = [rewrite(export) for export in exports]
            datasets[ref.datasetType, ref.run].extend(exports)
            self._dataset_ids.add(ref.id)
        self.saveDataIds(dataIds, elements=elements)
        for (datasetType, run), records in datasets.items():
            self._backend.saveDatasets(datasetType, run, *records)

    def _finish(self):
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        self._backend.finish()


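# Hedged sketch of the workflow this class supports, assuming ``butler`` is an
# existing `Butler` and ``refs`` are resolved `DatasetRef` objects obtained
# elsewhere (e.g. from a registry query). The rewrite callable shows one way
# to relocate exported paths.
def _exampleExport(butler, refs: Iterable[DatasetRef], filename: str = "export.yaml"):
    """Export ``refs`` and their dimension records to ``filename``."""
    def rewrite(dataset: FileDataset) -> FileDataset:
        # Prefix every exported path; purely illustrative.
        dataset.path = os.path.join("exported", dataset.path)
        return dataset

    with butler.export(filename=filename) as export:
        export.saveDatasets(refs, rewrite=rewrite)

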

class RepoExportBackend(ABC):
    """An abstract interface for data repository export implementations.
    """

    @abstractmethod
    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord):
        """Export one or more dimension element records.

        Parameters
        ----------
        element : `DimensionElement`
            The `DimensionElement` whose records are being exported.
        data : `DimensionRecord` (variadic)
            One or more records to export.
        """
        raise NotImplementedError()

    @abstractmethod
    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset,
                     collections: Iterable[str] = ()):
        """Export one or more datasets, including their associated DatasetType
        and run information (but not including associated dimension
        information).

        Parameters
        ----------
        datasetType : `DatasetType`
            Type of all datasets being exported with this call.
        run : `str`
            Run associated with all datasets being exported with this call.
        datasets : `FileDataset`, variadic
            Per-dataset information to be exported. `FileDataset.formatter`
            attributes should be strings, not `Formatter` instances or classes.
        collections : iterable of `str`
            Extra collections (in addition to ``run``) the dataset
            should be associated with.
        """
        raise NotImplementedError()

    @abstractmethod
    def finish(self):
        """Complete the export process.
        """
        raise NotImplementedError()


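# Hedged sketch of a minimal concrete backend, illustrating the interface
# above: it simply accumulates everything it is given in memory, which can be
# convenient in tests. It is not a backend shipped with daf_butler.
class _ExampleListExportBackend(RepoExportBackend):
    """Collect exported dimension records and datasets in plain Python lists."""

    def __init__(self):
        self.dimensionData = []
        self.datasets = []
        self.finished = False

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord):
        # Remember which element these records belong to.
        self.dimensionData.append((element, data))

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset,
                     collections: Iterable[str] = ()):
        # Group the per-file information by dataset type and run.
        self.datasets.append((datasetType, run, datasets, tuple(collections)))

    def finish(self):
        self.finished = True

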

class RepoImportBackend(ABC):
    """An abstract interface for data repository import implementations.

    Import backends are expected to be constructed with a description of
    the objects that need to be imported (from, e.g., a file written by the
    corresponding export backend), along with a `Registry`.
    """

    @abstractmethod
    def register(self):
        """Register all runs and dataset types associated with the backend
        with the `Registry` the backend was constructed with.

        These operations cannot be performed inside transactions, unlike those
        performed by `load`, and must in general be performed before `load`.
        """
        raise NotImplementedError()

    @abstractmethod
    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None):
        """Import information associated with the backend into the given
        registry and datastore.

        This must be run after `register`, and may be performed inside a
        transaction.

        Parameters
        ----------
        datastore : `Datastore`
            Datastore to import into. If `None`, datasets will only be
            inserted into the `Registry` (primarily intended for tests).
        directory : `str`, optional
            Directory all dataset paths are relative to.
        transfer : `str`, optional
            Transfer mode forwarded to `Datastore.ingest`.
        """
        raise NotImplementedError()


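# Hedged sketch of the two-phase contract described above: ``register`` runs
# outside any transaction, then ``load`` can run inside one. The ``registry``
# is assumed to provide the usual ``transaction`` context manager.
def _exampleImport(backend: RepoImportBackend, registry: Registry, datastore: Optional[Datastore], *,
                   directory: Optional[str] = None, transfer: Optional[str] = None):
    """Drive an import backend in the required register-then-load order."""
    backend.register()
    with registry.transaction():
        backend.load(datastore, directory=directory, transfer=transfer)

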

class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord):
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        # Convert astropy time in TAI to datetime in UTC for YAML.
        data_dicts = []
        for record in data:
            rec_dict = record.toDict()
            for key in rec_dict:
                if isinstance(rec_dict[key], astropy.time.Time):
                    rec_dict[key] = rec_dict[key].utc.to_datetime()
            data_dicts.append(rec_dict)
        self.data.append({
            "type": "dimension",
            "element": element.name,
            "records": data_dicts,  # TODO: encode regions
        })

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset):
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append({
            "type": "dataset_type",
            "name": datasetType.name,
            "dimensions": [d.name for d in datasetType.dimensions],
            "storage_class": datasetType.storageClass.name,
        })
        self.data.append({
            "type": "run",
            "name": run,
        })
        self.data.append({
            "type": "dataset",
            "dataset_type": datasetType.name,
            "run": run,
            "records": [
                {
                    "dataset_id": [ref.id for ref in dataset.refs],
                    "data_id": [ref.dataId.byName() for ref in dataset.refs],
                    "path": dataset.path,
                    "formatter": dataset.formatter,
                    # TODO: look up and save other collections
                }
                for dataset in datasets
            ],
        })

    def finish(self):
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": 0,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )


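# Hedged illustration of driving the YAML backend directly (normally
# `Butler.export` and `RepoExport` do this). The resulting document has the
# layout written by `finish` above: a ``description``, a ``version``, and a
# ``data`` list of ``dimension``/``dataset_type``/``run``/``dataset`` entries.
def _exampleYamlExport(filename: str, datasetType: DatasetType, run: str,
                       datasets: List[FileDataset]):
    """Write ``datasets`` of a single dataset type and run to ``filename``."""
    with open(filename, "w") as stream:
        backend = YamlRepoExportBackend(stream)
        backend.saveDatasets(datasetType, run, *datasets)
        backend.finish()

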

class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        # TODO: When version numbers become meaningful, check here that we can
        # read the version in the file.
        self.runs: List[str] = []
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy times.
                for record in data["records"]:
                    for key in record:
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                self.dimensions[element].extend(element.RecordClass.fromDict(r) for r in data["records"])
            elif data["type"] == "run":
                self.runs.append(data["name"])
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(data["name"], dimensions=data["dimensions"],
                                storageClass=data["storage_class"], universe=self.registry.dimensions)
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # Key is (dataset type name, run); innermost list is collections.
        self.datasets: Mapping[Tuple[str, str], List[Tuple[FileDataset, List[str]]]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                (
                    FileDataset(
                        d.get("path"),
                        [DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                         for dataId, refid in zip(iterable(d["data_id"]), iterable(d["dataset_id"]))],
                        formatter=doImport(d.get("formatter")) if "formatter" in d else None
                    ),
                    d.get("collections", [])
                )
                for d in data["records"]
            )


    def register(self):
        # Docstring inherited from RepoImportBackend.register.
        for run in self.runs:
            self.registry.registerRun(run)
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)

    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None):
        # Docstring inherited from RepoImportBackend.load.
        for element, records in self.dimensions.items():
            self.registry.insertDimensionData(element, *records)
        # Mapping from collection name to list of DatasetRefs to associate.
        collections = defaultdict(list)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            datasetType = self.registry.getDatasetType(datasetTypeName)
            # Make a big flattened list of all data IDs, while remembering
            # slices that associate them with the FileDataset instances they
            # came from.
            dataIds = []
            slices = []
            for fileDataset, _ in records:
                start = len(dataIds)
                dataIds.extend(ref.dataId for ref in fileDataset.refs)
                stop = len(dataIds)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry.insertDatasets(
                datasetType,
                dataIds=dataIds,
                run=run,
                recursive=True
            )
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, (fileDataset, collectionsForDataset) in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = os.path.join(directory, fileDataset.path)
                fileDatasets.append(fileDataset)
                for collection in collectionsForDataset:
                    collections[collection].extend(fileDataset.refs)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate with collections, one collection at a time.
        for collection, refs in collections.items():
            self.registry.associate(collection, refs)
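

# Hedged sketch of reading an export file back into a repository, assuming
# ``butler`` is a writeable `Butler` whose ``registry`` and ``datastore``
# attributes can be handed straight to the backend.
def _exampleYamlImport(butler, filename: str, *,
                       directory: Optional[str] = None, transfer: Optional[str] = None):
    """Import a YAML export file into ``butler``."""
    with open(filename, "r") as stream:
        backend = YamlRepoImportBackend(stream, butler.registry)
    backend.register()
    backend.load(butler.datastore, directory=directory, transfer=transfer)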