# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["FileDataset", "RepoExport",
           "RepoExportBackend", "RepoImportBackend", "RepoTransferFormatConfig",
           "YamlRepoExportBackend", "YamlRepoImportBackend"]

import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING, Iterable, Optional, IO, List, Mapping, Tuple, Callable, Union
from collections import defaultdict

import yaml

from lsst.utils import doImport
from .config import ConfigSubset
from .datasets import DatasetType, DatasetRef
from .utils import NamedValueSet, iterable

if TYPE_CHECKING:
    from .dimensions import DimensionElement, DimensionRecord, ExpandedDataCoordinate
    from ..registry import Registry
    from .datastore import Datastore
    from .formatter import FormatterParameter


class RepoTransferFormatConfig(ConfigSubset):
    """The section of butler configuration that associates repo import/export
    backends with file formats.
    """
    component = "repo_transfer_formats"
    defaultConfigFile = "repo_transfer_formats.yaml"
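
# Note: the "repo_transfer_formats" configuration section referenced above is
# expected to map a file-format name to the export/import backend classes that
# handle it. The layout below is an illustrative sketch only; the exact keys
# are an assumption, not taken from this module:
#
#     repo_transfer_formats:
#       yaml:
#         export: lsst.daf.butler.core.repoTransfers.YamlRepoExportBackend
#         import: lsst.daf.butler.core.repoTransfers.YamlRepoImportBackend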



@dataclass
class FileDataset:
    """A struct that represents a dataset exported to a file.
    """
    __slots__ = ("refs", "path", "formatter")

    refs: List[DatasetRef]
    """Registry information about the dataset (`list` of `DatasetRef`).
    """

    path: str
    """Path to the dataset (`str`).

    If the dataset was exported with ``transfer=None`` (i.e. in-place),
    this is relative to the datastore root (only datastores that have a
    well-defined root in the local filesystem can be expected to support
    in-place exports). Otherwise this is relative to the directory passed
    to `Datastore.export`.
    """

    formatter: FormatterParameter
    """A `Formatter` class or fully-qualified name.
    """

    def __init__(self, path: str, refs: Union[DatasetRef, List[DatasetRef]], *,
                 formatter: FormatterParameter = None):
        self.path = path
        if isinstance(refs, DatasetRef):
            refs = [refs]
        self.refs = refs
        self.formatter = formatter
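
# Example (illustrative sketch only): pairing a single on-disk file with one
# DatasetRef. The dataset type (``flatType``), data ID values, path, and
# formatter name below are hypothetical and exist purely to show the call
# signature:
#
#     ref = DatasetRef(flatType, {"instrument": "HSC", "detector": 10})
#     exported = FileDataset(path="calib/flats/flat-10.fits", refs=ref,
#                            formatter="some.package.SomeFormatter")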



class RepoExport:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `str`, optional
        Directory to pass to `Datastore.export`.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """


    def __init__(self, registry: Registry, datastore: Datastore, backend: RepoExportBackend, *,
                 directory: Optional[str] = None, transfer: Optional[str] = None):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._dataset_ids = set()

    def saveDataIds(self, dataIds: Iterable[ExpandedDataCoordinate], *,
                    elements: Optional[Iterable[DimensionElement]] = None):
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `ExpandedDataCoordinate`
            Fully-expanded data IDs to export.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported. If `None`,
            records for all dimensions will be exported.
        """
        if elements is None:
            elements = frozenset(element for element in self._registry.dimensions.elements
                                 if element.hasTable() and element.viewOf is None)
        else:
            elements = frozenset(elements)
        records = defaultdict(dict)
        for dataId in dataIds:
            for record in dataId.records.values():
                if record.definition in elements:
                    records[record.definition].setdefault(record.dataId, record)
        for element in self._registry.dimensions.sorted(records.keys()):
            self._backend.saveDimensionData(element, *records[element].values())


    def saveDatasets(self, refs: Iterable[DatasetRef], *,
                     elements: Optional[Iterable[DimensionElement]] = None,
                     rewrite: Optional[Callable[[FileDataset], FileDataset]] = None):
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `Run`, and dimension
        records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export. Their `DatasetRef.id`
            attributes must not be `None`. Duplicates are automatically
            ignored.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`. This is typically used to rewrite the
            path generated by the datastore. If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with the collection that
        matches their run name. Other collections will be included in the
        export in the future (once `Registry` provides a way to look up that
        information).
        """
        dataIds = set()
        datasets: Mapping[Tuple[DatasetType, str], List[FileDataset]] = defaultdict(list)
        for ref in refs:
            # The query interfaces that are used to generate the refs passed
            # here often don't remove duplicates, so do that here for
            # convenience.
            if ref.id in self._dataset_ids:
                continue
            dataIds.add(ref.dataId)
            # TODO: we need to call getDataset here because most ways of
            # obtaining a DatasetRef (including queryDataset) don't populate
            # the run attribute. We should address that upstream in the
            # future.
            ref = self._registry.getDataset(ref.id, dataId=ref.dataId, datasetType=ref.datasetType)
            # `exports` is a single-element list here, because we anticipate
            # a future where more than just Datastore.export has a vectorized
            # API and we can pull this out of the loop.
            exports = self._datastore.export([ref], directory=self._directory, transfer=self._transfer)
            if rewrite is not None:
                exports = [rewrite(export) for export in exports]
            datasets[ref.datasetType, ref.run].extend(exports)
            self._dataset_ids.add(ref.id)
        self.saveDataIds(dataIds, elements=elements)
        for (datasetType, run), records in datasets.items():
            self._backend.saveDatasets(datasetType, run, *records)


    def _finish(self):
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        self._backend.finish()
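
# Example (illustrative sketch only): a typical export session. ``butler``,
# the query arguments, and the path handling in ``rewrite`` are hypothetical;
# the structure follows the `Butler.export` usage shown in the class docstring:
#
#     def rewrite(dataset: FileDataset) -> FileDataset:
#         # Flatten exported paths into a single directory.
#         dataset.path = os.path.basename(dataset.path)
#         return dataset
#
#     with butler.export(filename="export.yaml", directory="exports",
#                        transfer="copy") as export:
#         refs = butler.registry.queryDatasets("calexp", collections=...)
#         export.saveDatasets(refs, rewrite=rewrite)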



class RepoExportBackend(ABC):
    """An abstract interface for data repository export implementations.
    """

    @abstractmethod
    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord):
        """Export one or more dimension element records.

        Parameters
        ----------
        element : `DimensionElement`
            The `DimensionElement` whose records are being exported.
        data : `DimensionRecord` (variadic)
            One or more records to export.
        """
        raise NotImplementedError()

    @abstractmethod
    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset,
                     collections: Iterable[str] = ()):
        """Export one or more datasets, including their associated DatasetType
        and run information (but not including associated dimension
        information).

        Parameters
        ----------
        datasetType : `DatasetType`
            Type of all datasets being exported with this call.
        run : `str`
            Run associated with all datasets being exported with this call.
        datasets : `FileDataset`, variadic
            Per-dataset information to be exported. `FileDataset.formatter`
            attributes should be strings, not `Formatter` instances or classes.
        collections : iterable of `str`
            Extra collections (in addition to ``run``) the dataset
            should be associated with.
        """
        raise NotImplementedError()

    @abstractmethod
    def finish(self):
        """Complete the export process.
        """
        raise NotImplementedError()
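
# Example (illustrative sketch only): a minimal in-memory export backend. It
# is hypothetical and only meant to show which methods a concrete
# `RepoExportBackend` must provide:
#
#     class ListExportBackend(RepoExportBackend):
#         """Collect exported records in a Python list instead of a file."""
#
#         def __init__(self):
#             self.exported = []
#
#         def saveDimensionData(self, element, *data):
#             self.exported.append(("dimension", element.name, list(data)))
#
#         def saveDatasets(self, datasetType, run, *datasets, collections=()):
#             self.exported.append(("dataset", datasetType.name, run, list(datasets)))
#
#         def finish(self):
#             pass  # nothing to flush for an in-memory backend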



class RepoImportBackend(ABC):
    """An abstract interface for data repository import implementations.

    Import backends are expected to be constructed with a description of
    the objects that need to be imported (from, e.g., a file written by the
    corresponding export backend), along with a `Registry`.
    """

    @abstractmethod
    def register(self):
        """Register all runs and dataset types associated with the backend
        with the `Registry` the backend was constructed with.

        These operations cannot be performed inside transactions, unlike those
        performed by `load`, and must in general be performed before `load`.
        """

    @abstractmethod
    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None):
        """Import information associated with the backend into the given
        registry and datastore.

        This must be run after `register`, and may be performed inside a
        transaction.

        Parameters
        ----------
        datastore : `Datastore`
            Datastore to import into. If `None`, datasets will only be
            inserted into the `Registry` (primarily intended for tests).
        directory : `str`, optional
            Directory all dataset paths are relative to.
        transfer : `str`, optional
            Transfer mode forwarded to `Datastore.ingest`.
        """
        raise NotImplementedError()
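
# Example (illustrative sketch only): the expected call order for any import
# backend. ``backend``, ``registry``, and ``datastore`` are assumed to exist
# already; whether `load` is wrapped in a transaction is up to the caller:
#
#     backend.register()            # runs + dataset types; not transactional
#     with registry.transaction():  # dimension records, datasets, collections
#         backend.load(datastore, directory="exports", transfer="symlink")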



class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord):
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        self.data.append({
            "type": "dimension",
            "element": element.name,
            "records": [d.toDict() for d in data],  # TODO: encode regions
        })

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset):
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append({
            "type": "dataset_type",
            "name": datasetType.name,
            "dimensions": [d.name for d in datasetType.dimensions],
            "storage_class": datasetType.storageClass.name,
        })
        self.data.append({
            "type": "run",
            "name": run,
        })
        self.data.append({
            "type": "dataset",
            "dataset_type": datasetType.name,
            "run": run,
            "records": [
                {
                    "dataset_id": [ref.id for ref in dataset.refs],
                    "data_id": [ref.dataId.byName() for ref in dataset.refs],
                    "path": dataset.path,
                    "formatter": dataset.formatter,
                    # TODO: look up and save other collections
                }
                for dataset in datasets
            ]
        })

    def finish(self):
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": 0,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
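
# For reference, `finish` writes a document shaped roughly like the following
# (field names taken from the dictionaries built above; the concrete values
# are invented for illustration):
#
#     description: Butler Data Repository Export
#     version: 0
#     data:
#       - type: dimension
#         element: detector
#         records: [...]
#       - type: dataset_type
#         name: calexp
#         dimensions: [instrument, visit, detector]
#         storage_class: ExposureF
#       - type: run
#         name: some/run
#       - type: dataset
#         dataset_type: calexp
#         run: some/run
#         records:
#           - dataset_id: [42]
#             data_id: [{instrument: HSC, visit: 903334, detector: 10}]
#             path: some/run/calexp-10.fits
#             formatter: some.package.SomeFormatter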



class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """


    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        # TODO: When version numbers become meaningful, check here that we can
        # read the version in the file.
        self.runs: List[str] = []
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                element = self.registry.dimensions[data["element"]]
                self.dimensions[element].extend(element.RecordClass.fromDict(r) for r in data["records"])
            elif data["type"] == "run":
                self.runs.append(data["name"])
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(data["name"], dimensions=data["dimensions"],
                                storageClass=data["storage_class"], universe=self.registry.dimensions)
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # Key is (dataset type name, run); innermost list is collections.
        self.datasets: Mapping[Tuple[str, str], List[Tuple[FileDataset, List[str]]]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                (
                    FileDataset(
                        d.get("path"),
                        [DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                         for dataId, refid in zip(iterable(d["data_id"]), iterable(d["dataset_id"]))],
                        formatter=doImport(d.get("formatter")) if "formatter" in d else None
                    ),
                    d.get("collections", [])
                )
                for d in data["records"]
            )


    def register(self):
        # Docstring inherited from RepoImportBackend.register.
        for run in self.runs:
            self.registry.registerRun(run)
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)


    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None):
        # Docstring inherited from RepoImportBackend.load.
        for element, records in self.dimensions.items():
            self.registry.insertDimensionData(element, *records)
        # Mapping from collection name to list of DatasetRefs to associate.
        collections = defaultdict(list)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            datasetType = self.registry.getDatasetType(datasetTypeName)
            # Make a big flattened list of all data IDs, while remembering
            # slices that associate them with the FileDataset instances they
            # came from.
            dataIds = []
            slices = []
            for fileDataset, _ in records:
                start = len(dataIds)
                dataIds.extend(ref.dataId for ref in fileDataset.refs)
                stop = len(dataIds)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry.insertDatasets(
                datasetType,
                dataIds=dataIds,
                run=run,
                recursive=True
            )
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, (fileDataset, collectionsForDataset) in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = os.path.join(directory, fileDataset.path)
                fileDatasets.append(fileDataset)
                for collection in collectionsForDataset:
                    collections[collection].extend(fileDataset.refs)
        # Ingest everything into the datastore at once.
        if datastore is not None:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate with collections, one collection at a time.
        for collection, refs in collections.items():
            self.registry.associate(collection, refs)
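
# Example (illustrative sketch only): reading an export file produced by
# `YamlRepoExportBackend` back into a repository. ``registry`` and
# ``datastore`` are assumed to come from an existing `Butler`; the file name
# and directory are hypothetical. The stream is fully consumed in the
# constructor, so the file can be closed before `register` and `load` run:
#
#     with open("exports/export.yaml") as stream:
#         backend = YamlRepoImportBackend(stream, registry)
#     backend.register()
#     backend.load(datastore, directory="exports", transfer="copy")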