
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["FileDataset", "RepoExport",
           "RepoExportBackend", "RepoImportBackend", "RepoTransferFormatConfig",
           "YamlRepoExportBackend", "YamlRepoImportBackend"]

import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING, Iterable, Optional, IO, List, Mapping, Tuple, Callable, Union, Type
from collections import defaultdict

import yaml

from lsst.utils import doImport
from .config import ConfigSubset
from .datasets import DatasetType, DatasetRef
from .utils import NamedValueSet, iterable

if TYPE_CHECKING:
    from .dimensions import DimensionElement, DimensionRecord, ExpandedDataCoordinate
    from ..registry import Registry
    from .datastore import Datastore
    from .formatters import Formatter

class RepoTransferFormatConfig(ConfigSubset):
    """The section of butler configuration that associates repo import/export
    backends with file formats.
    """
    component = "repo_transfer_formats"
    defaultConfigFile = "repo_transfer_formats.yaml"


@dataclass
class FileDataset:
    """A struct that represents a dataset exported to a file.
    """
    __slots__ = ("refs", "path", "formatter")

    refs: List[DatasetRef]
    """Registry information about the dataset (`list` of `DatasetRef`).
    """

    path: str
    """Path to the dataset (`str`).

    If the dataset was exported with ``transfer=None`` (i.e. in-place),
    this is relative to the datastore root (only datastores that have a
    well-defined root in the local filesystem can be expected to support
    in-place exports). Otherwise this is relative to the directory passed
    to `Datastore.export`.
    """

    formatter: Union[None, str, Type[Formatter]]
    """A `Formatter` class or fully-qualified name.
    """

    def __init__(self, path: str, refs: Union[DatasetRef, List[DatasetRef]], *,
                 formatter: Union[None, str, Type[Formatter]] = None):
        self.path = path
        if isinstance(refs, DatasetRef):
            refs = [refs]
        self.refs = refs
        self.formatter = formatter
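# Illustrative sketch (not part of the module API): a FileDataset pairs the
# registry-side description of a dataset (one or more DatasetRefs) with the
# file it lives in. The path below is a made-up example; a single ref is
# accepted and wrapped in a list by __init__, and `formatter` may stay `None`
# to defer to the datastore configuration.
def _example_file_dataset(ref: DatasetRef) -> FileDataset:
    return FileDataset("relative/path/to/dataset.fits", ref, formatter=None)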

class RepoExport:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `str`, optional
        Directory to pass to `Datastore.export`.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(self, registry: Registry, datastore: Datastore, backend: RepoExportBackend, *,
                 directory: Optional[str] = None, transfer: Optional[str] = None):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._dataset_ids = set()

    def saveDataIds(self, dataIds: Iterable[ExpandedDataCoordinate], *,
                    elements: Optional[Iterable[DimensionElement]] = None):
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `ExpandedDataCoordinate`
            Fully-expanded data IDs to export.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported. If `None`,
            records for all dimensions will be exported.
        """
        if elements is None:
            elements = frozenset(element for element in self._registry.dimensions.elements
                                 if element.hasTable() and element.viewOf is None)
        else:
            elements = frozenset(elements)
        records = defaultdict(dict)
        for dataId in dataIds:
            for record in dataId.records.values():
                if record.definition in elements:
                    records[record.definition].setdefault(record.dataId, record)
        for element in self._registry.dimensions.sorted(records.keys()):
            self._backend.saveDimensionData(element, *records[element].values())

    def saveDatasets(self, refs: Iterable[DatasetRef], *,
                     elements: Optional[Iterable[DimensionElement]] = None,
                     rewrite: Optional[Callable[[FileDataset], FileDataset]] = None):
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `Run`, and dimension
        records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export. Their `DatasetRef.id`
            attributes must not be `None`. Duplicates are automatically
            ignored.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`. This is typically used to rewrite the
            path generated by the datastore. If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with the collection that
        matches their run name. Other collections will be included in the
        export in the future (once `Registry` provides a way to look up that
        information).
        """
        dataIds = set()
        datasets: Mapping[Tuple[DatasetType, str], List[FileDataset]] = defaultdict(list)
        for ref in refs:
            # The query interfaces that are often used to generate the refs
            # passed here often don't remove duplicates, so do that here for
            # convenience.
            if ref.id in self._dataset_ids:
                continue
            dataIds.add(ref.dataId)
            # TODO: we need to call getDataset here because most ways of
            # obtaining a DatasetRef (including queryDataset) don't populate
            # the run attribute. We should address that upstream in the
            # future.
            ref = self._registry.getDataset(ref.id, dataId=ref.dataId, datasetType=ref.datasetType)
            # `exports` is a single-element list here, because we anticipate
            # a future where more than just Datastore.export has a vectorized
            # API and we can pull this out of the loop.
            exports = self._datastore.export([ref], directory=self._directory, transfer=self._transfer)
            if rewrite is not None:
                exports = [rewrite(export) for export in exports]
            datasets[ref.datasetType, ref.run].extend(exports)
            self._dataset_ids.add(ref.id)
        self.saveDataIds(dataIds, elements=elements)
        for (datasetType, run), records in datasets.items():
            self._backend.saveDatasets(datasetType, run, *records)

    def _finish(self):
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        self._backend.finish()
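# Usage sketch for RepoExport, expanding on the docstring example above.
# `butler` and `refs` are assumed to exist already (e.g. refs obtained from a
# registry query), and the `directory`/`transfer` arguments to Butler.export
# are assumptions for this example; the `flatten` callable is a hypothetical
# `rewrite` hook that flattens the exported file layout.
def _example_export(butler, refs: Iterable[DatasetRef]) -> None:
    def flatten(dataset: FileDataset) -> FileDataset:
        # Rewrite the datastore-generated path to just its basename.
        dataset.path = os.path.basename(dataset.path)
        return dataset

    with butler.export(filename="export.yaml", directory="export_dir",
                       transfer="copy") as export:
        # Dimension records for the datasets' data IDs are exported
        # automatically; saveDataIds is only needed for additional data IDs.
        export.saveDatasets(refs, rewrite=flatten)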

class RepoExportBackend(ABC):
    """An abstract interface for data repository export implementations.
    """

    @abstractmethod
    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord):
        """Export one or more dimension element records.

        Parameters
        ----------
        element : `DimensionElement`
            The `DimensionElement` whose records are being exported.
        data : `DimensionRecord` (variadic)
            One or more records to export.
        """
        raise NotImplementedError()

    @abstractmethod
    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset,
                     collections: Iterable[str] = ()):
        """Export one or more datasets, including their associated DatasetType
        and run information (but not including associated dimension
        information).

        Parameters
        ----------
        datasetType : `DatasetType`
            Type of all datasets being exported with this call.
        run : `str`
            Run associated with all datasets being exported with this call.
        datasets : `FileDataset`, variadic
            Per-dataset information to be exported. `FileDataset.formatter`
            attributes should be strings, not `Formatter` instances or classes.
        collections : iterable of `str`
            Extra collections (in addition to ``run``) the dataset
            should be associated with.
        """
        raise NotImplementedError()

    @abstractmethod
    def finish(self):
        """Complete the export process.
        """
        raise NotImplementedError()
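# Minimal sketch of a concrete RepoExportBackend, to illustrate the contract
# defined above: each save* call receives already-grouped objects, and
# finish() is where accumulated state should be written out. This toy backend
# only counts what it is given; it is illustrative and not part of daf_butler.
class _ExampleCountingExportBackend(RepoExportBackend):

    def __init__(self):
        self.dimensionRecordCount = 0
        self.datasetCount = 0

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord):
        # One call per dimension element, with all of its records at once.
        self.dimensionRecordCount += len(data)

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset,
                     collections: Iterable[str] = ()):
        # One call per (dataset type, run) combination.
        self.datasetCount += len(datasets)

    def finish(self):
        # A real backend would serialize its accumulated state here.
        pass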

class RepoImportBackend(ABC):
    """An abstract interface for data repository import implementations.

    Import backends are expected to be constructed with a description of
    the objects that need to be imported (from, e.g., a file written by the
    corresponding export backend), along with a `Registry`.
    """

    @abstractmethod
    def register(self):
        """Register all runs and dataset types associated with the backend
        with the `Registry` the backend was constructed with.

        These operations cannot be performed inside transactions, unlike those
        performed by `load`, and must in general be performed before `load`.
        """

    @abstractmethod
    def load(self, datastore: Datastore, *,
             directory: Optional[str] = None, transfer: Optional[str] = None):
        """Import information associated with the backend into the given
        datastore and the `Registry` the backend was constructed with.

        This must be run after `register`, and may be performed inside a
        transaction.

        Parameters
        ----------
        datastore : `Datastore`
            Datastore to import into.
        directory : `str`, optional
            Directory all dataset paths are relative to.
        transfer : `str`, optional
            Transfer mode forwarded to `Datastore.ingest`.
        """
        raise NotImplementedError()

class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord):
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        self.data.append({
            "type": "dimension",
            "element": element.name,
            "records": [d.toDict() for d in data],  # TODO: encode regions
        })

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset):
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append({
            "type": "dataset_type",
            "name": datasetType.name,
            "dimensions": [d.name for d in datasetType.dimensions],
            "storage_class": datasetType.storageClass.name,
        })
        self.data.append({
            "type": "run",
            "name": run,
        })
        self.data.append({
            "type": "dataset",
            "dataset_type": datasetType.name,
            "run": run,
            "records": [
                {
                    "dataset_id": [ref.id for ref in dataset.refs],
                    "data_id": [ref.dataId.byName() for ref in dataset.refs],
                    "path": dataset.path,
                    "formatter": dataset.formatter,
                    # TODO: look up and save other collections
                }
                for dataset in datasets
            ]
        })

    def finish(self):
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": 0,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
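# Illustrative sketch: driving YamlRepoExportBackend directly with an
# in-memory stream. Normally Butler.export constructs the backend and makes
# the save* calls via RepoExport; this only shows the call order (save*
# methods, then finish) and that the result is plain YAML with the top-level
# keys "description", "version", and "data".
def _example_yaml_export_backend() -> str:
    import io
    stream = io.StringIO()
    backend = YamlRepoExportBackend(stream)
    # ... calls to backend.saveDimensionData(...) / backend.saveDatasets(...)
    # would go here, typically made by RepoExport ...
    backend.finish()
    return stream.getvalue()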

class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        # TODO: When version numbers become meaningful, check here that we can
        # read the version in the file.
        self.runs: List[str] = []
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                element = self.registry.dimensions[data["element"]]
                self.dimensions[element].extend(element.RecordClass.fromDict(r) for r in data["records"])
            elif data["type"] == "run":
                self.runs.append(data["name"])
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(data["name"], dimensions=data["dimensions"],
                                storageClass=data["storage_class"], universe=self.registry.dimensions)
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # Key is (dataset type name, run); the innermost list holds the extra
        # collections each dataset should be associated with.
        self.datasets: Mapping[Tuple[str, str], List[Tuple[FileDataset, List[str]]]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                (
                    FileDataset(
                        d["path"],
                        [DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                         for dataId, refid in zip(iterable(d["data_id"]), iterable(d["dataset_id"]))],
                        formatter=doImport(d["formatter"])
                    ),
                    d.get("collections", [])
                )
                for d in data["records"]
            )

    def register(self):
        # Docstring inherited from RepoImportBackend.register.
        for run in self.runs:
            self.registry.registerRun(run)
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)

    def load(self, datastore: Datastore, *,
             directory: Optional[str] = None, transfer: Optional[str] = None):
        # Docstring inherited from RepoImportBackend.load.
        for element, records in self.dimensions.items():
            self.registry.insertDimensionData(element, *records)
        # Mapping from collection name to list of DatasetRefs to associate.
        collections = defaultdict(list)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            datasetType = self.registry.getDatasetType(datasetTypeName)
            # Make a big flattened list of all data IDs, while remembering
            # slices that associate them with the FileDataset instances they
            # came from.
            dataIds = []
            slices = []
            for fileDataset, _ in records:
                start = len(dataIds)
                dataIds.extend(ref.dataId for ref in fileDataset.refs)
                stop = len(dataIds)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry.insertDatasets(
                datasetType,
                dataIds=dataIds,
                run=run,
                recursive=True
            )
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, (fileDataset, collectionsForDataset) in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = os.path.join(directory, fileDataset.path)
                fileDatasets.append(fileDataset)
                for collection in collectionsForDataset:
                    collections[collection].extend(fileDataset.refs)
        # Ingest everything into the datastore at once.
        datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate with collections, one collection at a time.
        for collection, refs in collections.items():
            self.registry.associate(collection, refs)
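# Illustrative sketch (not part of the module API): importing a previously
# exported YAML file by driving YamlRepoImportBackend directly. In practice
# this is done through Butler's import machinery, but the required order is
# the one shown here: construct the backend, call register() outside any
# transaction, then call load(). The filename, directory, and transfer mode
# are placeholders.
def _example_yaml_import(registry: Registry, datastore: Datastore) -> None:
    with open("export.yaml", "r") as stream:
        backend = YamlRepoImportBackend(stream, registry)
    backend.register()  # runs and dataset types; not transactional
    backend.load(datastore, directory="export_dir", transfer="symlink")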