Coverage for python/lsst/daf/butler/transfers/_context.py: 10%

111 statements  

coverage.py v7.3.1, created at 2023-10-02 08:00 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["RepoExportContext"]

from collections import defaultdict
from collections.abc import Callable, Iterable, Set
from typing import TYPE_CHECKING

from ..core import (
    DataCoordinate,
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DimensionElement,
    DimensionRecord,
    FileDataset,
)
from ..registry import CollectionType, _ButlerRegistry
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord
from ._interfaces import RepoExportBackend

if TYPE_CHECKING:
    from lsst.resources import ResourcePathExpression


class RepoExportContext:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `_ButlerRegistry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `~lsst.resources.ResourcePathExpression`, optional
        Directory to pass to `Datastore.export`. Can be `None` to use
        the current working directory.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(
        self,
        registry: _ButlerRegistry,
        datastore: Datastore,
        backend: RepoExportBackend,
        *,
        directory: ResourcePathExpression | None = None,
        transfer: str | None = None,
    ):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
        self._dataset_ids: set[DatasetId] = set()
        self._datasets: dict[DatasetType, dict[str, list[FileDataset]]] = defaultdict(
            lambda: defaultdict(list)
        )
        self._collections: dict[str, CollectionRecord] = {}

    def saveCollection(self, name: str) -> None:
        """Export the given collection.

        Parameters
        ----------
        name : `str`
            Name of the collection.

        Notes
        -----
        `~CollectionType.RUN` collections are also exported automatically when
        any dataset referencing them is exported. They may also be explicitly
        exported via this method to export the collection with no datasets.
        Duplicate exports of collections are ignored.

        Exporting a `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
        collection will cause its associations with exported datasets to also
        be exported, but it does not export those datasets automatically.

        Exporting a `~CollectionType.CHAINED` collection does not
        automatically export its child collections; these must be explicitly
        exported or already be present in the repository they are being
        imported into.
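
        Examples
        --------
        A minimal sketch of intended usage; ``butler`` is assumed to be an
        existing `Butler`, and ``"refcats"`` is a placeholder collection
        name::

            with butler.export(filename="export.yaml") as export:
                export.saveCollection("refcats")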

        """
        self._collections[name] = self._registry._get_collection_record(name)

    def saveDimensionData(
        self, element: str | DimensionElement, records: Iterable[dict | DimensionRecord]
    ) -> None:
        """Export the given dimension records associated with one or more data
        IDs.

        Parameters
        ----------
        element : `str` or `DimensionElement`
            `DimensionElement` or `str` indicating the logical table these
            records are from.
        records : `~collections.abc.Iterable` [ `DimensionRecord` or `dict` ]
            Records to export, as an iterable containing `DimensionRecord` or
            `dict` instances.
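
        Examples
        --------
        A minimal sketch; the instrument name ``"HSC"`` is a placeholder and
        ``butler`` is assumed to be an existing `Butler`::

            records = butler.registry.queryDimensionRecords(
                "detector", instrument="HSC"
            )
            with butler.export(filename="export.yaml") as export:
                export.saveDimensionData("detector", records)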

        """
        if not isinstance(element, DimensionElement):
            element = self._registry.dimensions[element]
        for record in records:
            if not isinstance(record, DimensionRecord):
                record = element.RecordClass(**record)
            elif record.definition != element:
                raise ValueError(
                    f"Mismatch between element={element.name} and "
                    f"dimension record with definition={record.definition.name}."
                )
            self._records[element].setdefault(record.dataId, record)

    def saveDataIds(
        self,
        dataIds: Iterable[DataCoordinate],
        *,
        elements: Iterable[str | DimensionElement] | None = None,
    ) -> None:
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `DataCoordinate`
            Data IDs to export. For large numbers of data IDs obtained by
            calls to `Registry.queryDataIds`, it will be much more efficient if
            these are expanded to include records (i.e.
            `DataCoordinate.hasRecords` returns `True`) prior to the call to
            `saveDataIds` via e.g. ``Registry.queryDataIds(...).expanded()``.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported. If `None`,
            records for all dimensions will be exported.
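
        Examples
        --------
        A minimal sketch; the dimension and instrument names are placeholders
        and ``butler`` is assumed to be an existing `Butler`::

            data_ids = butler.registry.queryDataIds(
                ["exposure", "detector"], instrument="HSC"
            ).expanded()
            with butler.export(filename="export.yaml") as export:
                export.saveDataIds(data_ids)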

        """
        standardized_elements: Set[DimensionElement]
        if elements is None:
            standardized_elements = frozenset(
                element
                for element in self._registry.dimensions.getStaticElements()
                if element.hasTable() and element.viewOf is None
            )
        else:
            standardized_elements = set()
            for element in elements:
                if not isinstance(element, DimensionElement):
                    element = self._registry.dimensions[element]
                if element.hasTable() and element.viewOf is None:
                    standardized_elements.add(element)
        for dataId in dataIds:
            # This is potentially quite slow, because it's approximately
            # len(dataId.graph.elements) queries per data ID. But it's a no-op
            # if the data ID is already expanded, and DM-26692 will add (or at
            # least start to add / unblock) query functionality that should
            # let us speed this up internally as well.
            dataId = self._registry.expandDataId(dataId)
            for record in dataId.records.values():
                if record is not None and record.definition in standardized_elements:
                    self._records[record.definition].setdefault(record.dataId, record)

    def saveDatasets(
        self,
        refs: Iterable[DatasetRef],
        *,
        elements: Iterable[str | DimensionElement] | None = None,
        rewrite: Callable[[FileDataset], FileDataset] | None = None,
    ) -> None:
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `~CollectionType.RUN`
        collections, and dimension records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export. Their `DatasetRef.id`
            attributes must not be `None`. Duplicates are automatically
            ignored. Nested data IDs must have `DataCoordinate.hasRecords`
            return `True`. If any reference is to a component dataset, the
            parent will be exported instead.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`. This is typically used to rewrite the
            path generated by the datastore. If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with `~CollectionType.RUN`
        collections. Other collections will be included in the export in the
        future (once `Registry` provides a way to look up that information).
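
        Examples
        --------
        A minimal sketch; the dataset type and collection names are
        placeholders and ``butler`` is assumed to be an existing `Butler`::

            refs = butler.registry.queryDatasets(
                "calexp", collections="HSC/runs/RC2"
            )
            with butler.export(filename="export.yaml") as export:
                export.saveDatasets(refs)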

        """
        data_ids = set()
        refs_to_export = {}
        for ref in sorted(refs):
            dataset_id = ref.id
            # The query interfaces that are often used to generate the refs
            # passed here often don't remove duplicates, so do that here for
            # convenience.
            if dataset_id in self._dataset_ids or dataset_id in refs_to_export:
                continue
            # Also convert components to composites.
            if ref.isComponent():
                ref = ref.makeCompositeRef()
            data_ids.add(ref.dataId)
            refs_to_export[dataset_id] = ref
        # Do a vectorized datastore export, which might be a lot faster than
        # one-by-one.
        exports = self._datastore.export(
            refs_to_export.values(),
            directory=self._directory,
            transfer=self._transfer,
        )
        # Export associated data IDs.
        self.saveDataIds(data_ids, elements=elements)
        # Rewrite export filenames if desired, and then save them to the
        # data structure we'll write in `_finish`.
        # If a single exported FileDataset has multiple DatasetRefs, we save
        # it with each of them.
        for file_dataset in exports:
            if rewrite is not None:
                file_dataset = rewrite(file_dataset)
            for ref in file_dataset.refs:
                assert ref.run is not None
                self._datasets[ref.datasetType][ref.run].append(file_dataset)
        self._dataset_ids.update(refs_to_export.keys())

    def _finish(self) -> None:
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        for element in self._registry.dimensions.sorted(self._records.keys()):
            # To make the export deterministic, sort the DataCoordinate
            # instances.
            r = self._records[element]
            self._backend.saveDimensionData(element, *[r[dataId] for dataId in sorted(r.keys())])
        for datasetsByRun in self._datasets.values():
            for run in datasetsByRun:
                self._collections[run] = self._registry._get_collection_record(run)
        for collectionName in self._computeSortedCollections():
            doc = self._registry.getCollectionDocumentation(collectionName)
            self._backend.saveCollection(self._collections[collectionName], doc)
        # Sort the dataset types and runs before exporting to ensure a
        # reproducible order in the export file.
        for datasetType in sorted(self._datasets.keys()):
            for run in sorted(self._datasets[datasetType].keys()):
                # Sort the FileDatasets as well.
                records = sorted(self._datasets[datasetType][run])
                self._backend.saveDatasets(datasetType, run, *records)
        # Export associations between datasets and collections. These need to
        # be sorted (at two levels; they're dicts) or created more
        # deterministically, too, which probably involves more data ID
        # sorting.
        datasetAssociations = self._computeDatasetAssociations()
        for collection in sorted(datasetAssociations):
            self._backend.saveDatasetAssociations(
                collection, self._collections[collection].type, sorted(datasetAssociations[collection])
            )
        self._backend.finish()

    def _computeSortedCollections(self) -> list[str]:
        """Sort collections in a way that is both deterministic and safe
        for registering them in a new repo in the presence of nested chains.

        This method is intended for internal use by `RepoExportContext` only.

        Returns
        -------
        names : `list` [ `str` ]
            Ordered list of collection names.
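
        Notes
        -----
        Children always precede their parents in the result so that chains
        can be registered safely on import. As a hypothetical example, if
        CHAINED collection ``"a"`` contains CHAINED collection ``"b"``, which
        in turn contains RUN collection ``"r"``, the returned order is
        ``["r", "b", "a"]``.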

        """
        # Split collections into CHAINED and everything else, and just
        # sort "everything else" lexicographically since there are no
        # dependencies.
        chains: dict[str, list[str]] = {}
        result: list[str] = []
        for record in self._collections.values():
            if record.type is CollectionType.CHAINED:
                assert isinstance(record, ChainedCollectionRecord)
                chains[record.name] = list(record.children)
            else:
                result.append(record.name)
        result.sort()
        # Sort all chains topologically, breaking ties lexicographically.
        # Append these to 'result' and remove them from 'chains' as we go.
        while chains:
            unblocked = {
                parent
                for parent, children in chains.items()
                if not any(child in chains for child in children)
            }
            if not unblocked:
                raise RuntimeError(
                    f"Apparent cycle in CHAINED collection dependencies involving {set(chains)}."
                )
            result.extend(sorted(unblocked))
            for name in unblocked:
                del chains[name]
        return result

    def _computeDatasetAssociations(self) -> dict[str, list[DatasetAssociation]]:
        """Return dataset-collection associations, grouped by collection.

        This queries for all associations between exported datasets and
        exported TAGGED or CALIBRATION collections and is intended to be run
        only by `_finish`, as this ensures all collections and all datasets
        have already been exported and hence the order in which they are
        exported does not matter.

        Returns
        -------
        associations : `dict` [ `str`, `list` [ `DatasetAssociation` ] ]
            Dictionary keyed by collection name, with values that are lists of
            structs representing an association between that collection and a
            dataset.
        """
        results = defaultdict(list)
        for datasetType in self._datasets:
            # We query for _all_ datasets of each dataset type we export, in
            # the specific collections we are exporting. The worst-case
            # efficiency of this is _awful_ (i.e. big repo, exporting a tiny
            # subset). But we don't have any better options right now; we
            # need a way to query for a _lot_ of explicitly given dataset_ids,
            # and the only way to make that scale up is to either upload them
            # to a temporary table or recognize when they are already in one
            # because the user passed us a QueryResult object. That's blocked
            # by (at least) DM-26692.
            collectionTypes = {CollectionType.TAGGED}
            if datasetType.isCalibration():
                collectionTypes.add(CollectionType.CALIBRATION)
            associationIter = self._registry.queryDatasetAssociations(
                datasetType,
                collections=self._collections.keys(),
                collectionTypes=collectionTypes,
                flattenChains=False,
            )
            for association in associationIter:
                if association.ref.id in self._dataset_ids:
                    results[association.collection].append(association)
        return results