Coverage for python/lsst/daf/butler/transfers/_context.py: 12%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["RepoExportContext"]

from collections import defaultdict
from collections.abc import Callable, Iterable, Set
from typing import TYPE_CHECKING

from .._dataset_association import DatasetAssociation
from .._dataset_ref import DatasetId, DatasetRef
from .._dataset_type import DatasetType
from .._file_dataset import FileDataset
from ..datastore import Datastore
from ..dimensions import DataCoordinate, DimensionElement, DimensionRecord
from ..registry import CollectionType, _ButlerRegistry
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord
from ._interfaces import RepoExportBackend

if TYPE_CHECKING:
    from lsst.resources import ResourcePathExpression


class RepoExportContext:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `_ButlerRegistry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `~lsst.resources.ResourcePathExpression`, optional
        Directory to pass to `Datastore.export`. Can be `None` to use
        the current working directory.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(
        self,
        registry: _ButlerRegistry,
        datastore: Datastore,
        backend: RepoExportBackend,
        *,
        directory: ResourcePathExpression | None = None,
        transfer: str | None = None,
    ):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
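        # The remaining attributes accumulate everything the save* methods
        # collect; _finish() writes them out through the backend in a
        # deterministic order.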

        self._records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
        self._dataset_ids: set[DatasetId] = set()
        self._datasets: dict[DatasetType, dict[str, list[FileDataset]]] = defaultdict(
            lambda: defaultdict(list)
        )
        self._collections: dict[str, CollectionRecord] = {}

    def saveCollection(self, name: str) -> None:
        """Export the given collection.

        Parameters
        ----------
        name : `str`
            Name of the collection.

        Notes
        -----
        `~CollectionType.RUN` collections are also exported automatically when
        any dataset referencing them is exported. They may also be explicitly
        exported via this method to export the collection with no datasets.
        Duplicate exports of collections are ignored.

        Exporting a `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
        collection will cause its associations with exported datasets to also
        be exported, but it does not export those datasets automatically.

        Exporting a `~CollectionType.CHAINED` collection does not automatically
        export its child collections; these must be explicitly exported or
        already be present in the repository they are being imported into.
        """

        self._collections[name] = self._registry._get_collection_record(name)

    def saveDimensionData(
        self, element: str | DimensionElement, records: Iterable[dict | DimensionRecord]
    ) -> None:
        """Export the given dimension records associated with one or more data
        IDs.

        Parameters
        ----------
        element : `str` or `DimensionElement`
            `DimensionElement` or `str` indicating the logical table these
            records are from.
        records : `~collections.abc.Iterable` [ `DimensionRecord` or `dict` ]
            Records to export, as an iterable containing `DimensionRecord` or
            `dict` instances.
        """

        if not isinstance(element, DimensionElement):
            element = self._registry.dimensions[element]
        for record in records:
            if not isinstance(record, DimensionRecord):
                record = element.RecordClass(**record)
            elif record.definition != element:
                raise ValueError(
                    f"Mismatch between element={element.name} and "
                    f"dimension record with definition={record.definition.name}."
                )
            self._records[element].setdefault(record.dataId, record)

    def saveDataIds(
        self,
        dataIds: Iterable[DataCoordinate],
        *,
        elements: Iterable[str | DimensionElement] | None = None,
    ) -> None:
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `DataCoordinate`
            Data IDs to export. For large numbers of data IDs obtained by
            calls to `Registry.queryDataIds`, it will be much more efficient if
            these are expanded to include records (i.e.
            `DataCoordinate.hasRecords` returns `True`) prior to the call to
            `saveDataIds` via e.g. ``Registry.queryDataIds(...).expanded()``.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported. If `None`,
            records for all dimensions will be exported.
        """

        standardized_elements: Set[DimensionElement]
        if elements is None:
            standardized_elements = frozenset(
                element
                for element in self._registry.dimensions.getStaticElements()
                if element.hasTable() and element.viewOf is None
            )
        else:
            standardized_elements = set()
            for element in elements:
                if not isinstance(element, DimensionElement):
                    element = self._registry.dimensions[element]
                if element.hasTable() and element.viewOf is None:
                    standardized_elements.add(element)
        for dataId in dataIds:
            # This is potentially quite slow, because it's approximately
            # len(dataId.graph.elements) queries per data ID. But it's a no-op
            # if the data ID is already expanded, and DM-26692 will add (or at
            # least start to add / unblock) query functionality that should
            # let us speed this up internally as well.
            dataId = self._registry.expandDataId(dataId)
            for record in dataId.records.values():
                if record is not None and record.definition in standardized_elements:
                    self._records[record.definition].setdefault(record.dataId, record)

    def saveDatasets(
        self,
        refs: Iterable[DatasetRef],
        *,
        elements: Iterable[str | DimensionElement] | None = None,
        rewrite: Callable[[FileDataset], FileDataset] | None = None,
    ) -> None:
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `~CollectionType.RUN`
        collections, and dimension records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export. Their `DatasetRef.id`
            attributes must not be `None`. Duplicates are automatically
            ignored. Nested data IDs must have `DataCoordinate.hasRecords`
            return `True`. If any reference is to a component dataset, the
            parent will be exported instead.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`. This is typically used to rewrite the
            path generated by the datastore. If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with `~CollectionType.RUN`
        collections. Other collections will be included in the export in the
        future (once `Registry` provides a way to look up that information).
        """

        data_ids = set()
        refs_to_export = {}
        for ref in sorted(refs):
            dataset_id = ref.id
            # The query interfaces that are often used to generate the refs
            # passed here don't always remove duplicates, so do that here for
            # convenience.
            if dataset_id in self._dataset_ids or dataset_id in refs_to_export:
                continue
            # Also convert components to composites.
            if ref.isComponent():
                ref = ref.makeCompositeRef()
            data_ids.add(ref.dataId)
            refs_to_export[dataset_id] = ref
        # Do a vectorized datastore export, which might be a lot faster than
        # one-by-one.
        exports = self._datastore.export(
            refs_to_export.values(),
            directory=self._directory,
            transfer=self._transfer,
        )
        # Export associated data IDs.
        self.saveDataIds(data_ids, elements=elements)
        # Rewrite export filenames if desired, and then save them to the
        # data structure we'll write in `_finish`.
        # If a single exported FileDataset has multiple DatasetRefs, we save
        # it with each of them.
        for file_dataset in exports:
            if rewrite is not None:
                file_dataset = rewrite(file_dataset)
            for ref in file_dataset.refs:
                assert ref.run is not None
                self._datasets[ref.datasetType][ref.run].append(file_dataset)
        self._dataset_ids.update(refs_to_export.keys())

    def _finish(self) -> None:
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """

        for element in self._registry.dimensions.sorted(self._records.keys()):
            # To make the export deterministic, sort the DataCoordinate
            # instances.
            r = self._records[element]
            self._backend.saveDimensionData(element, *[r[dataId] for dataId in sorted(r.keys())])
        for datasetsByRun in self._datasets.values():
            for run in datasetsByRun:
                self._collections[run] = self._registry._get_collection_record(run)
        for collectionName in self._computeSortedCollections():
            doc = self._registry.getCollectionDocumentation(collectionName)
            self._backend.saveCollection(self._collections[collectionName], doc)
        # Sort the dataset types and runs before exporting to ensure a
        # reproducible order in the export file.
        for datasetType in sorted(self._datasets.keys()):
            for run in sorted(self._datasets[datasetType].keys()):
                # Sort the FileDatasets for deterministic output.
                records = sorted(self._datasets[datasetType][run])
                self._backend.saveDatasets(datasetType, run, *records)
        # Export associations between datasets and collections. These need to
        # be sorted (at two levels; they're dicts) or created more
        # deterministically, too, which probably involves more data ID sorting.
        datasetAssociations = self._computeDatasetAssociations()
        for collection in sorted(datasetAssociations):
            self._backend.saveDatasetAssociations(
                collection, self._collections[collection].type, sorted(datasetAssociations[collection])
            )
        self._backend.finish()

    def _computeSortedCollections(self) -> list[str]:
        """Sort collections in a way that is both deterministic and safe
        for registering them in a new repo in the presence of nested chains.

        This method is intended for internal use by `RepoExportContext` only.

        Returns
        -------
        names : `list` [ `str` ]
            Ordered list of collection names.
        """
        # Split collections into CHAINED and everything else, and just
        # sort "everything else" lexicographically since there are no
        # dependencies.
        chains: dict[str, list[str]] = {}
        result: list[str] = []
        for record in self._collections.values():
            if record.type is CollectionType.CHAINED:
                assert isinstance(record, ChainedCollectionRecord)
                chains[record.name] = list(record.children)
            else:
                result.append(record.name)
        result.sort()
        # Sort all chains topologically, breaking ties lexicographically.
        # Append these to 'result' and remove them from 'chains' as we go.
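        # For example (hypothetical names), with chains {"a": ["b"], "b": ["c"]}
        # and a non-chained collection "c", the resulting order is
        # ["c", "b", "a"].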

        while chains:
            unblocked = {
                parent
                for parent, children in chains.items()
                if not any(child in chains for child in children)
            }
            if not unblocked:
                raise RuntimeError(
                    f"Apparent cycle in CHAINED collection dependencies involving {set(chains)}."
                )
            result.extend(sorted(unblocked))
            for name in unblocked:
                del chains[name]
        return result

    def _computeDatasetAssociations(self) -> dict[str, list[DatasetAssociation]]:
        """Return dataset-collection associations, grouped by collection.

        This queries for all associations between exported datasets and
        exported TAGGED or CALIBRATION collections and is intended to be run
        only by `_finish`, as this ensures all collections and all datasets
        have already been exported and hence the order in which they are
        exported does not matter.

        Returns
        -------
        associations : `dict` [ `str`, `list` [ `DatasetAssociation` ] ]
            Dictionary keyed by collection name, whose values are lists of
            structs representing an association between that collection and a
            dataset.
        """

        results = defaultdict(list)
        for datasetType in self._datasets:
            # We query for _all_ datasets of each dataset type we export, in
            # the specific collections we are exporting. The worst-case
            # efficiency of this is _awful_ (i.e. big repo, exporting a tiny
            # subset). But we don't have any better options right now; we need
            # a way to query for a _lot_ of explicitly given dataset_ids, and
            # the only way to make that scale up is to either upload them to a
            # temporary table or recognize when they are already in one because
            # the user passed us a QueryResult object. That's blocked by (at
            # least) DM-26692.
            collectionTypes = {CollectionType.TAGGED}
            if datasetType.isCalibration():
                collectionTypes.add(CollectionType.CALIBRATION)
            associationIter = self._registry.queryDatasetAssociations(
                datasetType,
                collections=self._collections.keys(),
                collectionTypes=collectionTypes,
                flattenChains=False,
            )
            for association in associationIter:
                if association.ref.id in self._dataset_ids:
                    results[association.collection].append(association)
        return results