Coverage for python/lsst/daf/butler/transfers/_context.py: 12%

116 statements  

coverage.py v7.3.2, created at 2023-12-01 11:00 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["RepoExportContext"]

from collections import defaultdict
from collections.abc import Callable, Iterable, Set
from typing import TYPE_CHECKING

from .._dataset_association import DatasetAssociation
from .._dataset_ref import DatasetId, DatasetRef
from .._dataset_type import DatasetType
from .._file_dataset import FileDataset
from ..datastore import Datastore
from ..dimensions import DataCoordinate, DimensionElement, DimensionRecord
from ..registry import CollectionType
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord

if TYPE_CHECKING:
    from lsst.resources import ResourcePathExpression

    from ..registry.sql_registry import SqlRegistry
    from ._interfaces import RepoExportBackend


class RepoExportContext:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `SqlRegistry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `~lsst.resources.ResourcePathExpression`, optional
        Directory to pass to `Datastore.export`. Can be `None` to use
        the current working directory.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(
        self,
        registry: SqlRegistry,
        datastore: Datastore,
        backend: RepoExportBackend,
        *,
        directory: ResourcePathExpression | None = None,
        transfer: str | None = None,
    ):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
        self._dataset_ids: set[DatasetId] = set()
        self._datasets: dict[DatasetType, dict[str, list[FileDataset]]] = defaultdict(
            lambda: defaultdict(list)
        )
        self._collections: dict[str, CollectionRecord] = {}

    def saveCollection(self, name: str) -> None:
        """Export the given collection.

        Parameters
        ----------
        name : `str`
            Name of the collection.

        Notes
        -----
        `~CollectionType.RUN` collections are also exported automatically when
        any dataset referencing them is exported. They may also be explicitly
        exported via this method to export the collection with no datasets.
        Duplicate exports of collections are ignored.

        Exporting a `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
        collection will cause its associations with exported datasets to also
        be exported, but it does not export those datasets automatically.

        Exporting a `~CollectionType.CHAINED` collection does not automatically
        export its child collections; these must be explicitly exported or
        already be present in the repository they are being imported into.
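
        Examples
        --------
        A minimal sketch; the collection name is hypothetical, and the
        children of the chain are exported one by one because they are not
        exported automatically::

            with butler.export(filename="export.yaml") as export:
                export.saveCollection("MyCam/my_chain")
                for child in butler.registry.getCollectionChain("MyCam/my_chain"):
                    export.saveCollection(child)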

        """
        self._collections[name] = self._registry._get_collection_record(name)

    def saveDimensionData(
        self, element: str | DimensionElement, records: Iterable[dict | DimensionRecord]
    ) -> None:
        """Export the given dimension records associated with one or more data
        IDs.

        Parameters
        ----------
        element : `str` or `DimensionElement`
            `DimensionElement` or `str` indicating the logical table these
            records are from.
        records : `~collections.abc.Iterable` [ `DimensionRecord` or `dict` ]
            Records to export, as an iterable containing `DimensionRecord` or
            `dict` instances.
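
        Examples
        --------
        A minimal sketch; the element name and record values are illustrative
        only, and `dict` records must provide fields matching the schema of
        the chosen dimension element in the repository::

            records = [{"name": "MyCam", "class_name": "lsst.obs.mycam.MyCam"}]
            with butler.export(filename="export.yaml") as export:
                export.saveDimensionData("instrument", records)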

        """
        if not isinstance(element, DimensionElement):
            element = self._registry.dimensions[element]
        for record in records:
            if not isinstance(record, DimensionRecord):
                record = element.RecordClass(**record)
            elif record.definition != element:
                raise ValueError(
                    f"Mismatch between element={element.name} and "
                    f"dimension record with definition={record.definition.name}."
                )
            self._records[element].setdefault(record.dataId, record)

    def saveDataIds(
        self,
        dataIds: Iterable[DataCoordinate],
        *,
        elements: Iterable[str | DimensionElement] | None = None,
    ) -> None:
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `DataCoordinate`
            Data IDs to export. For large numbers of data IDs obtained by
            calls to `Registry.queryDataIds`, it will be much more efficient if
            these are expanded to include records (i.e.
            `DataCoordinate.hasRecords` returns `True`) prior to the call to
            `saveDataIds` via e.g. ``Registry.queryDataIds(...).expanded()``.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported. If `None`,
            records for all dimensions will be exported.
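
        Examples
        --------
        A minimal sketch; the dimensions and the ``instrument`` value are
        hypothetical, and the query results are expanded up front as
        recommended above::

            data_ids = butler.registry.queryDataIds(
                ["exposure", "detector"], instrument="MyCam"
            ).expanded()
            with butler.export(filename="export.yaml") as export:
                export.saveDataIds(data_ids, elements=["exposure", "detector"])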

        """
        standardized_elements: Set[DimensionElement]
        if elements is None:
            standardized_elements = frozenset(
                element
                for element in self._registry.dimensions.elements
                if element.hasTable() and element.viewOf is None
            )
        else:
            standardized_elements = set()
            for element in elements:
                if not isinstance(element, DimensionElement):
                    element = self._registry.dimensions[element]
                if element.hasTable() and element.viewOf is None:
                    standardized_elements.add(element)
        for dataId in dataIds:
            # This is potentially quite slow, because it's approximately
            # len(dataId.dimensions.elements) queries per data ID. But it's a
            # no-op if the data ID is already expanded, and DM-26692 will add
            # (or at least start to add / unblock) query functionality that
            # should let us speed this up internally as well.
            dataId = self._registry.expandDataId(dataId)
            for element_name in dataId.dimensions.elements:
                record = dataId.records[element_name]
                if record is not None and record.definition in standardized_elements:
                    self._records[record.definition].setdefault(record.dataId, record)

    def saveDatasets(
        self,
        refs: Iterable[DatasetRef],
        *,
        elements: Iterable[str | DimensionElement] | None = None,
        rewrite: Callable[[FileDataset], FileDataset] | None = None,
    ) -> None:
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `~CollectionType.RUN`
        collections, and dimension records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export. Their `DatasetRef.id`
            attributes must not be `None`. Duplicates are automatically
            ignored. Nested data IDs must have `DataCoordinate.hasRecords`
            return `True`. If any reference is to a component dataset, the
            parent will be exported instead.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`. This is typically used to rewrite the
            path generated by the datastore. If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with `~CollectionType.RUN`
        collections. Other collections will be included in the export in the
        future (once `Registry` provides a way to look up that information).
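
        Examples
        --------
        A minimal sketch; the dataset type name, collection name, and export
        directory are hypothetical, and the refs come from a registry query::

            refs = butler.registry.queryDatasets(
                "calexp", collections="MyCam/runs/example"
            )
            with butler.export(
                directory="exports", filename="export.yaml", transfer="copy"
            ) as export:
                export.saveDatasets(refs)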

        """
        data_ids = set()
        refs_to_export = {}
        for ref in sorted(refs):
            dataset_id = ref.id
            # The query interfaces often used to generate the refs passed here
            # don't necessarily remove duplicates, so do that here for
            # convenience.
            if dataset_id in self._dataset_ids or dataset_id in refs_to_export:
                continue
            # Also convert components to composites.
            if ref.isComponent():
                ref = ref.makeCompositeRef()
            data_ids.add(ref.dataId)
            refs_to_export[dataset_id] = ref
        # Do a vectorized datastore export, which might be a lot faster than
        # one-by-one.
        exports = self._datastore.export(
            refs_to_export.values(),
            directory=self._directory,
            transfer=self._transfer,
        )
        # Export associated data IDs.
        self.saveDataIds(data_ids, elements=elements)
        # Rewrite export filenames if desired, and then save them to the
        # data structure we'll write in `_finish`.
        # If a single exported FileDataset has multiple DatasetRefs, we save
        # it with each of them.
        for file_dataset in exports:
            if rewrite is not None:
                file_dataset = rewrite(file_dataset)
            for ref in file_dataset.refs:
                assert ref.run is not None
                self._datasets[ref.datasetType][ref.run].append(file_dataset)
        self._dataset_ids.update(refs_to_export.keys())

    def _finish(self) -> None:
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        for element in self._registry.dimensions.sorted(self._records.keys()):
            # To make the export deterministic, sort the DataCoordinate
            # instances.
            r = self._records[element]
            self._backend.saveDimensionData(element, *[r[dataId] for dataId in sorted(r.keys())])
        for datasetsByRun in self._datasets.values():
            for run in datasetsByRun:
                self._collections[run] = self._registry._get_collection_record(run)
        for collectionName in self._computeSortedCollections():
            doc = self._registry.getCollectionDocumentation(collectionName)
            self._backend.saveCollection(self._collections[collectionName], doc)
        # Sort the dataset types and runs before exporting to ensure a
        # reproducible order in the export file.
        for datasetType in sorted(self._datasets.keys()):
            for run in sorted(self._datasets[datasetType].keys()):
                # Sort the FileDataset objects as well.
                records = sorted(self._datasets[datasetType][run])
                self._backend.saveDatasets(datasetType, run, *records)
        # Export associations between datasets and collections. These need to
        # be sorted (at two levels; they're dicts) or created more
        # deterministically, too, which probably involves more data ID sorting.
        datasetAssociations = self._computeDatasetAssociations()
        for collection in sorted(datasetAssociations):
            self._backend.saveDatasetAssociations(
                collection, self._collections[collection].type, sorted(datasetAssociations[collection])
            )
        self._backend.finish()

    def _computeSortedCollections(self) -> list[str]:
        """Sort collections in a way that is both deterministic and safe
        for registering them in a new repo in the presence of nested chains.

        This method is intended for internal use by `RepoExportContext` only.

        Returns
        -------
        names : `list` [ `str` ]
            Ordered list of collection names.
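
        Notes
        -----
        As a hypothetical illustration: given a run ``run_1``, a chain
        ``chain_b`` whose only child is ``run_1``, and a chain ``chain_a``
        whose only child is ``chain_b``, the returned order is
        ``["run_1", "chain_b", "chain_a"]``, so each chain appears only after
        all of its children.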

        """
        # Split collections into CHAINED and everything else, and just
        # sort "everything else" lexicographically since there are no
        # dependencies.
        chains: dict[str, list[str]] = {}
        result: list[str] = []
        for record in self._collections.values():
            if record.type is CollectionType.CHAINED:
                assert isinstance(record, ChainedCollectionRecord)
                chains[record.name] = list(record.children)
            else:
                result.append(record.name)
        result.sort()
        # Sort all chains topologically, breaking ties lexicographically.
        # Append these to 'result' and remove them from 'chains' as we go.
        while chains:
            unblocked = {
                parent
                for parent, children in chains.items()
                if not any(child in chains for child in children)
            }
            if not unblocked:
                raise RuntimeError(
                    f"Apparent cycle in CHAINED collection dependencies involving {set(chains)}."
                )
            result.extend(sorted(unblocked))
            for name in unblocked:
                del chains[name]
        return result

    def _computeDatasetAssociations(self) -> dict[str, list[DatasetAssociation]]:
        """Return dataset-collection associations, grouped by collection.

        This queries for all associations between exported datasets and
        exported TAGGED or CALIBRATION collections and is intended to be run
        only by `_finish`, as this ensures all collections and all datasets
        have already been exported and hence the order in which they are
        exported does not matter.

        Returns
        -------
        associations : `dict` [ `str`, `list` [ `DatasetAssociation` ] ]
            Dictionary keyed by collection name, with each value a list of
            structs representing an association between that collection and a
            dataset.
        """
        results = defaultdict(list)
        for datasetType in self._datasets:
            # We query for _all_ datasets of each dataset type we export, in
            # the specific collections we are exporting. The worst-case
            # efficiency of this is _awful_ (i.e. big repo, exporting a tiny
            # subset). But we don't have any better options right now; we need
            # a way to query for a _lot_ of explicitly given dataset_ids, and
            # the only way to make that scale up is to either upload them to a
            # temporary table or recognize when they are already in one because
            # the user passed us a QueryResult object. That's blocked by (at
            # least) DM-26692.
            collectionTypes = {CollectionType.TAGGED}
            if datasetType.isCalibration():
                collectionTypes.add(CollectionType.CALIBRATION)
            associationIter = self._registry.queryDatasetAssociations(
                datasetType,
                collections=self._collections.keys(),
                collectionTypes=collectionTypes,
                flattenChains=False,
            )
            for association in associationIter:
                if association.ref.id in self._dataset_ids:
                    results[association.collection].append(association)
        return results