Coverage for python/lsst/daf/butler/transfers/_context.py: 10% (111 statements)


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["RepoExportContext"]

from collections import defaultdict
from collections.abc import Callable, Iterable, Set
from typing import TYPE_CHECKING

from ..core import (
    DataCoordinate,
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DimensionElement,
    DimensionRecord,
    FileDataset,
)
from ..registry import CollectionType, Registry
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord
from ._interfaces import RepoExportBackend

if TYPE_CHECKING:
    from lsst.resources import ResourcePathExpression


class RepoExportContext:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `~lsst.resources.ResourcePathExpression`, optional
        Directory to pass to `Datastore.export`. Can be `None` to use
        the current working directory.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(
        self,
        registry: Registry,
        datastore: Datastore,
        backend: RepoExportBackend,
        *,
        directory: ResourcePathExpression | None = None,
        transfer: str | None = None,
    ):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
        self._dataset_ids: set[DatasetId] = set()
        self._datasets: dict[DatasetType, dict[str, list[FileDataset]]] = defaultdict(
            lambda: defaultdict(list)
        )
        self._collections: dict[str, CollectionRecord] = {}

    def saveCollection(self, name: str) -> None:
        """Export the given collection.

        Parameters
        ----------
        name : `str`
            Name of the collection.

        Notes
        -----
        `~CollectionType.RUN` collections are also exported automatically when
        any dataset referencing them is exported. They may also be exported
        explicitly via this method in order to export the collection with no
        datasets. Duplicate exports of collections are ignored.

        Exporting a `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
        collection will cause its associations with exported datasets to also
        be exported, but it does not export those datasets automatically.

        Exporting a `~CollectionType.CHAINED` collection does not
        automatically export its child collections; these must be explicitly
        exported or already be present in the repository they are being
        imported into.
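
        Examples
        --------
        A minimal sketch; the collection name is purely illustrative::

            with butler.export(filename="export.yaml") as export:
                export.saveCollection("refcats")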

117 """ 

118 self._collections[name] = self._registry._get_collection_record(name) 

119 

    def saveDimensionData(
        self, element: str | DimensionElement, records: Iterable[dict | DimensionRecord]
    ) -> None:
        """Export the given dimension records.

        Parameters
        ----------
        element : `str` or `DimensionElement`
            `DimensionElement` or `str` indicating the logical table these
            records are from.
        records : `~collections.abc.Iterable` [ `DimensionRecord` or `dict` ]
            Records to export, as an iterable containing `DimensionRecord` or
            `dict` instances.
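
        Examples
        --------
        Records may be passed as plain mappings; the element and field names
        below are illustrative and depend on the repository's dimension
        universe::

            export.saveDimensionData(
                "instrument",
                [{"name": "MyCam", "class_name": "lsst.obs.mycam.MyCam"}],
            )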

134 """ 

135 if not isinstance(element, DimensionElement): 

136 element = self._registry.dimensions[element] 

137 for record in records: 

138 if not isinstance(record, DimensionRecord): 

139 record = element.RecordClass(**record) 

140 elif record.definition != element: 

141 raise ValueError( 

142 f"Mismatch between element={element.name} and " 

143 f"dimension record with definition={record.definition.name}." 

144 ) 

145 self._records[element].setdefault(record.dataId, record) 

146 

    def saveDataIds(
        self,
        dataIds: Iterable[DataCoordinate],
        *,
        elements: Iterable[str | DimensionElement] | None = None,
    ) -> None:
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `DataCoordinate`
            Data IDs to export. For large numbers of data IDs obtained by
            calls to `Registry.queryDataIds`, it will be much more efficient if
            these are expanded to include records (i.e.
            `DataCoordinate.hasRecords` returns `True`) prior to the call to
            `saveDataIds` via e.g. ``Registry.queryDataIds(...).expanded()``.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported. If `None`,
            records for all dimensions will be exported.
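
        Examples
        --------
        A typical pattern; ``butler`` and the query arguments are
        illustrative::

            data_ids = butler.registry.queryDataIds(["exposure"]).expanded()
            export.saveDataIds(data_ids, elements=["exposure", "detector"])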

166 """ 

167 standardized_elements: Set[DimensionElement] 

168 if elements is None: 

169 standardized_elements = frozenset( 

170 element 

171 for element in self._registry.dimensions.getStaticElements() 

172 if element.hasTable() and element.viewOf is None 

173 ) 

174 else: 

175 standardized_elements = set() 

176 for element in elements: 

177 if not isinstance(element, DimensionElement): 

178 element = self._registry.dimensions[element] 

179 if element.hasTable() and element.viewOf is None: 

180 standardized_elements.add(element) 

181 for dataId in dataIds: 

182 # This is potentially quite slow, because it's approximately 

183 # len(dataId.graph.elements) queries per data ID. But it's a no-op 

184 # if the data ID is already expanded, and DM-26692 will add (or at 

185 # least start to add / unblock) query functionality that should 

186 # let us speed this up internally as well. 

187 dataId = self._registry.expandDataId(dataId) 

188 for record in dataId.records.values(): 

189 if record is not None and record.definition in standardized_elements: 

190 self._records[record.definition].setdefault(record.dataId, record) 

191 

    def saveDatasets(
        self,
        refs: Iterable[DatasetRef],
        *,
        elements: Iterable[str | DimensionElement] | None = None,
        rewrite: Callable[[FileDataset], FileDataset] | None = None,
    ) -> None:
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `~CollectionType.RUN`
        collections, and dimension records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export. Their `DatasetRef.id`
            attributes must not be `None`. Duplicates are automatically
            ignored. Nested data IDs must have `DataCoordinate.hasRecords`
            return `True`. If any reference is to a component dataset, the
            parent will be exported instead.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`. This is typically used to rewrite the
            path generated by the datastore. If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with `~CollectionType.RUN`
        collections. Other collections will be included in the export in the
        future (once `Registry` provides a way to look up that information).
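
        Examples
        --------
        A sketch of using ``rewrite`` to flatten exported file paths into a
        single directory; it assumes the paths are plain strings, and the
        dataset type and collection names are illustrative::

            import os.path

            def strip_dirs(dataset: FileDataset) -> FileDataset:
                # Keep only the filename component of the exported path.
                dataset.path = os.path.basename(dataset.path)
                return dataset

            export.saveDatasets(
                butler.registry.queryDatasets("raw", collections="raw/all"),
                rewrite=strip_dirs,
            )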

227 """ 

228 data_ids = set() 

229 refs_to_export = {} 

230 for ref in sorted(refs): 

231 dataset_id = ref.id 

232 # The query interfaces that are often used to generate the refs 

233 # passed here often don't remove duplicates, so do that here for 

234 # convenience. 

235 if dataset_id in self._dataset_ids or dataset_id in refs_to_export: 

236 continue 

237 # Also convert components to composites. 

238 if ref.isComponent(): 

239 ref = ref.makeCompositeRef() 

240 data_ids.add(ref.dataId) 

241 refs_to_export[dataset_id] = ref 

242 # Do a vectorized datastore export, which might be a lot faster than 

243 # one-by-one. 

244 exports = self._datastore.export( 

245 refs_to_export.values(), 

246 directory=self._directory, 

247 transfer=self._transfer, 

248 ) 

249 # Export associated data IDs. 

250 self.saveDataIds(data_ids, elements=elements) 

251 # Rewrite export filenames if desired, and then save them to the 

252 # data structure we'll write in `_finish`. 

253 # If a single exported FileDataset has multiple DatasetRefs, we save 

254 # it with each of them. 

255 for file_dataset in exports: 

256 if rewrite is not None: 

257 file_dataset = rewrite(file_dataset) 

258 for ref in file_dataset.refs: 

259 assert ref.run is not None 

260 self._datasets[ref.datasetType][ref.run].append(file_dataset) 

261 self._dataset_ids.update(refs_to_export.keys()) 

262 

    def _finish(self) -> None:
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        for element in self._registry.dimensions.sorted(self._records.keys()):
            # To make the export deterministic, sort the DataCoordinate
            # instances.
            r = self._records[element]
            self._backend.saveDimensionData(element, *[r[dataId] for dataId in sorted(r.keys())])
        for datasetsByRun in self._datasets.values():
            for run in datasetsByRun.keys():
                self._collections[run] = self._registry._get_collection_record(run)
        for collectionName in self._computeSortedCollections():
            doc = self._registry.getCollectionDocumentation(collectionName)
            self._backend.saveCollection(self._collections[collectionName], doc)
        # Sort the dataset types and runs before exporting to ensure a
        # reproducible order in the export file.
        for datasetType in sorted(self._datasets.keys()):
            for run in sorted(self._datasets[datasetType].keys()):
                # Sort the FileDataset objects for deterministic output.
                records = sorted(self._datasets[datasetType][run])
                self._backend.saveDatasets(datasetType, run, *records)
        # Export associations between datasets and collections. These need to
        # be sorted (at two levels; they're dicts) or created more
        # deterministically, too, which probably involves more data ID
        # sorting.
        datasetAssociations = self._computeDatasetAssociations()
        for collection in sorted(datasetAssociations):
            self._backend.saveDatasetAssociations(
                collection, self._collections[collection].type, sorted(datasetAssociations[collection])
            )
        self._backend.finish()

    def _computeSortedCollections(self) -> list[str]:
        """Sort collections in a way that is both deterministic and safe
        for registering them in a new repo in the presence of nested chains.

        This method is intended for internal use by `RepoExportContext` only.

        Returns
        -------
        names : `list` [ `str` ]
            Ordered list of collection names.
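
        Examples
        --------
        Purely illustrative: if ``self._collections`` holds a chain ``"a"``
        with child ``"b"``, a chain ``"b"`` with child ``"c"``, and a
        non-chained collection ``"c"``, the result is ``["c", "b", "a"]``;
        children always precede their parents.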

305 """ 

306 # Split collections into CHAINED and everything else, and just 

307 # sort "everything else" lexicographically since there are no 

308 # dependencies. 

309 chains: dict[str, list[str]] = {} 

310 result: list[str] = [] 

311 for record in self._collections.values(): 

312 if record.type is CollectionType.CHAINED: 

313 assert isinstance(record, ChainedCollectionRecord) 

314 chains[record.name] = list(record.children) 

315 else: 

316 result.append(record.name) 

317 result.sort() 

318 # Sort all chains topologically, breaking ties lexicographically. 

319 # Append these to 'result' and remove them from 'chains' as we go. 

320 while chains: 

321 unblocked = { 

322 parent 

323 for parent, children in chains.items() 

324 if not any(child in chains.keys() for child in children) 

325 } 

326 if not unblocked: 

327 raise RuntimeError( 

328 f"Apparent cycle in CHAINED collection dependencies involving {unblocked}." 

329 ) 

330 result.extend(sorted(unblocked)) 

331 for name in unblocked: 

332 del chains[name] 

333 return result 

334 

    def _computeDatasetAssociations(self) -> dict[str, list[DatasetAssociation]]:
        """Return dataset-collection associations, grouped by collection.

        This queries for all associations between exported datasets and
        exported `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
        collections. It is intended to be run only by `_finish`, which
        ensures that all collections and all datasets have already been
        exported and hence that the order in which they are exported does
        not matter.

        Returns
        -------
        associations : `dict` [ `str`, `list` [ `DatasetAssociation` ] ]
            Dictionary keyed by collection name, with values that are lists
            of structs representing an association between that collection
            and a dataset.
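
        Examples
        --------
        Illustrative shape of the result; names and fields are abbreviated::

            {
                "my/tagged": [
                    DatasetAssociation(ref=..., collection="my/tagged", timespan=None),
                ],
            }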

349 """ 

350 results = defaultdict(list) 

351 for datasetType in self._datasets.keys(): 

352 # We query for _all_ datasets of each dataset type we export, in 

353 # the specific collections we are exporting. The worst-case 

354 # efficiency of this is _awful_ (i.e. big repo, exporting a tiny 

355 # subset). But we don't have any better options right now; we need 

356 # a way to query for a _lot_ of explicitly given dataset_ids, and 

357 # the only way to make that scale up is to either upload them to a 

358 # temporary table or recognize when they are already in one because 

359 # the user passed us a QueryResult object. That's blocked by (at 

360 # least) DM-26692. 

361 collectionTypes = {CollectionType.TAGGED} 

362 if datasetType.isCalibration(): 

363 collectionTypes.add(CollectionType.CALIBRATION) 

364 associationIter = self._registry.queryDatasetAssociations( 

365 datasetType, 

366 collections=self._collections.keys(), 

367 collectionTypes=collectionTypes, 

368 flattenChains=False, 

369 ) 

370 for association in associationIter: 

371 if association.ref.id in self._dataset_ids: 

372 results[association.collection].append(association) 

373 return results