Coverage for python/lsst/daf/butler/transfers/_context.py: 9%

110 statements  

coverage.py v6.5.0, created at 2022-10-28 09:59 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["RepoExportContext"]

from collections import defaultdict
from typing import AbstractSet, Callable, Dict, Iterable, List, Optional, Set, Union

from ..core import (
    DataCoordinate,
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DimensionElement,
    DimensionRecord,
    FileDataset,
)
from ..registry import CollectionType, Registry
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord
from ._interfaces import RepoExportBackend


class RepoExportContext:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `str`, optional
        Directory to pass to `Datastore.export`.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(
        self,
        registry: Registry,
        datastore: Datastore,
        backend: RepoExportBackend,
        *,
        directory: Optional[str] = None,
        transfer: Optional[str] = None,
    ):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
        self._dataset_ids: Set[DatasetId] = set()
        self._datasets: Dict[DatasetType, Dict[str, List[FileDataset]]] = defaultdict(
            lambda: defaultdict(list)
        )
        self._collections: Dict[str, CollectionRecord] = {}

    def saveCollection(self, name: str) -> None:
        """Export the given collection.

        Parameters
        ----------
        name : `str`
            Name of the collection.

        Notes
        -----
        `~CollectionType.RUN` collections are also exported automatically when
        any dataset referencing them is exported. They may also be explicitly
        exported via this method in order to export the collection with no
        datasets. Duplicate exports of collections are ignored.

        Exporting a `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
        collection will cause its associations with exported datasets to also
        be exported, but it does not export those datasets automatically.

        Exporting a `~CollectionType.CHAINED` collection does not
        automatically export its child collections; these must be explicitly
        exported or already present in the repository they are being imported
        into.
        """
        self._collections[name] = self._registry._get_collection_record(name)
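
    # Illustrative usage sketch (not part of the original module; the
    # collection name is hypothetical): a TAGGED collection exported via
    # `saveCollection` only carries its associations, so the tagged datasets
    # are typically exported alongside it, e.g.:
    #
    #     with butler.export(filename="export.yaml") as export:
    #         export.saveCollection("u/someone/tagged-selection")
    #         export.saveDatasets(
    #             butler.registry.queryDatasets(..., collections="u/someone/tagged-selection")
    #         )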

    def saveDimensionData(
        self, element: Union[str, DimensionElement], records: Iterable[Union[dict, DimensionRecord]]
    ) -> None:
        """Export the given dimension records associated with one or more
        data IDs.

        Parameters
        ----------
        element : `str` or `DimensionElement`
            `DimensionElement` or `str` indicating the logical table these
            records are from.
        records : `Iterable` [ `DimensionRecord` or `dict` ]
            Records to export, as an iterable containing `DimensionRecord` or
            `dict` instances.
        """
        if not isinstance(element, DimensionElement):
            element = self._registry.dimensions[element]
        for record in records:
            if not isinstance(record, DimensionRecord):
                record = element.RecordClass(**record)
            elif record.definition != element:
                raise ValueError(
                    f"Mismatch between element={element.name} and "
                    f"dimension record with definition={record.definition.name}."
                )
            self._records[element].setdefault(record.dataId, record)
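
    # Illustrative usage sketch (not part of the original module; the
    # instrument name is hypothetical): explicitly exporting all detector
    # records for one instrument, fetched with
    # `Registry.queryDimensionRecords`:
    #
    #     with butler.export(filename="export.yaml") as export:
    #         export.saveDimensionData(
    #             "detector",
    #             butler.registry.queryDimensionRecords("detector", instrument="MyCam"),
    #         )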

    def saveDataIds(
        self,
        dataIds: Iterable[DataCoordinate],
        *,
        elements: Optional[Iterable[Union[str, DimensionElement]]] = None,
    ) -> None:
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `DataCoordinate`
            Data IDs to export. For large numbers of data IDs obtained by
            calls to `Registry.queryDataIds`, it will be much more efficient
            if these are expanded to include records (i.e.
            `DataCoordinate.hasRecords` returns `True`) prior to the call to
            `saveDataIds` via e.g. ``Registry.queryDataIds(...).expanded()``.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported. If `None`,
            records for all dimensions will be exported.
        """
        standardized_elements: AbstractSet[DimensionElement]
        if elements is None:
            standardized_elements = frozenset(
                element
                for element in self._registry.dimensions.getStaticElements()
                if element.hasTable() and element.viewOf is None
            )
        else:
            standardized_elements = set()
            for element in elements:
                if not isinstance(element, DimensionElement):
                    element = self._registry.dimensions[element]
                if element.hasTable() and element.viewOf is None:
                    standardized_elements.add(element)
        for dataId in dataIds:
            # This is potentially quite slow, because it's approximately
            # len(dataId.graph.elements) queries per data ID. But it's a
            # no-op if the data ID is already expanded, and DM-26692 will add
            # (or at least start to add / unblock) query functionality that
            # should let us speed this up internally as well.
            dataId = self._registry.expandDataId(dataId)
            for record in dataId.records.values():
                if record is not None and record.definition in standardized_elements:
                    self._records[record.definition].setdefault(record.dataId, record)
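
    # Illustrative usage sketch (not part of the original module; dimensions
    # and the ``where`` expression are hypothetical): expanding data IDs
    # before export, as recommended in the docstring above:
    #
    #     data_ids = butler.registry.queryDataIds(
    #         ["exposure", "detector"], where="instrument = 'MyCam'"
    #     ).expanded()
    #     with butler.export(filename="export.yaml") as export:
    #         export.saveDataIds(data_ids, elements=["exposure", "detector"])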

    def saveDatasets(
        self,
        refs: Iterable[DatasetRef],
        *,
        elements: Optional[Iterable[Union[str, DimensionElement]]] = None,
        rewrite: Optional[Callable[[FileDataset], FileDataset]] = None,
    ) -> None:
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `~CollectionType.RUN`
        collections, and dimension records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export. Their `DatasetRef.id`
            attributes must not be `None`. Duplicates are automatically
            ignored. Nested data IDs must have `DataCoordinate.hasRecords`
            return `True`. If any reference is to a component dataset, the
            parent will be exported instead.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`. This is typically used to rewrite the
            path generated by the datastore. If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with `~CollectionType.RUN`
        collections. Other collections will be included in the export in the
        future (once `Registry` provides a way to look up that information).
        """
        data_ids = set()
        refs_to_export = {}
        for ref in sorted(refs):
            dataset_id = ref.getCheckedId()
            # The query interfaces often used to generate the refs passed
            # here don't remove duplicates, so do that here for convenience.
            if dataset_id in self._dataset_ids or dataset_id in refs_to_export:
                continue
            # Also convert components to composites.
            if ref.isComponent():
                ref = ref.makeCompositeRef()
            data_ids.add(ref.dataId)
            refs_to_export[dataset_id] = ref
        # Do a vectorized datastore export, which might be a lot faster than
        # one-by-one.
        exports = self._datastore.export(
            refs_to_export.values(),
            directory=self._directory,
            transfer=self._transfer,
        )
        # Export associated data IDs.
        self.saveDataIds(data_ids, elements=elements)
        # Rewrite export filenames if desired, and then save them to the
        # data structure we'll write in `_finish`.
        # If a single exported FileDataset has multiple DatasetRefs, we save
        # it with each of them.
        for file_dataset in exports:
            if rewrite is not None:
                file_dataset = rewrite(file_dataset)
            for ref in file_dataset.refs:
                assert ref.run is not None
                self._datasets[ref.datasetType][ref.run].append(file_dataset)
        self._dataset_ids.update(refs_to_export.keys())
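
    # Illustrative usage sketch (not part of the original module; the path
    # prefix and dataset type name are hypothetical): using the ``rewrite``
    # hook to strip a datastore-specific prefix from exported file paths:
    #
    #     def strip_prefix(file_dataset: FileDataset) -> FileDataset:
    #         file_dataset.path = file_dataset.path.removeprefix("/old/datastore/root/")
    #         return file_dataset
    #
    #     with butler.export(directory="exported_files", transfer="copy") as export:
    #         export.saveDatasets(
    #             butler.registry.queryDatasets("raw", collections=...),
    #             rewrite=strip_prefix,
    #         )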

    def _finish(self) -> None:
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        for element in self._registry.dimensions.sorted(self._records.keys()):
            # To make the export deterministic, sort the DataCoordinate
            # instances.
            r = self._records[element]
            self._backend.saveDimensionData(element, *[r[dataId] for dataId in sorted(r.keys())])
        for datasetsByRun in self._datasets.values():
            for run in datasetsByRun.keys():
                self._collections[run] = self._registry._get_collection_record(run)
        for collectionName in self._computeSortedCollections():
            doc = self._registry.getCollectionDocumentation(collectionName)
            self._backend.saveCollection(self._collections[collectionName], doc)
        # Sort the dataset types and runs before exporting to ensure a
        # reproducible order in the export file.
        for datasetType in sorted(self._datasets.keys()):
            for run in sorted(self._datasets[datasetType].keys()):
                # Sort the FileDataset objects for the same reason.
                records = sorted(self._datasets[datasetType][run])
                self._backend.saveDatasets(datasetType, run, *records)
        # Export associations between datasets and collections. These need to
        # be sorted (at two levels; they're dicts) or created more
        # deterministically, too, which probably involves more data ID
        # sorting.
        datasetAssociations = self._computeDatasetAssociations()
        for collection in sorted(datasetAssociations):
            self._backend.saveDatasetAssociations(
                collection, self._collections[collection].type, sorted(datasetAssociations[collection])
            )
        self._backend.finish()

    def _computeSortedCollections(self) -> List[str]:
        """Sort collections in a way that is both deterministic and safe
        for registering them in a new repo in the presence of nested chains.

        This method is intended for internal use by `RepoExportContext` only.

        Returns
        -------
        names : `List` [ `str` ]
            Ordered list of collection names.
        """
        # Split collections into CHAINED and everything else, and just
        # sort "everything else" lexicographically since there are no
        # dependencies.
        chains: Dict[str, List[str]] = {}
        result: List[str] = []
        for record in self._collections.values():
            if record.type is CollectionType.CHAINED:
                assert isinstance(record, ChainedCollectionRecord)
                chains[record.name] = list(record.children)
            else:
                result.append(record.name)
        result.sort()
        # Sort all chains topologically, breaking ties lexicographically.
        # Append these to 'result' and remove them from 'chains' as we go.
        while chains:
            unblocked = {
                parent
                for parent, children in chains.items()
                if not any(child in chains.keys() for child in children)
            }
            if not unblocked:
                raise RuntimeError(
                    f"Apparent cycle in CHAINED collection dependencies involving {set(chains.keys())}."
                )
            result.extend(sorted(unblocked))
            for name in unblocked:
                del chains[name]
        return result
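
    # Illustrative worked example (not part of the original module;
    # collection names are hypothetical): given non-chained collections
    # "A" and "B", plus CHAINED collections chain1 -> ["A", "chain2"] and
    # chain2 -> ["B"], the method returns
    #
    #     ["A", "B", "chain2", "chain1"]
    #
    # i.e. non-chained collections first in lexicographic order, then chains
    # ordered so that every child precedes its parent.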

    def _computeDatasetAssociations(self) -> Dict[str, List[DatasetAssociation]]:
        """Return dataset-collection associations, grouped by collection.

        This queries for all associations between exported datasets and
        exported TAGGED or CALIBRATION collections and is intended to be run
        only by `_finish`, as this ensures all collections and all datasets
        have already been exported and hence the order in which they are
        exported does not matter.

        Returns
        -------
        associations : `dict` [ `str`, `list` [ `DatasetAssociation` ] ]
            Dictionary keyed by collection name, with values that are lists
            of structs, each representing an association between that
            collection and a dataset.
        """
        results = defaultdict(list)
        for datasetType in self._datasets.keys():
            # We query for _all_ datasets of each dataset type we export, in
            # the specific collections we are exporting. The worst-case
            # efficiency of this is _awful_ (i.e. big repo, exporting a tiny
            # subset). But we don't have any better options right now; we
            # need a way to query for a _lot_ of explicitly given
            # dataset_ids, and the only way to make that scale up is to
            # either upload them to a temporary table or recognize when they
            # are already in one because the user passed us a QueryResult
            # object. That's blocked by (at least) DM-26692.
            collectionTypes = {CollectionType.TAGGED}
            if datasetType.isCalibration():
                collectionTypes.add(CollectionType.CALIBRATION)
            associationIter = self._registry.queryDatasetAssociations(
                datasetType,
                collections=self._collections.keys(),
                collectionTypes=collectionTypes,
                flattenChains=False,
            )
            for association in associationIter:
                if association.ref.id in self._dataset_ids:
                    results[association.collection].append(association)
        return results
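
    # Illustrative sketch (not part of the original module; names are
    # hypothetical): the returned mapping is keyed by collection name, e.g.
    #
    #     {
    #         "u/someone/tagged-selection": [
    #             DatasetAssociation(ref=..., collection="u/someone/tagged-selection", timespan=None),
    #             ...,
    #         ],
    #     }
    #
    # For CALIBRATION collections each association also carries the validity
    # timespan of the dataset in that collection.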