Coverage for python/lsst/daf/butler/transfers/_context.py: 9%

110 statements  

coverage.py v7.2.5, created at 2023-05-18 09:13 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["RepoExportContext"]

from collections import defaultdict
from typing import TYPE_CHECKING, AbstractSet, Callable, Dict, Iterable, List, Optional, Set, Union

from ..core import (
    DataCoordinate,
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DimensionElement,
    DimensionRecord,
    FileDataset,
)
from ..registry import CollectionType, Registry
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord
from ._interfaces import RepoExportBackend

if TYPE_CHECKING:
    from lsst.resources import ResourcePathExpression


class RepoExportContext:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

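
    A fuller session (purely illustrative; ``refcats`` is a hypothetical
    collection name) might also export dimension records and a named
    collection explicitly::

        with butler.export(filename="export.yaml", transfer="copy") as export:
            export.saveDataIds(butler.registry.queryDataIds(...).expanded())
            export.saveDatasets(butler.registry.queryDatasets(...))
            export.saveCollection("refcats")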

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `~lsst.resources.ResourcePathExpression`, optional
        Directory to pass to `Datastore.export`. Can be `None` to use
        the current working directory.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(
        self,
        registry: Registry,
        datastore: Datastore,
        backend: RepoExportBackend,
        *,
        directory: Optional[ResourcePathExpression] = None,
        transfer: Optional[str] = None,
    ):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
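        # Note: the save* methods below only accumulate state in the
        # containers that follow; nothing is written to the backend until
        # `Butler.export` calls `_finish` at the end of the export context.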

        self._records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
        self._dataset_ids: Set[DatasetId] = set()
        self._datasets: Dict[DatasetType, Dict[str, List[FileDataset]]] = defaultdict(
            lambda: defaultdict(list)
        )
        self._collections: Dict[str, CollectionRecord] = {}

    def saveCollection(self, name: str) -> None:
        """Export the given collection.

        Parameters
        ----------
        name : `str`
            Name of the collection.

        Notes
        -----
        `~CollectionType.RUN` collections are also exported automatically when
        any dataset referencing them is exported. They may also be explicitly
        exported via this method to export the collection with no datasets.
        Duplicate exports of collections are ignored.

        Exporting a `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
        collection will cause its associations with exported datasets to also
        be exported, but it does not export those datasets automatically.

        Exporting a `~CollectionType.CHAINED` collection does not automatically
        export its child collections; these must be explicitly exported or
        already be present in the repository they are being imported into.
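
        For example, a chain can be exported together with its children by
        saving each one explicitly (the collection names here are purely
        illustrative)::

            export.saveCollection("HSC/defaults")
            export.saveCollection("HSC/raw/all")
            export.saveCollection("HSC/calib")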

116 """ 

117 self._collections[name] = self._registry._get_collection_record(name) 

118 

119 def saveDimensionData( 

120 self, element: Union[str, DimensionElement], records: Iterable[Union[dict, DimensionRecord]] 

121 ) -> None: 

122 """Export the given dimension records associated with one or more data 

123 IDs. 

124 

125 Parameters 

126 ---------- 

127 element : `str` or `DimensionElement` 

128 `DimensionElement` or `str` indicating the logical table these 

129 records are from. 

130 records : `Iterable` [ `DimensionRecord` or `dict` ] 

131 Records to export, as an iterable containing `DimensionRecord` or 

132 `dict` instances. 
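
        For example (an illustrative sketch; the ``HSC`` instrument name is
        hypothetical)::

            records = butler.registry.queryDimensionRecords(
                "detector", instrument="HSC"
            )
            export.saveDimensionData("detector", records)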

133 """ 

134 if not isinstance(element, DimensionElement): 

135 element = self._registry.dimensions[element] 

136 for record in records: 

137 if not isinstance(record, DimensionRecord): 

138 record = element.RecordClass(**record) 

139 elif record.definition != element: 

140 raise ValueError( 

141 f"Mismatch between element={element.name} and " 

142 f"dimension record with definition={record.definition.name}." 

143 ) 

144 self._records[element].setdefault(record.dataId, record) 

145 

146 def saveDataIds( 

147 self, 

148 dataIds: Iterable[DataCoordinate], 

149 *, 

150 elements: Optional[Iterable[Union[str, DimensionElement]]] = None, 

151 ) -> None: 

152 """Export the dimension records associated with one or more data IDs. 

153 

154 Parameters 

155 ---------- 

156 dataIds : iterable of `DataCoordinate`. 

157 Data IDs to export. For large numbers of data IDs obtained by 

158 calls to `Registry.queryDataIds`, it will be much more efficient if 

159 these are expanded to include records (i.e. 

160 `DataCoordinate.hasRecords` returns `True`) prior to the call to 

161 `saveDataIds` via e.g. ``Registry.queryDataIds(...).expanded()``. 

162 elements : iterable of `DimensionElement` or `str`, optional 

163 Dimension elements whose records should be exported. If `None`, 

164 records for all dimensions will be exported. 
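
        For example (an illustrative sketch; the instrument name is
        hypothetical)::

            data_ids = butler.registry.queryDataIds(
                ["exposure", "detector"], instrument="HSC"
            ).expanded()
            export.saveDataIds(data_ids)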

165 """ 

166 standardized_elements: AbstractSet[DimensionElement] 

167 if elements is None: 

168 standardized_elements = frozenset( 

169 element 

170 for element in self._registry.dimensions.getStaticElements() 

171 if element.hasTable() and element.viewOf is None 

172 ) 

173 else: 

174 standardized_elements = set() 

175 for element in elements: 

176 if not isinstance(element, DimensionElement): 

177 element = self._registry.dimensions[element] 

178 if element.hasTable() and element.viewOf is None: 

179 standardized_elements.add(element) 

180 for dataId in dataIds: 

181 # This is potentially quite slow, because it's approximately 

182 # len(dataId.graph.elements) queries per data ID. But it's a no-op 

183 # if the data ID is already expanded, and DM-26692 will add (or at 

184 # least start to add / unblock) query functionality that should 

185 # let us speed this up internally as well. 

186 dataId = self._registry.expandDataId(dataId) 

187 for record in dataId.records.values(): 

188 if record is not None and record.definition in standardized_elements: 

189 self._records[record.definition].setdefault(record.dataId, record) 

190 

191 def saveDatasets( 

192 self, 

193 refs: Iterable[DatasetRef], 

194 *, 

195 elements: Optional[Iterable[Union[str, DimensionElement]]] = None, 

196 rewrite: Optional[Callable[[FileDataset], FileDataset]] = None, 

197 ) -> None: 

198 """Export one or more datasets. 

199 

200 This automatically exports any `DatasetType`, `~CollectionType.RUN` 

201 collections, and dimension records associated with the datasets. 

202 

203 Parameters 

204 ---------- 

205 refs : iterable of `DatasetRef` 

206 References to the datasets to export. Their `DatasetRef.id` 

207 attributes must not be `None`. Duplicates are automatically 

208 ignored. Nested data IDs must have `DataCoordinate.hasRecords` 

209 return `True`. If any reference is to a component dataset, the 

210 parent will be exported instead. 

211 elements : iterable of `DimensionElement` or `str`, optional 

212 Dimension elements whose records should be exported; this is 

213 forwarded to `saveDataIds` when exporting the data IDs of the 

214 given datasets. 

215 rewrite : callable, optional 

216 A callable that takes a single `FileDataset` argument and returns 

217 a modified `FileDataset`. This is typically used to rewrite the 

218 path generated by the datastore. If `None`, the `FileDataset` 

219 returned by `Datastore.export` will be used directly. 

220 

221 Notes 

222 ----- 

223 At present, this only associates datasets with `~CollectionType.RUN` 

224 collections. Other collections will be included in the export in the 

225 future (once `Registry` provides a way to look up that information). 
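
        For example, a ``rewrite`` callable can be used to flatten the
        datastore-generated paths (an illustrative sketch; the dataset type
        and collection names are hypothetical)::

            import os.path

            def relocate(dataset: FileDataset) -> FileDataset:
                dataset.path = os.path.basename(str(dataset.path))
                return dataset

            export.saveDatasets(
                butler.registry.queryDatasets("raw", collections="HSC/raw/all"),
                rewrite=relocate,
            )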

226 """ 

227 data_ids = set() 

228 refs_to_export = {} 

229 for ref in sorted(refs): 

230 dataset_id = ref.getCheckedId() 

231 # The query interfaces that are often used to generate the refs 

232 # passed here often don't remove duplicates, so do that here for 

233 # convenience. 

234 if dataset_id in self._dataset_ids or dataset_id in refs_to_export: 

235 continue 

236 # Also convert components to composites. 

237 if ref.isComponent(): 

238 ref = ref.makeCompositeRef() 

239 data_ids.add(ref.dataId) 

240 refs_to_export[dataset_id] = ref 

241 # Do a vectorized datastore export, which might be a lot faster than 

242 # one-by-one. 

243 exports = self._datastore.export( 

244 refs_to_export.values(), 

245 directory=self._directory, 

246 transfer=self._transfer, 

247 ) 

248 # Export associated data IDs. 

249 self.saveDataIds(data_ids, elements=elements) 

250 # Rewrite export filenames if desired, and then save them to the 

251 # data structure we'll write in `_finish`. 

252 # If a single exported FileDataset has multiple DatasetRefs, we save 

253 # it with each of them. 

254 for file_dataset in exports: 

255 if rewrite is not None: 

256 file_dataset = rewrite(file_dataset) 

257 for ref in file_dataset.refs: 

258 assert ref.run is not None 

259 self._datasets[ref.datasetType][ref.run].append(file_dataset) 

260 self._dataset_ids.update(refs_to_export.keys()) 

261 

262 def _finish(self) -> None: 

263 """Delegate to the backend to finish the export process. 

264 

265 For use by `Butler.export` only. 
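
        Dimension records are written first, then collections, then datasets
        (grouped by dataset type and run), and finally dataset-collection
        associations.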

266 """ 

267 for element in self._registry.dimensions.sorted(self._records.keys()): 

268 # To make export deterministic sort the DataCoordinate instances. 

269 r = self._records[element] 

270 self._backend.saveDimensionData(element, *[r[dataId] for dataId in sorted(r.keys())]) 

271 for datasetsByRun in self._datasets.values(): 

272 for run in datasetsByRun.keys(): 

273 self._collections[run] = self._registry._get_collection_record(run) 

274 for collectionName in self._computeSortedCollections(): 

275 doc = self._registry.getCollectionDocumentation(collectionName) 

276 self._backend.saveCollection(self._collections[collectionName], doc) 

277 # Sort the dataset types and runs before exporting to ensure 

278 # reproducible order in export file. 

279 for datasetType in sorted(self._datasets.keys()): 

280 for run in sorted(self._datasets[datasetType].keys()): 

281 # Sort the FileDataset 

282 records = sorted(self._datasets[datasetType][run]) 

283 self._backend.saveDatasets(datasetType, run, *records) 

284 # Export associations between datasets and collections. These need to 

285 # be sorted (at two levels; they're dicts) or created more 

286 # deterministically, too, which probably involves more data ID sorting. 

287 datasetAssociations = self._computeDatasetAssociations() 

288 for collection in sorted(datasetAssociations): 

289 self._backend.saveDatasetAssociations( 

290 collection, self._collections[collection].type, sorted(datasetAssociations[collection]) 

291 ) 

292 self._backend.finish() 

293 

    def _computeSortedCollections(self) -> List[str]:
        """Sort collections in a way that is both deterministic and safe
        for registering them in a new repo in the presence of nested chains.

        This method is intended for internal use by `RepoExportContext` only.

        Returns
        -------
        names : `List` [ `str` ]
            Ordered list of collection names.
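
        Notes
        -----
        Non-chained collections come first, sorted lexicographically, followed
        by `~CollectionType.CHAINED` collections in dependency order. For
        example (illustrative names), a run ``runA``, a chain ``chainB``
        containing ``runA``, and a chain ``chainC`` containing ``chainB``
        would be returned as ``["runA", "chainB", "chainC"]``.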

304 """ 

305 # Split collections into CHAINED and everything else, and just 

306 # sort "everything else" lexicographically since there are no 

307 # dependencies. 

308 chains: Dict[str, List[str]] = {} 

309 result: List[str] = [] 

310 for record in self._collections.values(): 

311 if record.type is CollectionType.CHAINED: 

312 assert isinstance(record, ChainedCollectionRecord) 

313 chains[record.name] = list(record.children) 

314 else: 

315 result.append(record.name) 

316 result.sort() 

317 # Sort all chains topologically, breaking ties lexicographically. 

318 # Append these to 'result' and remove them from 'chains' as we go. 

319 while chains: 

320 unblocked = { 

321 parent 

322 for parent, children in chains.items() 

323 if not any(child in chains.keys() for child in children) 

324 } 

325 if not unblocked: 

326 raise RuntimeError( 

327 f"Apparent cycle in CHAINED collection dependencies involving {unblocked}." 

328 ) 

329 result.extend(sorted(unblocked)) 

330 for name in unblocked: 

331 del chains[name] 

332 return result 

333 

    def _computeDatasetAssociations(self) -> Dict[str, List[DatasetAssociation]]:
        """Return dataset-collection associations, grouped by collection.

        This queries for all associations between exported datasets and
        exported TAGGED or CALIBRATION collections and is intended to be run
        only by `_finish`, as this ensures all collections and all datasets
        have already been exported and hence the order in which they are
        exported does not matter.

        Returns
        -------
        associations : `dict` [ `str`, `list` [ `DatasetAssociation` ] ]
            Dictionary keyed by collection name, with each value a list of
            structs representing an association between that collection and
            a dataset.
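
        Only associations whose dataset has itself been exported (i.e. whose
        ID is in the set of dataset IDs accumulated by `saveDatasets`) are
        included.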

348 """ 

349 results = defaultdict(list) 

350 for datasetType in self._datasets.keys(): 

351 # We query for _all_ datasets of each dataset type we export, in 

352 # the specific collections we are exporting. The worst-case 

353 # efficiency of this is _awful_ (i.e. big repo, exporting a tiny 

354 # subset). But we don't have any better options right now; we need 

355 # a way to query for a _lot_ of explicitly given dataset_ids, and 

356 # the only way to make that scale up is to either upload them to a 

357 # temporary table or recognize when they are already in one because 

358 # the user passed us a QueryResult object. That's blocked by (at 

359 # least) DM-26692. 

360 collectionTypes = {CollectionType.TAGGED} 

361 if datasetType.isCalibration(): 

362 collectionTypes.add(CollectionType.CALIBRATION) 

363 associationIter = self._registry.queryDatasetAssociations( 

364 datasetType, 

365 collections=self._collections.keys(), 

366 collectionTypes=collectionTypes, 

367 flattenChains=False, 

368 ) 

369 for association in associationIter: 

370 if association.ref.id in self._dataset_ids: 

371 results[association.collection].append(association) 

372 return results