Coverage for python/lsst/daf/butler/transfers/_context.py: 10%

110 statements  

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["RepoExportContext"]

from typing import (
    Callable,
    Dict,
    Iterable,
    List,
    Optional,
    Set,
    Union,
)
from collections import defaultdict

from ..core import (
    DataCoordinate,
    DatasetAssociation,
    DimensionElement,
    DimensionRecord,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    FileDataset,
)
from ..registry import CollectionType, Registry
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord
from ._interfaces import RepoExportBackend


class RepoExportContext:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `str`, optional
        Directory to pass to `Datastore.export`.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(self, registry: Registry, datastore: Datastore, backend: RepoExportBackend, *,
                 directory: Optional[str] = None, transfer: Optional[str] = None):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
        self._dataset_ids: Set[DatasetId] = set()
        self._datasets: Dict[DatasetType, Dict[str, List[FileDataset]]] \
            = defaultdict(lambda: defaultdict(list))
        self._collections: Dict[str, CollectionRecord] = {}

    def saveCollection(self, name: str) -> None:
        """Export the given collection.

        Parameters
        ----------
        name : `str`
            Name of the collection.

        Notes
        -----
        `~CollectionType.RUN` collections are also exported automatically when
        any dataset referencing them is exported. They may also be explicitly
        exported via this method to export the collection with no datasets.
        Duplicate exports of collections are ignored.

        Exporting a `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
        collection will cause its associations with exported datasets to also
        be exported, but it does not export those datasets automatically.

        Exporting a `~CollectionType.CHAINED` collection does not automatically
        export its child collections; these must be explicitly exported or
        already be present in the repository they are being imported into.
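
        Examples
        --------
        A minimal usage sketch; ``butler`` and the collection name here are
        illustrative assumptions, not part of this module::

            with butler.export(filename="export.yaml") as export:
                # Export a TAGGED collection explicitly; its member datasets
                # still have to be exported via a separate `saveDatasets` call.
                export.saveCollection("my/tagged/collection")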

        """
        self._collections[name] = self._registry._get_collection_record(name)

    def saveDimensionData(self, element: Union[str, DimensionElement],
                          records: Iterable[Union[dict, DimensionRecord]]) -> None:
        """Export the given dimension records associated with one or more data
        IDs.

        Parameters
        ----------
        element : `str` or `DimensionElement`
            `DimensionElement` or `str` indicating the logical table these
            records are from.
        records : `Iterable` [ `DimensionRecord` or `dict` ]
            Records to export, as an iterable containing `DimensionRecord` or
            `dict` instances.
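
        Examples
        --------
        A minimal usage sketch; the element name, the data ID value, and the
        use of `Registry.queryDimensionRecords` here are illustrative
        assumptions::

            # Export every detector record for one (assumed) instrument.
            records = butler.registry.queryDimensionRecords(
                "detector", instrument="HSC"
            )
            export.saveDimensionData("detector", records)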

        """
        if not isinstance(element, DimensionElement):
            element = self._registry.dimensions[element]
        for record in records:
            if not isinstance(record, DimensionRecord):
                record = element.RecordClass(**record)
            elif record.definition != element:
                raise ValueError(
                    f"Mismatch between element={element.name} and "
                    f"dimension record with definition={record.definition.name}."
                )
            self._records[element].setdefault(record.dataId, record)

    def saveDataIds(self, dataIds: Iterable[DataCoordinate], *,
                    elements: Optional[Iterable[Union[str, DimensionElement]]] = None) -> None:
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `DataCoordinate`
            Data IDs to export. For large numbers of data IDs obtained by
            calls to `Registry.queryDataIds`, it will be much more efficient if
            these are expanded to include records (i.e.
            `DataCoordinate.hasRecords` returns `True`) prior to the call to
            `saveDataIds` via e.g. ``Registry.queryDataIds(...).expanded()``.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported. If `None`,
            records for all dimensions will be exported.
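
        Examples
        --------
        A minimal usage sketch; the dimensions, dataset type, and collection
        name are illustrative assumptions::

            # Expanding the query results up front avoids one round of
            # per-data-ID `expandDataId` calls below.
            data_ids = butler.registry.queryDataIds(
                ["exposure", "detector"], datasets="raw", collections="HSC/raw/all"
            ).expanded()
            export.saveDataIds(data_ids)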

        """
        if elements is None:
            standardized_elements = frozenset(
                element for element in self._registry.dimensions.getStaticElements()
                if element.hasTable() and element.viewOf is None
            )
        else:
            # Standardize the caller-provided elements into a set of
            # DimensionElement instances, skipping any that have no table of
            # their own.
            standardized_elements = set()
            for element in elements:
                if not isinstance(element, DimensionElement):
                    element = self._registry.dimensions[element]
                if element.hasTable() and element.viewOf is None:
                    standardized_elements.add(element)
        for dataId in dataIds:
            # This is potentially quite slow, because it's approximately
            # len(dataId.graph.elements) queries per data ID. But it's a no-op
            # if the data ID is already expanded, and DM-26692 will add (or at
            # least start to add / unblock) query functionality that should
            # let us speed this up internally as well.
            dataId = self._registry.expandDataId(dataId)
            for record in dataId.records.values():
                if record is not None and record.definition in standardized_elements:
                    self._records[record.definition].setdefault(record.dataId, record)

    def saveDatasets(self, refs: Iterable[DatasetRef], *,
                     elements: Optional[Iterable[Union[str, DimensionElement]]] = None,
                     rewrite: Optional[Callable[[FileDataset], FileDataset]] = None) -> None:
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `~CollectionType.RUN`
        collections, and dimension records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export. Their `DatasetRef.id`
            attributes must not be `None`. Duplicates are automatically
            ignored. Nested data IDs must have `DataCoordinate.hasRecords`
            return `True`. If any reference is to a component dataset, the
            parent will be exported instead.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`. This is typically used to rewrite the
            path generated by the datastore. If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with `~CollectionType.RUN`
        collections. Other collections will be included in the export in the
        future (once `Registry` provides a way to look up that information).
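
        Examples
        --------
        A minimal usage sketch; the dataset type and collection names are
        illustrative assumptions, and the ``rewrite`` callable assumes
        `FileDataset.path` may simply be reassigned::

            import os.path

            def flatten_path(file_dataset):
                # Keep only the filename in the exported path (illustrative).
                file_dataset.path = os.path.basename(file_dataset.path)
                return file_dataset

            refs = butler.registry.queryDatasets("calexp", collections="runs/example")
            export.saveDatasets(refs, rewrite=flatten_path)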

        """
        data_ids = set()
        refs_to_export = {}
        for ref in sorted(refs):
            dataset_id = ref.getCheckedId()
            # The query interfaces typically used to generate the refs passed
            # here often don't remove duplicates, so do that here for
            # convenience.
            if dataset_id in self._dataset_ids or dataset_id in refs_to_export:
                continue
            # Also convert components to composites.
            if ref.isComponent():
                ref = ref.makeCompositeRef()
            data_ids.add(ref.dataId)
            refs_to_export[dataset_id] = ref
        # Do a vectorized datastore export, which might be a lot faster than
        # one-by-one.
        exports = self._datastore.export(
            refs_to_export.values(),
            directory=self._directory,
            transfer=self._transfer,
        )
        # Export associated data IDs.
        self.saveDataIds(data_ids, elements=elements)
        # Rewrite export filenames if desired, and then save them to the
        # data structure we'll write in `_finish`.
        # If a single exported FileDataset has multiple DatasetRefs, we save
        # it with each of them.
        for file_dataset in exports:
            if rewrite is not None:
                file_dataset = rewrite(file_dataset)
            for ref in file_dataset.refs:
                assert ref.run is not None
                self._datasets[ref.datasetType][ref.run].append(file_dataset)
        self._dataset_ids.update(refs_to_export.keys())

    def _finish(self) -> None:
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        for element in self._registry.dimensions.sorted(self._records.keys()):
            # To make the export deterministic, sort the DataCoordinate
            # instances.
            r = self._records[element]
            self._backend.saveDimensionData(element, *[r[dataId] for dataId in sorted(r.keys())])
        for datasetsByRun in self._datasets.values():
            for run in datasetsByRun.keys():
                self._collections[run] = self._registry._get_collection_record(run)
        for collectionName in self._computeSortedCollections():
            doc = self._registry.getCollectionDocumentation(collectionName)
            self._backend.saveCollection(self._collections[collectionName], doc)
        # Sort the dataset types and runs before exporting to ensure a
        # reproducible order in the export file.
        for datasetType in sorted(self._datasets.keys()):
            for run in sorted(self._datasets[datasetType].keys()):
                # Sort the FileDatasets for deterministic output.
                records = sorted(self._datasets[datasetType][run])
                self._backend.saveDatasets(datasetType, run, *records)
        # Export associations between datasets and collections. These need to
        # be sorted (at two levels; they're dicts) or created more
        # deterministically, too, which probably involves more data ID sorting.
        datasetAssociations = self._computeDatasetAssociations()
        for collection in sorted(datasetAssociations):
            self._backend.saveDatasetAssociations(collection, self._collections[collection].type,
                                                  sorted(datasetAssociations[collection]))
        self._backend.finish()

    def _computeSortedCollections(self) -> List[str]:
        """Sort collections in a way that is both deterministic and safe
        for registering them in a new repo in the presence of nested chains.

        This method is intended for internal use by `RepoExportContext` only.

        Returns
        -------
        names : `List` [ `str` ]
            Ordered list of collection names.
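
        Notes
        -----
        As an illustrative sketch (collection names assumed): if the exported
        collections were a chain ``"all"`` with children ``("calib", "runs/a")``
        plus those two non-chained children themselves, the returned order
        would be::

            ["calib", "runs/a", "all"]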

        """
        # Split collections into CHAINED and everything else, and just
        # sort "everything else" lexicographically since there are no
        # dependencies.
        chains: Dict[str, List[str]] = {}
        result: List[str] = []
        for record in self._collections.values():
            if record.type is CollectionType.CHAINED:
                assert isinstance(record, ChainedCollectionRecord)
                chains[record.name] = list(record.children)
            else:
                result.append(record.name)
        result.sort()
        # Sort all chains topologically, breaking ties lexicographically.
        # Append these to 'result' and remove them from 'chains' as we go.
        while chains:
            unblocked = {
                parent for parent, children in chains.items()
                if not any(child in chains.keys() for child in children)
            }
            if not unblocked:
                raise RuntimeError("Apparent cycle in CHAINED collection "
                                   f"dependencies involving {set(chains.keys())}.")
            result.extend(sorted(unblocked))
            for name in unblocked:
                del chains[name]
        return result

    def _computeDatasetAssociations(self) -> Dict[str, List[DatasetAssociation]]:
        """Return dataset-collection associations, grouped by collection.

        This queries for all associations between exported datasets and
        exported TAGGED or CALIBRATION collections and is intended to be run
        only by `_finish`, as this ensures all collections and all datasets
        have already been exported and hence the order in which they are
        exported does not matter.

        Returns
        -------
        associations : `dict` [ `str`, `list` [ `DatasetAssociation` ] ]
            Dictionary keyed by collection name, with values that are lists of
            structs representing an association between that collection and a
            dataset.
        """
        results = defaultdict(list)
        for datasetType in self._datasets.keys():
            # We query for _all_ datasets of each dataset type we export, in
            # the specific collections we are exporting. The worst-case
            # efficiency of this is _awful_ (i.e. big repo, exporting a tiny
            # subset). But we don't have any better options right now; we need
            # a way to query for a _lot_ of explicitly given dataset_ids, and
            # the only way to make that scale up is to either upload them to a
            # temporary table or recognize when they are already in one because
            # the user passed us a QueryResult object. That's blocked by (at
            # least) DM-26692.
            collectionTypes = {CollectionType.TAGGED}
            if datasetType.isCalibration():
                collectionTypes.add(CollectionType.CALIBRATION)
            associationIter = self._registry.queryDatasetAssociations(
                datasetType,
                collections=self._collections.keys(),
                collectionTypes=collectionTypes,
                flattenChains=False,
            )
            for association in associationIter:
                if association.ref.id in self._dataset_ids:
                    results[association.collection].append(association)
        return results