# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["RepoExportContext"]

from typing import (
    Callable,
    Dict,
    Iterable,
    List,
    Optional,
    Set,
    Union,
)
from collections import defaultdict

from ..core import (
    DataCoordinate,
    DatasetAssociation,
    DimensionElement,
    DimensionRecord,
    DatasetRef,
    DatasetType,
    Datastore,
    FileDataset,
)
from ..registry import CollectionType, Registry
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord
from ._interfaces import RepoExportBackend


class RepoExportContext:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `str`, optional
        Directory to pass to `Datastore.export`.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(self, registry: Registry, datastore: Datastore, backend: RepoExportBackend, *,
                 directory: Optional[str] = None, transfer: Optional[str] = None):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
        self._dataset_ids: Set[int] = set()
        self._datasets: Dict[DatasetType, Dict[str, List[FileDataset]]] \
            = defaultdict(lambda: defaultdict(list))
        self._collections: Dict[str, CollectionRecord] = {}

    def saveCollection(self, name: str) -> None:
        """Export the given collection.

        Parameters
        ----------
        name : `str`
            Name of the collection.

        Notes
        -----
        `~CollectionType.RUN` collections are also exported automatically when
        any dataset referencing them is exported.  They may also be explicitly
        exported via this method to export the collection with no datasets.
        Duplicate exports of collections are ignored.

        Exporting a `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
        collection will cause its associations with exported datasets to also
        be exported, but it does not export those datasets automatically.

        Exporting a `~CollectionType.CHAINED` collection does not automatically
        export its child collections; these must be explicitly exported or
        already be present in the repository they are being imported into.
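
        Examples
        --------
        A minimal illustrative sketch; the collection names here are
        hypothetical, not part of this API::

            with butler.export(filename="export.yaml") as export:
                # Export the chain itself and each child explicitly, since
                # CHAINED collections do not pull in their children.
                export.saveCollection("refcats")  # hypothetical CHAINED collection
                for child in butler.registry.getCollectionChain("refcats"):
                    export.saveCollection(child)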

111 """ 

112 self._collections[name] = self._registry._get_collection_record(name) 

113 

    def saveDimensionData(self, element: Union[str, DimensionElement],
                          records: Iterable[Union[dict, DimensionRecord]]) -> None:
        """Export the given dimension records associated with one or more
        data IDs.

        Parameters
        ----------
        element : `str` or `DimensionElement`
            `DimensionElement` or `str` indicating the logical table these
            records are from.
        records : `Iterable` [ `DimensionRecord` or `dict` ]
            Records to export, as an iterable containing `DimensionRecord` or
            `dict` instances.
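
        Examples
        --------
        A minimal illustrative sketch; the element name and record fields
        are assumptions that depend on the repository's dimension universe::

            export.saveDimensionData(
                "detector",
                [{"instrument": "MyCam", "id": 1, "full_name": "1_01"}],
            )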

127 """ 

128 if not isinstance(element, DimensionElement): 

129 element = self._registry.dimensions[element] 

130 for record in records: 

131 if not isinstance(record, DimensionRecord): 

132 record = element.RecordClass(**record) 

133 elif record.definition != element: 

134 raise ValueError( 

135 f"Mismatch between element={element.name} and " 

136 f"dimension record with definition={record.definition.name}." 

137 ) 

138 self._records[element].setdefault(record.dataId, record) 

139 

    def saveDataIds(self, dataIds: Iterable[DataCoordinate], *,
                    elements: Optional[Iterable[Union[str, DimensionElement]]] = None) -> None:
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `DataCoordinate`
            Data IDs to export.  For large numbers of data IDs obtained by
            calls to `Registry.queryDataIds`, it will be much more efficient
            if these are expanded to include records (i.e.
            `DataCoordinate.hasRecords` returns `True`) prior to the call to
            `saveDataIds` via e.g. ``Registry.queryDataIds(...).expanded()``.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported.  If `None`,
            records for all dimensions will be exported.
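
        Examples
        --------
        A minimal illustrative sketch; the dataset type and collection names
        are hypothetical::

            data_ids = butler.registry.queryDataIds(
                ["exposure", "detector"], datasets="raw", collections="raw/all"
            ).expanded()  # expand up front so saveDataIds need not re-query
            export.saveDataIds(data_ids)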

155 """ 

156 if elements is None: 

157 elements = frozenset(element for element in self._registry.dimensions.getStaticElements() 

158 if element.hasTable() and element.viewOf is None) 

159 else: 

160 elements = set() 

161 for element in elements: 

162 if not isinstance(element, DimensionElement): 

163 element = self._registry.dimensions[element] 

164 if element.hasTable() and element.viewOf is None: 

165 elements.add(element) 

166 for dataId in dataIds: 

167 # This is potentially quite slow, because it's approximately 

168 # len(dataId.graph.elements) queries per data ID. But it's a no-op 

169 # if the data ID is already expanded, and DM-26692 will add (or at 

170 # least start to add / unblock) query functionality that should 

171 # let us speed this up internally as well. 

172 dataId = self._registry.expandDataId(dataId) 

173 for record in dataId.records.values(): 

174 if record is not None and record.definition in elements: 

175 self._records[record.definition].setdefault(record.dataId, record) 

176 

    def saveDatasets(self, refs: Iterable[DatasetRef], *,
                     elements: Optional[Iterable[Union[str, DimensionElement]]] = None,
                     rewrite: Optional[Callable[[FileDataset], FileDataset]] = None) -> None:
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `~CollectionType.RUN`
        collections, and dimension records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export.  Their `DatasetRef.id`
            attributes must not be `None`.  Duplicates are automatically
            ignored.  Nested data IDs must have `DataCoordinate.hasRecords`
            return `True`.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`.  This is typically used to rewrite the
            path generated by the datastore.  If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with `~CollectionType.RUN`
        collections.  Other collections will be included in the export in the
        future (once `Registry` provides a way to look up that information).
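
        Examples
        --------
        A minimal illustrative sketch of the ``rewrite`` hook; the dataset
        type, collection, and path prefix are hypothetical::

            def strip_prefix(dataset: FileDataset) -> FileDataset:
                # Drop an assumed site-specific prefix from the
                # datastore-generated path before it is written out.
                dataset.path = dataset.path.replace("/data/site/", "", 1)
                return dataset

            refs = butler.registry.queryDatasets("calexp", collections="runs/ci")
            export.saveDatasets(refs, rewrite=strip_prefix)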

207 """ 

208 dataIds = set() 

209 for ref in sorted(refs): 

210 # The query interfaces that are often used to generate the refs 

211 # passed here often don't remove duplicates, so do that here for 

212 # convenience. 

213 if ref.id in self._dataset_ids: 

214 continue 

215 dataIds.add(ref.dataId) 

216 # `exports` is a single-element list here, because we anticipate 

217 # a future where more than just Datastore.export has a vectorized 

218 # API and we can pull this out of the loop. 

219 exports = self._datastore.export([ref], directory=self._directory, transfer=self._transfer) 

220 if rewrite is not None: 

221 exports = [rewrite(export) for export in exports] 

222 self._dataset_ids.add(ref.getCheckedId()) 

223 assert ref.run is not None 

224 self._datasets[ref.datasetType][ref.run].extend(exports) 

225 self.saveDataIds(dataIds, elements=elements) 

226 

    def _finish(self) -> None:
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        for element in self._registry.dimensions.sorted(self._records.keys()):
            # To make the export deterministic, sort the DataCoordinate
            # instances.
            r = self._records[element]
            self._backend.saveDimensionData(element, *[r[dataId] for dataId in sorted(r.keys())])
        for datasetsByRun in self._datasets.values():
            for run in datasetsByRun.keys():
                self._collections[run] = self._registry._get_collection_record(run)
        for collectionName in self._computeSortedCollections():
            doc = self._registry.getCollectionDocumentation(collectionName)
            self._backend.saveCollection(self._collections[collectionName], doc)
        # Sort the dataset types and runs before exporting to ensure
        # reproducible order in the export file.
        for datasetType in sorted(self._datasets.keys()):
            for run in sorted(self._datasets[datasetType].keys()):
                # Sort the FileDataset objects for the same reason.
                records = sorted(self._datasets[datasetType][run])
                self._backend.saveDatasets(datasetType, run, *records)
        # Export associations between datasets and collections.  These are
        # nested dicts, so they need to be sorted at both levels (or created
        # more deterministically, which probably involves more data ID
        # sorting) to keep the export reproducible.
        datasetAssociations = self._computeDatasetAssociations()
        for collection in sorted(datasetAssociations):
            self._backend.saveDatasetAssociations(collection, self._collections[collection].type,
                                                  sorted(datasetAssociations[collection]))
        self._backend.finish()

    def _computeSortedCollections(self) -> List[str]:
        """Sort collections in a way that is both deterministic and safe
        for registering them in a new repo in the presence of nested chains.

        This method is intended for internal use by `RepoExportContext` only.

        Returns
        -------
        names : `List` [ `str` ]
            Ordered list of collection names.
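
        Examples
        --------
        An illustrative ordering; the collection names are hypothetical.
        With a non-chained collection ``"c"`` and chains ``a -> [b, c]``,
        ``b -> [c]``, the method returns::

            ["c", "b", "a"]

        i.e. non-chained collections first (sorted lexicographically), then
        each chain only after all of its children.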

268 """ 

269 # Split collections into CHAINED and everything else, and just 

270 # sort "everything else" lexicographically since there are no 

271 # dependencies. 

272 chains: Dict[str, List[str]] = {} 

273 result: List[str] = [] 

274 for record in self._collections.values(): 

275 if record.type is CollectionType.CHAINED: 

276 assert isinstance(record, ChainedCollectionRecord) 

277 chains[record.name] = list(record.children) 

278 else: 

279 result.append(record.name) 

280 result.sort() 

281 # Sort all chains topologically, breaking ties lexicographically. 

282 # Append these to 'result' and remove them from 'chains' as we go. 

283 while chains: 

284 unblocked = { 

285 parent for parent, children in chains.items() 

286 if not any(child in chains.keys() for child in children) 

287 } 

288 if not unblocked: 

289 raise RuntimeError("Apparent cycle in CHAINED collection " 

290 f"dependencies involving {unblocked}.") 

291 result.extend(sorted(unblocked)) 

292 for name in unblocked: 

293 del chains[name] 

294 return result 

295 

    def _computeDatasetAssociations(self) -> Dict[str, List[DatasetAssociation]]:
        """Return dataset-collection associations, grouped by collection.

        This queries for all associations between exported datasets and
        exported TAGGED or CALIBRATION collections and is intended to be run
        only by `_finish`, as this ensures all collections and all datasets
        have already been exported and hence the order in which they are
        exported does not matter.

        Returns
        -------
        associations : `dict` [ `str`, `list` [ `DatasetAssociation` ] ]
            Dictionary keyed by collection name, with values that are lists
            of structs representing an association between that collection
            and a dataset.
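
        Examples
        --------
        An illustrative shape for the return value; the collection name is
        hypothetical::

            {"my-tagged-collection": [DatasetAssociation(...), ...]}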

310 """ 

311 results = defaultdict(list) 

312 for datasetType in self._datasets.keys(): 

313 # We query for _all_ datasets of each dataset type we export, in 

314 # the specific collections we are exporting. The worst-case 

315 # efficiency of this is _awful_ (i.e. big repo, exporting a tiny 

316 # subset). But we don't have any better options right now; we need 

317 # a way to query for a _lot_ of explicitly given dataset_ids, and 

318 # the only way to make that scale up is to either upload them to a 

319 # temporary table or recognize when they are already in one because 

320 # the user passed us a QueryResult object. That's blocked by (at 

321 # least) DM-26692. 

322 collectionTypes = {CollectionType.TAGGED} 

323 if datasetType.isCalibration(): 

324 collectionTypes.add(CollectionType.CALIBRATION) 

325 associationIter = self._registry.queryDatasetAssociations( 

326 datasetType, 

327 collections=self._collections.keys(), 

328 collectionTypes=collectionTypes, 

329 flattenChains=False, 

330 ) 

331 for association in associationIter: 

332 if association.ref.id in self._dataset_ids: 

333 results[association.collection].append(association) 

334 return results