# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("buildExecutionButler", )

import io

from collections import defaultdict
import itertools
from typing import Callable, DefaultDict, Mapping, Optional, Set, Tuple, Iterable, List, Union

from lsst.daf.butler import (DatasetRef, DatasetType, Butler, DataCoordinate, ButlerURI, Config)
from lsst.daf.butler.core.utils import getClassOf
from lsst.daf.butler.transfers import RepoExportContext
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG

from .graph import QuantumGraph, QuantumNode
from .pipeline import PipelineDatasetTypes

DataSetTypeMap = Mapping[DatasetType, Set[DataCoordinate]]


def _accumulate(
    graph: QuantumGraph,
    dataset_types: PipelineDatasetTypes,
) -> Tuple[Set[DatasetRef], DataSetTypeMap]:
    # Accumulate the DatasetRefs that will be transferred to the execution
    # registry.

    # exports holds all the existing data that will be migrated to the
    # execution butler.
    exports: Set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph.
    inserts: DefaultDict[DatasetType, Set[DataCoordinate]] = defaultdict(set)

    # Add inserts for initOutputs (including initIntermediates); these are
    # defined fully by their DatasetType, because they have no dimensions, and
    # they are by definition not resolved. initInputs are part of Quantum and
    # that's the only place the graph stores the dataset IDs, so we process
    # them there even though each Quantum for a task has the same ones.
    for dataset_type in itertools.chain(dataset_types.initIntermediates, dataset_types.initOutputs):
        inserts[dataset_type].add(DataCoordinate.makeEmpty(dataset_type.dimensions.universe))

    n: QuantumNode
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)

            for type, refs in attr.items():
                # This if block is needed because initInputs has a different
                # signature for its items.
                if not isinstance(refs, list):
                    refs = [refs]
                # Iterate over all the references. A reference with an id
                # already exists and should be exported; one without an id
                # should be inserted into the new registry.
                for ref in refs:
                    if ref.id is not None:
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component will
                            # be part of some other upstream dataset, so it
                            # should be safe to skip them here.
                            continue
                        inserts[type].add(ref.dataId)
    return exports, inserts
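
# Purely illustrative (hypothetical dataset type names): for a graph whose quanta
# carry a resolved input "calexp" and an unresolved output "deepCoadd", the
# resolved "calexp" references end up in ``exports``, while ``inserts`` maps the
# "deepCoadd" DatasetType to the set of data IDs it will be produced with.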


def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because some collections were not being
    # discovered and exported correctly; it may become unnecessary if the
    # collection export logic changes.
    collections = set(collections)
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections
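
# Illustrative example (collection names are hypothetical): starting from
# {"u/someone/run"}, where "u/someone/run" is a CHAINED collection pointing at
# "u/someone/run/inputs" and "refcats", the loop above keeps re-querying until
# queryCollections reports nothing new, so all three names are returned for
# export.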


def _export(butler: Butler, collections: Optional[Iterable[str]], exports: Set[DatasetRef],
            inserts: DataSetTypeMap) -> io.StringIO:
    # This exports the datasets that exist in the input butler using
    # daf_butler objects; however, it reaches in deep and does not use the
    # public methods, so that it can export to a string buffer and skip
    # disk access.
    yamlBuffer = io.StringIO()
    # YAML is hard coded, since this code controls both ends of the
    # export/import.
    BackendClass = getClassOf(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)
    exporter.saveDatasets(exports)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, dataIds in inserts.items():
        exporter.saveDataIds(dataIds)

    # Use any explicitly supplied collections; otherwise fall back to the
    # registry defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs, following all
    # chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer
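
# The buffer returned by _export holds the same kind of YAML export document
# that the public Butler.export interface would normally write to a file
# (dimension records, dataset entries, and collection definitions); it is kept
# in memory here only to avoid a round trip through a temporary file.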


def _setupNewButler(butler: Butler, outputLocation: ButlerURI, dirExists: bool) -> Butler:
    # Set up the new butler object at the specified location.
    if dirExists:
        # Remove the existing registry database; if the code got this far and
        # it exists, clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["allow_put_of_predefined_dataset"] = True
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    # Remove any namespace that may be set in the main registry.
    config.pop(("registry", "namespace"), None)

    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if config.get(("datastore", "root")) == BUTLER_ROOT_TAG:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # We must use the dimension configuration from the original butler rather
    # than the defaults.
    config = Butler.makeRepo(root=outputLocation, config=config,
                             dimensionConfig=butler.registry.dimensions.dimensionConfig,
                             overwrite=True, forceConfigRoot=False)

    # Return a newly created butler.
    return Butler(config, writeable=True)
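
# Schematically, the butler configuration written above for the execution repo
# ends up containing (values abbreviated; the datastore root still points back
# at the original repository so existing files remain visible):
#
#   allow_put_of_predefined_dataset: true
#   registry:
#     db: sqlite:///<butlerRoot>/gen3.sqlite3
#   datastore:
#     root: <original repo config directory URI>
#     trust_get_request: true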


def _import(yamlBuffer: io.StringIO,
            newButler: Butler,
            inserts: DataSetTypeMap,
            run: str,
            butlerModifier: Optional[Callable[[Butler], Butler]]
            ) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets.
    newButler.import_(filename=yamlBuffer, format="yaml", reuseIds=True, transfer="auto")

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register datasets to be produced and insert them into the registry.
    for dsType, dataIds in inserts.items():
        newButler.registry.registerDatasetType(dsType)
        newButler.registry.insertDatasets(dsType, dataIds, run)

    return newButler
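
# Note: pre-registering the to-be-produced datasets here, combined with the
# ``allow_put_of_predefined_dataset`` option set in _setupNewButler, is what
# allows tasks to ``put`` their outputs against these already-defined registry
# entries while the graph is executed.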


def buildExecutionButler(butler: Butler,
                         graph: QuantumGraph,
                         outputLocation: Union[str, ButlerURI],
                         run: str,
                         *,
                         clobber: bool = False,
                         butlerModifier: Optional[Callable[[Butler], Butler]] = None,
                         collections: Optional[Iterable[str]] = None
                         ) -> Butler:
    r"""Export an input `QuantumGraph` into a new minimal
    `~lsst.daf.butler.Butler` which only contains datasets specified by the
    `QuantumGraph`.

    These datasets are both those that already exist in the input
    `~lsst.daf.butler.Butler`, and those that are expected to be produced
    during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` which was used to create any `QuantumGraphs`
        that will be converted with this function.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : `str` or `~lsst.daf.butler.ButlerURI`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `~lsst.daf.butler.ButlerURI` instance.
    run : `str`
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in ``registry.defaults`` will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`,
        whatever is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied this should be a callable that accepts a
        `~lsst.daf.butler.Butler`, and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        modifications to the `~lsst.daf.butler.Butler` desired. It
        will be called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting datasets expected
        to be produced. Examples of what this callable could do include
        things such as creating collections or runs.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied, the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and ``clobber`` is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a directory.
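
    Examples
    --------
    A minimal usage sketch; the repository path, the run collection name, and
    the assumption that ``qgraph`` is a previously constructed `QuantumGraph`
    are illustrative only::

        from lsst.daf.butler import Butler

        butler = Butler("/path/to/input/repo")
        executionButler = buildExecutionButler(
            butler,
            qgraph,
            "/path/to/execution/repo",
            run="u/someone/example-run",
            clobber=True,
        )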

    """
    # We know this must refer to a directory.
    outputLocation = ButlerURI(outputLocation, forceDirectory=True)

    # Do this first to fail fast if the output already exists.
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    # Gather all DatasetTypes from the pipeline and check any that already
    # exist in the registry for consistency. This does not check that all
    # dataset types here exist, because the caller might want to register
    # dataset types later. It would be nice to also check that, but for that
    # we would need to be told whether they plan to register dataset types
    # later (DM-30845).
    dataset_types = PipelineDatasetTypes.fromPipeline(graph.iterTaskGraph(), registry=butler.registry)

    exports, inserts = _accumulate(graph, dataset_types)
    yamlBuffer = _export(butler, collections, exports, inserts)

    newButler = _setupNewButler(butler, outputLocation, dirExists)

    return _import(yamlBuffer, newButler, inserts, run, butlerModifier)

293 newButler = _setupNewButler(butler, outputLocation, dirExists) 

294 

295 return _import(yamlBuffer, newButler, inserts, run, butlerModifier)