Coverage for python/lsst/pipe/base/executionButlerBuilder.py: 14%

93 statements  

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("buildExecutionButler", )

import io

from collections import defaultdict
import itertools
from typing import Callable, DefaultDict, Mapping, Optional, Set, Tuple, Iterable, List, Union

from lsst.daf.butler import (DatasetRef, DatasetType, Butler, DataCoordinate, ButlerURI, Config)
from lsst.daf.butler.core.utils import getClassOf
from lsst.daf.butler.transfers import RepoExportContext
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG

from .graph import QuantumGraph, QuantumNode
from .pipeline import PipelineDatasetTypes

DataSetTypeMap = Mapping[DatasetType, Set[DataCoordinate]]


def _accumulate(
    graph: QuantumGraph,
    dataset_types: PipelineDatasetTypes,
) -> Tuple[Set[DatasetRef], DataSetTypeMap]:
    # Accumulate the DatasetRefs that will be transferred to the execution
    # registry.

    # exports holds all the existing data that will be migrated to the
    # execution butler.
    exports: Set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph.
    inserts: DefaultDict[DatasetType, Set[DataCoordinate]] = defaultdict(set)

    # Add inserts for initOutputs (including initIntermediates); these are
    # defined fully by their DatasetType, because they have no dimensions, and
    # they are by definition not resolved. initInputs are part of Quantum and
    # that's the only place the graph stores the dataset IDs, so we process
    # them there even though each Quantum for a task has the same ones.
    for dataset_type in itertools.chain(dataset_types.initIntermediates, dataset_types.initOutputs):
        inserts[dataset_type].add(DataCoordinate.makeEmpty(dataset_type.dimensions.universe))

    n: QuantumNode
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)

            for type, refs in attr.items():
                # initInputs maps each DatasetType to a single DatasetRef
                # rather than a list, so normalize to a list here.
                if not isinstance(refs, list):
                    refs = [refs]
                # Iterate over all the references. If a reference has an id
                # it already exists and should be exported; if not, it should
                # be inserted into the new registry.
                for ref in refs:
                    if ref.id is not None:
                        # If this is a component we want the composite to be
                        # exported.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component
                            # will be part of some other upstream dataset, so
                            # it should be safe to skip them here.
                            continue
                        inserts[type].add(ref.dataId)
    return exports, inserts
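# Schematic shape of the result, for orientation only (dataset type names and
# data IDs below are hypothetical, not taken from any real repository):
#   exports: {<resolved DatasetRef for e.g. "calexp", visit=1, detector=2>, ...}
#   inserts: {<DatasetType "src">: {<DataCoordinate visit=1, detector=2>, ...}, ...}
# Resolved refs that already exist in the input butler end up in ``exports``;
# the data IDs of datasets still to be produced are grouped by DatasetType in
# ``inserts``.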


def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because I ran into a situation where some
    # collections were not properly being discovered and exported. This
    # method may be able to be removed in the future if collection export
    # logic changes.
    collections = set(collections)
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections


def _export(butler: Butler, collections: Optional[Iterable[str]], exports: Set[DatasetRef],
            inserts: DataSetTypeMap) -> io.StringIO:
    # Export the datasets that exist in the input butler using daf_butler
    # objects. This reaches in deep and does not use the public methods so
    # that it can export to a string buffer and skip disk access.
    yamlBuffer = io.StringIO()
    # Yaml is hard coded, since the class controls both ends of the
    # export/import.
    BackendClass = getClassOf(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)

    # Need to ensure that the dimension records for inputs are transferred.
    # Butler.transfer_from() does not (yet) transfer records.
    dataIds = set(ref.dataId for ref in exports)
    exporter.saveDataIds(dataIds)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, dataIds in inserts.items():
        exporter.saveDataIds(dataIds)

    # Look for any defined collections; if none were supplied, fall back to
    # the registry defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs. This follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer


def _setupNewButler(butler: Butler, outputLocation: ButlerURI, dirExists: bool) -> Butler:
    # Set up the new butler object at the specified location.
    if dirExists:
        # Remove the existing registry database; if the code got this far and
        # this exists, clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["allow_put_of_predefined_dataset"] = True
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    # Remove any namespace that may be set in the main registry.
    config.pop(("registry", "namespace"), None)

    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if config.get(("datastore", "root")) == BUTLER_ROOT_TAG:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # Requires that we use the dimension configuration from the original
    # butler and not use the defaults.
    config = Butler.makeRepo(root=outputLocation, config=config,
                             dimensionConfig=butler.registry.dimensions.dimensionConfig,
                             overwrite=True, forceConfigRoot=False)

    # Return a newly created butler.
    return Butler(config, writeable=True)
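# For orientation, a hypothetical example of the resulting configuration (all
# paths below are illustrative only): with outputLocation
# "file:///scratch/exec/" and an input repo whose config lives at
# "file:///repo/main/", the new repo's config ends up with
#   root: "file:///scratch/exec/"
#   registry.db: "sqlite:///<butlerRoot>/gen3.sqlite3"
#   datastore.root: "file:///repo/main/"
#   datastore.trust_get_request: true
# so the execution registry is a local SQLite file while file reads still
# resolve against the original datastore (assuming the original datastore
# root used the BUTLER_ROOT_TAG default).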


def _import(yamlBuffer: io.StringIO,
            newButler: Butler,
            inserts: DataSetTypeMap,
            run: str,
            butlerModifier: Optional[Callable[[Butler], Butler]]
            ) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets using "split" mode. "split" is safe
    # because the execution butler is assumed to be able to see all the file
    # locations that the main datastore can see. "split" supports some
    # absolute URIs in the datastore.
    newButler.import_(filename=yamlBuffer, format="yaml", reuseIds=True, transfer="split")

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register datasets to be produced and insert them into the registry.
    for dsType, dataIds in inserts.items():
        newButler.registry.registerDatasetType(dsType)
        newButler.registry.insertDatasets(dsType, dataIds, run)

    return newButler


def buildExecutionButler(butler: Butler,
                         graph: QuantumGraph,
                         outputLocation: Union[str, ButlerURI],
                         run: str,
                         *,
                         clobber: bool = False,
                         butlerModifier: Optional[Callable[[Butler], Butler]] = None,
                         collections: Optional[Iterable[str]] = None
                         ) -> Butler:
    r"""Export an input `QuantumGraph` into a new minimal
    `~lsst.daf.butler.Butler` which only contains datasets specified by the
    `QuantumGraph`. These datasets are both those that already exist in the
    input `~lsst.daf.butler.Butler`, and those that are expected to be
    produced during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` which was used to create any `QuantumGraphs`
        that will be converted with this object.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : `str` or `~lsst.daf.butler.ButlerURI`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `~lsst.daf.butler.ButlerURI` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in registry.defaults will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`,
        whatever is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied this should be a callable that accepts a
        `~lsst.daf.butler.Butler`, and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        modifications to the `~lsst.daf.butler.Butler` desired. It
        will be called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting Datasets expected
        to be produced. Examples of what this callable could do include
        things such as creating collections or runs, etc.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and clobber is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a directory.
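
    Examples
    --------
    A minimal usage sketch. The repository path, graph file, output location,
    and run collection below are purely illustrative, and the graph is
    assumed to have been saved previously (reloaded here with
    `QuantumGraph.loadUri`)::

        from lsst.daf.butler import Butler
        from lsst.pipe.base.graph import QuantumGraph
        from lsst.pipe.base.executionButlerBuilder import buildExecutionButler

        butler = Butler("/repo/main")
        qgraph = QuantumGraph.loadUri("pipeline.qgraph", butler.registry.dimensions)
        execButler = buildExecutionButler(
            butler,
            qgraph,
            "/scratch/exec_butler",
            run="u/example/exec_run",
            clobber=True,
        )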

284 """ 

285 # We know this must refer to a directory. 

286 outputLocation = ButlerURI(outputLocation, forceDirectory=True) 

287 

288 # Do this first to Fail Fast if the output exists 

289 if (dirExists := outputLocation.exists()) and not clobber: 

290 raise FileExistsError("Cannot create a butler at specified location, location exists") 

291 if not outputLocation.isdir(): 

292 raise NotADirectoryError("The specified output URI does not appear to correspond to a directory") 

293 

294 # Gather all DatasetTypes from the Python and check any that already exist 

295 # in the registry for consistency. This does not check that all dataset 

296 # types here exist, because they might want to register dataset types 

297 # later. It would be nice to also check that, but to that we would need to 

298 # be told whether they plan to register dataset types later (DM-30845). 

299 dataset_types = PipelineDatasetTypes.fromPipeline(graph.iterTaskGraph(), registry=butler.registry) 

300 

301 exports, inserts = _accumulate(graph, dataset_types) 

302 yamlBuffer = _export(butler, collections, exports, inserts) 

303 

304 newButler = _setupNewButler(butler, outputLocation, dirExists) 

305 

306 newButler = _import(yamlBuffer, newButler, inserts, run, butlerModifier) 

307 

308 # Transfer the existing datasets directly from the source butler. 

309 newButler.transfer_from( 

310 butler, 

311 exports, 

312 transfer="auto", # No transfers should be happening. 

313 skip_missing=False, # Everything should exist. 

314 register_dataset_types=True, 

315 ) 

316 

317 return newButler