Coverage for python/lsst/pipe/base/executionButlerBuilder.py: 15%

88 statements  

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("buildExecutionButler",)

import io
import itertools
from collections import defaultdict
from typing import Callable, DefaultDict, Iterable, List, Mapping, Optional, Set, Tuple, Union

from lsst.daf.butler import Butler, ButlerURI, Config, DataCoordinate, DatasetRef, DatasetType
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.transfers import RepoExportContext
from lsst.utils.introspection import get_class_of

from .graph import QuantumGraph, QuantumNode
from .pipeline import PipelineDatasetTypes

DataSetTypeMap = Mapping[DatasetType, Set[DataCoordinate]]

def _accumulate(
    graph: QuantumGraph,
    dataset_types: PipelineDatasetTypes,
) -> Tuple[Set[DatasetRef], DataSetTypeMap]:
    # Accumulate the DatasetRefs that will be transferred to the execution
    # registry.

    # exports holds all the existing data that will be migrated to the
    # execution butler
    exports: Set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph
    inserts: DefaultDict[DatasetType, Set[DataCoordinate]] = defaultdict(set)

    # Add inserts for initOutputs (including initIntermediates); these are
    # defined fully by their DatasetType, because they have no dimensions, and
    # they are by definition not resolved. initInputs are part of Quantum and
    # that's the only place the graph stores the dataset IDs, so we process
    # them there even though each Quantum for a task has the same ones.
    for dataset_type in itertools.chain(dataset_types.initIntermediates, dataset_types.initOutputs):
        inserts[dataset_type].add(DataCoordinate.makeEmpty(dataset_type.dimensions.universe))

    n: QuantumNode
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)

            for type, refs in attr.items():
                # This if block is needed because initInputs has a different
                # signature for its items.
                if not isinstance(refs, list):
                    refs = [refs]
                # Iterate over all the references. If a reference has an id,
                # it already exists and should be exported; if not, it should
                # be inserted into the new registry.
                for ref in refs:
                    if ref.id is not None:
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component will
                            # be part of some other upstream dataset, so it
                            # should be safe to skip them here
                            continue
                        inserts[type].add(ref.dataId)
    return exports, inserts

def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because I ran into a situation where some
    # collections were not properly being discovered and exported. This
    # method may be able to be removed in the future if collection export
    # logic changes
    collections = set(collections)
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections

def _export(
    butler: Butler, collections: Optional[Iterable[str]], exports: Set[DatasetRef], inserts: DataSetTypeMap
) -> io.StringIO:
    # This exports the datasets that exist in the input butler using
    # daf butler objects; however, it reaches in deep and does not use the
    # public methods, so that it can export to a string buffer and skip
    # disk access.
    yamlBuffer = io.StringIO()
    # Yaml is hard coded, since the class controls both ends of the
    # export/import
    BackendClass = get_class_of(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)
    exporter.saveDatasets(exports)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, dataIds in inserts.items():
        exporter.saveDataIds(dataIds)

    # Use any explicitly supplied collections; if none were given, fall back
    # to the registry defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs; this follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer

def _setupNewButler(butler: Butler, outputLocation: ButlerURI, dirExists: bool) -> Butler:
    # Set up the new butler object at the specified location
    if dirExists:
        # Remove the existing registry database; if the code got this far and
        # it exists, clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["allow_put_of_predefined_dataset"] = True
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    # Remove any namespace that may be set in main registry.
    config.pop(("registry", "namespace"), None)

    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if config.get(("datastore", "root")) == BUTLER_ROOT_TAG:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # Requires that we use the dimension configuration from the original
    # butler and not use the defaults.
    config = Butler.makeRepo(
        root=outputLocation,
        config=config,
        dimensionConfig=butler.registry.dimensions.dimensionConfig,
        overwrite=True,
        forceConfigRoot=False,
    )

    # Return a newly created butler
    return Butler(config, writeable=True)

def _import(
    yamlBuffer: io.StringIO,
    newButler: Butler,
    inserts: DataSetTypeMap,
    run: str,
    butlerModifier: Optional[Callable[[Butler], Butler]],
) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets using "split" mode. "split" is safe
    # because the execution butler is assumed to be able to see all the file
    # locations that the main datastore can see. "split" supports some
    # absolute URIs in the datastore.
    newButler.import_(filename=yamlBuffer, format="yaml", reuseIds=True, transfer="split")

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register datasets to be produced and insert them into the registry
    for dsType, dataIds in inserts.items():
        newButler.registry.registerDatasetType(dsType)
        newButler.registry.insertDatasets(dsType, dataIds, run)

    return newButler

def buildExecutionButler(
    butler: Butler,
    graph: QuantumGraph,
    outputLocation: Union[str, ButlerURI],
    run: str,
    *,
    clobber: bool = False,
    butlerModifier: Optional[Callable[[Butler], Butler]] = None,
    collections: Optional[Iterable[str]] = None,
) -> Butler:
    r"""Export an input `QuantumGraph` into a new minimal
    `~lsst.daf.butler.Butler` which only contains datasets specified by the
    `QuantumGraph`. These datasets are both those that already exist in the
    input `~lsst.daf.butler.Butler`, and those that are expected to be
    produced during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The existing `~lsst.daf.butler.Butler` instance from which existing
        datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` that was used to create any `QuantumGraph`
        that will be converted with this function.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : `str` or `~lsst.daf.butler.ButlerURI`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `~lsst.daf.butler.ButlerURI` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in.
        If `None`, the default value in ``registry.defaults`` will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`,
        whatever is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied, this should be a callable that accepts a
        `~lsst.daf.butler.Butler` and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        desired modifications to the `~lsst.daf.butler.Butler`. It will be
        called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting datasets expected
        to be produced. Examples of what this callable could do include
        creating collections or runs.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied, the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and ``clobber`` is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a
        directory.
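
    Examples
    --------
    A minimal usage sketch. The repository path, graph file, output location,
    and run name below are placeholders, and the top-level imports and the
    `QuantumGraph.loadUri` call are assumed conveniences for loading a graph
    previously saved with `QuantumGraph.saveUri`::

        from lsst.daf.butler import Butler
        from lsst.pipe.base import QuantumGraph, buildExecutionButler

        # Existing butler repository used to build the graph (placeholder path)
        butler = Butler("/repo/main")

        # Load a previously saved QuantumGraph, reusing the butler's dimensions
        graph = QuantumGraph.loadUri("pipeline.qgraph", butler.registry.dimensions)

        # Export a minimal execution butler for this graph
        executionButler = buildExecutionButler(
            butler,
            graph,
            outputLocation="/scratch/execution_butler",
            run="u/someone/run",
            clobber=True,
        )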

283 """ 

284 # We know this must refer to a directory. 

285 outputLocation = ButlerURI(outputLocation, forceDirectory=True) 

286 

287 # Do this first to Fail Fast if the output exists 

288 if (dirExists := outputLocation.exists()) and not clobber: 

289 raise FileExistsError("Cannot create a butler at specified location, location exists") 

290 if not outputLocation.isdir(): 

291 raise NotADirectoryError("The specified output URI does not appear to correspond to a directory") 

292 

293 # Gather all DatasetTypes from the Python and check any that already exist 

294 # in the registry for consistency. This does not check that all dataset 

295 # types here exist, because they might want to register dataset types 

296 # later. It would be nice to also check that, but to that we would need to 

297 # be told whether they plan to register dataset types later (DM-30845). 

298 dataset_types = PipelineDatasetTypes.fromPipeline(graph.iterTaskGraph(), registry=butler.registry) 

299 

300 exports, inserts = _accumulate(graph, dataset_types) 

301 yamlBuffer = _export(butler, collections, exports, inserts) 

302 

303 newButler = _setupNewButler(butler, outputLocation, dirExists) 

304 

305 return _import(yamlBuffer, newButler, inserts, run, butlerModifier)