Coverage for python/lsst/pipe/base/executionButlerBuilder.py: 15%


94 statements  

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("buildExecutionButler",)

import io
import itertools
from collections import defaultdict
from typing import Callable, DefaultDict, Iterable, List, Mapping, Optional, Set, Tuple, Union

from lsst.daf.butler import Butler, Config, DataCoordinate, DatasetRef, DatasetType
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.transfers import RepoExportContext
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import get_class_of

from .graph import QuantumGraph, QuantumNode
from .pipeline import PipelineDatasetTypes

DataSetTypeMap = Mapping[DatasetType, Set[DataCoordinate]]


def _accumulate(
    graph: QuantumGraph,
    dataset_types: PipelineDatasetTypes,
) -> Tuple[Set[DatasetRef], DataSetTypeMap]:
    # Accumulate the DatasetRefs that will be transferred to the execution
    # registry.

    # exports holds all the existing data that will be migrated to the
    # execution butler.
    exports: Set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph.
    inserts: DefaultDict[DatasetType, Set[DataCoordinate]] = defaultdict(set)

    # Add inserts for initOutputs (including initIntermediates); these are
    # defined fully by their DatasetType, because they have no dimensions, and
    # they are by definition not resolved. initInputs are part of Quantum and
    # that's the only place the graph stores the dataset IDs, so we process
    # them there even though each Quantum for a task has the same ones.
    for dataset_type in itertools.chain(dataset_types.initIntermediates, dataset_types.initOutputs):
        inserts[dataset_type].add(DataCoordinate.makeEmpty(dataset_type.dimensions.universe))

    n: QuantumNode
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)

            for type, refs in attr.items():
                # This if block is needed because initInputs yields a single
                # DatasetRef per item rather than a list.
                if not isinstance(refs, list):
                    refs = [refs]
                # Iterate over all the references: a reference with an id
                # already exists and should be exported; one without an id
                # must be inserted into the new registry.
                for ref in refs:
                    if ref.id is not None:
                        # If this is a component we want the composite to be
                        # exported.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component
                            # will be part of some other upstream dataset, so
                            # it should be safe to skip them here.
                            continue
                        inserts[type].add(ref.dataId)
    return exports, inserts

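# Illustrative sketch of the split performed by _accumulate() above: a
# resolved reference (one with an id) is exported, optionally promoted to its
# composite, while an unresolved one only contributes its dataId to the
# insert map. The dataset types and data IDs shown here are hypothetical and
# only indicate the shape of the returned containers.
#
#     exports, inserts = _accumulate(graph, dataset_types)
#     # exports -> {DatasetRef(calexp, {instrument: 'X', visit: 1, ...}, id=...), ...}
#     # inserts -> {DatasetType('src', ...): {DataCoordinate({instrument: 'X', visit: 1, ...}), ...}}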

def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because a situation was encountered in which
    # some collections were not being properly discovered and exported. This
    # method may become removable in the future if the collection export
    # logic changes.
    collections = set(collections)
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections

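# Illustrative note on the loop above: queryCollections() with
# flattenChains=True expands CHAINED collections into their children (and
# includeChains=True keeps the chains themselves), so each pass can grow the
# set; the loop stops once a fixed point is reached. For example, with a
# hypothetical chained collection:
#
#     _discoverCollections(butler, ["HSC/defaults"])
#     # may return {"HSC/defaults", "HSC/raw/all", "HSC/calib", ...}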

def _export(
    butler: Butler, collections: Optional[Iterable[str]], exports: Set[DatasetRef], inserts: DataSetTypeMap
) -> io.StringIO:
    # This exports the datasets that exist in the input butler using
    # daf_butler objects; however, it reaches in deep and does not use the
    # public methods so that it can export to a string buffer and skip
    # disk access.
    yamlBuffer = io.StringIO()
    # YAML is hard-coded, since this class controls both ends of the
    # export/import.
    BackendClass = get_class_of(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)

    # Need to ensure that the dimension records for inputs are transferred.
    # Butler.transfer_from() does not (yet) transfer records.
    dataIds = set(ref.dataId for ref in exports)
    exporter.saveDataIds(dataIds)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, dataIds in inserts.items():
        exporter.saveDataIds(dataIds)

    # Look for any defined collections; if none were supplied, use the
    # registry defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs; this follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer

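# For reference, a rough public-API equivalent of _export() above would use
# the Butler.export() context manager, which writes to a file on disk rather
# than an in-memory buffer (hence the private approach used here). The file
# name and collection name below are hypothetical:
#
#     with butler.export(filename="export.yaml", format="yaml") as export:
#         export.saveDataIds(dataIds)
#         export.saveCollection("some/collection")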

def _setupNewButler(butler: Butler, outputLocation: ResourcePath, dirExists: bool) -> Butler:
    # Set up the new butler object at the specified location.
    if dirExists:
        # Remove the existing registry database; if the code got this far and
        # it exists, clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["allow_put_of_predefined_dataset"] = True
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    # Remove any namespace that may be set in the main registry.
    config.pop(("registry", "namespace"), None)

    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if config.get(("datastore", "root")) == BUTLER_ROOT_TAG:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # Requires that we use the dimension configuration from the original
    # butler and not the defaults.
    config = Butler.makeRepo(
        root=outputLocation,
        config=config,
        dimensionConfig=butler.registry.dimensions.dimensionConfig,
        overwrite=True,
        forceConfigRoot=False,
    )

    # Return a newly created butler.
    return Butler(config, writeable=True)


def _import(
    yamlBuffer: io.StringIO,
    newButler: Butler,
    inserts: DataSetTypeMap,
    run: str,
    butlerModifier: Optional[Callable[[Butler], Butler]],
) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets using "split" mode. "split" is safe
    # because the execution butler is assumed to be able to see all the file
    # locations that the main datastore can see. "split" supports some
    # absolute URIs in the datastore.
    newButler.import_(filename=yamlBuffer, format="yaml", reuseIds=True, transfer="split")

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register the dataset types to be produced and insert their expected
    # data IDs into the registry.
    for dsType, dataIds in inserts.items():
        newButler.registry.registerDatasetType(dsType)
        newButler.registry.insertDatasets(dsType, dataIds, run)

    return newButler


def buildExecutionButler(
    butler: Butler,
    graph: QuantumGraph,
    outputLocation: ResourcePathExpression,
    run: str,
    *,
    clobber: bool = False,
    butlerModifier: Optional[Callable[[Butler], Butler]] = None,
    collections: Optional[Iterable[str]] = None,
) -> Butler:
    r"""Export an input `QuantumGraph` into a new minimal
    `~lsst.daf.butler.Butler` which only contains datasets specified by the
    `QuantumGraph`. These datasets are both those that already exist in the
    input `~lsst.daf.butler.Butler`, and those that are expected to be
    produced during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` that was used to create any `QuantumGraphs`
        that will be converted with this function.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : convertible to `ResourcePath`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `ResourcePath` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in ``registry.defaults`` will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`,
        whatever is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied, this should be a callable that accepts a
        `~lsst.daf.butler.Butler` and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        modifications to the `~lsst.daf.butler.Butler` desired. It will be
        called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting datasets expected
        to be produced. Examples of what this callable could do include
        creating collections or runs.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied, the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and ``clobber`` is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a directory.
    """
    # We know this must refer to a directory.
    outputLocation = ResourcePath(outputLocation, forceDirectory=True)

    # Do this first to fail fast if the output exists.
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    # Gather all DatasetTypes from the Pipeline and check any that already
    # exist in the registry for consistency. This does not check that all
    # dataset types here exist, because the caller might want to register
    # dataset types later. It would be nice to also check that, but for that
    # we would need to be told whether they plan to register dataset types
    # later (DM-30845).
    dataset_types = PipelineDatasetTypes.fromPipeline(graph.iterTaskGraph(), registry=butler.registry)

    exports, inserts = _accumulate(graph, dataset_types)
    yamlBuffer = _export(butler, collections, exports, inserts)

    newButler = _setupNewButler(butler, outputLocation, dirExists)

    newButler = _import(yamlBuffer, newButler, inserts, run, butlerModifier)

    # Transfer the existing datasets directly from the source butler.
    newButler.transfer_from(
        butler,
        exports,
        transfer="auto",  # No transfers should be happening.
        skip_missing=False,  # Everything should exist.
        register_dataset_types=True,
    )

    return newButler
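
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of this module's API): build an
# execution butler from an existing repository and a previously saved
# QuantumGraph. The repository path, graph file, collection, and run name
# below are hypothetical.
#
#     from lsst.daf.butler import Butler
#     from lsst.pipe.base import QuantumGraph
#     from lsst.pipe.base.executionButlerBuilder import buildExecutionButler
#
#     butler = Butler("/repo/main", collections=["HSC/defaults"])
#     qgraph = QuantumGraph.loadUri("pipeline.qgraph", butler.registry.dimensions)
#     execButler = buildExecutionButler(
#         butler,
#         qgraph,
#         "execution_butler/",
#         run="u/someone/demo",
#         clobber=True,
#     )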