# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("buildExecutionButler", )

import io

from collections import defaultdict
from typing import Callable, DefaultDict, Mapping, Optional, Set, Tuple, Iterable, List, Union

from lsst.daf.butler import (DatasetRef, DatasetType, Butler, DataCoordinate, ButlerURI, Config)
from lsst.daf.butler.core.utils import getClassOf
from lsst.daf.butler.transfers import RepoExportContext
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG


from . import QuantumGraph, QuantumNode

DataSetTypeMap = Mapping[DatasetType, Set[DataCoordinate]]

def _accumulate(graph: QuantumGraph) -> Tuple[Set[DatasetRef], DataSetTypeMap]:
    # Accumulate the dataIds that will be transferred to the execution
    # registry.

    # exports holds all the existing data that will be migrated to the
    # execution butler.
    exports: Set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph.
    inserts: DefaultDict[DatasetType, Set[DataCoordinate]] = defaultdict(set)

    n: QuantumNode
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)

            for type, refs in attr.items():
                # This if block is needed because initInputs has a different
                # signature for its items: a single ref rather than a list.
                if not isinstance(refs, list):
                    refs = [refs]
                # Iterate over all the references. If a ref has an id, it
                # already exists and should be exported; if not, it should
                # be inserted into the new registry.
                for ref in refs:
                    if ref.isComponent():
                        # We can't insert a component, and a component will
                        # be part of some other upstream dataset, so it
                        # should be safe to skip them here.
                        continue

                    if ref.id is not None:
                        exports.add(ref)
                    else:
                        inserts[type].add(ref.dataId)
    return exports, inserts

def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because I ran into a situation where some
    # collections were not properly being discovered and exported. This
    # method may be able to be removed in the future if collection export
    # logic changes.
    collections = set(collections)
    while True:
        discoveredCollections = set(butler.registry.queryCollections(collections, flattenChains=True,
                                                                      includeChains=True))
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections

def _export(butler: Butler, collections: Optional[Iterable[str]], exports: Set[DatasetRef],
            inserts: DataSetTypeMap) -> io.StringIO:
    # This exports the datasets that exist in the input butler using
    # daf_butler objects; however, it reaches in deep and does not use the
    # public methods, so that it can export to a string buffer and skip
    # disk access.
    yamlBuffer = io.StringIO()
    # Yaml is hard coded, since the class controls both ends of the
    # export/import.
    BackendClass = getClassOf(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)
    exporter.saveDatasets(exports)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, dataIds in inserts.items():
        exporter.saveDataIds(dataIds)

    # Look for any explicitly defined collections; if there are none, use
    # the defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs. This follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer

def _setupNewButler(butler: Butler, outputLocation: ButlerURI, dirExists: bool) -> Butler:
    # Set up the new butler object at the specified location.
    if dirExists:
        # Remove the existing registry database; if the code got this far
        # and the file exists, clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["allow_put_of_predefined_dataset"] = True
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"
    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if config.get(("datastore", "root")) == BUTLER_ROOT_TAG:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # We must use the dimension configuration from the original butler and
    # not the defaults.
    config = Butler.makeRepo(root=outputLocation, config=config,
                             dimensionConfig=butler.registry.dimensions.dimensionConfig,
                             overwrite=True, forceConfigRoot=False)

    # Return a newly created butler.
    return Butler(config, writeable=True)

def _import(yamlBuffer: io.StringIO,
            newButler: Butler,
            inserts: DataSetTypeMap,
            run: str,
            butlerModifier: Optional[Callable[[Butler], Butler]]
            ) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets.
    newButler.import_(filename=yamlBuffer, format="yaml", reuseIds=True)

    # If there is a modifier callable, run it to make any necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register the datasets to be produced and insert them into the registry.
    for dsType, dataIds in inserts.items():
        newButler.registry.registerDatasetType(dsType)
        newButler.registry.insertDatasets(dsType, dataIds, run)

    return newButler

def buildExecutionButler(butler: Butler,
                         graph: QuantumGraph,
                         outputLocation: Union[str, ButlerURI],
                         run: str,
                         *,
                         clobber: bool = False,
                         butlerModifier: Optional[Callable[[Butler], Butler]] = None,
                         collections: Optional[Iterable[str]] = None
                         ) -> Butler:
    r"""Export an input `QuantumGraph` into a new, minimal
    `~lsst.daf.butler.Butler` which contains only the datasets specified by
    the `QuantumGraph`. These datasets are both those that already exist in
    the input `~lsst.daf.butler.Butler`, and those that are expected to be
    produced during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The existing `~lsst.daf.butler.Butler` instance from which existing
        datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` that was used to create any `QuantumGraphs`
        that will be converted with this function.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : `str` or `~lsst.daf.butler.ButlerURI`
        URI of the location at which the execution butler is to be exported.
        May be specified as a string or a `~lsst.daf.butler.ButlerURI`
        instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in.
        If `None`, the default value in ``registry.defaults`` will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`,
        whatever is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied, this should be a callable that accepts a
        `~lsst.daf.butler.Butler` and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        desired modifications to the `~lsst.daf.butler.Butler`. It will be
        called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler`, but prior to inserting the datasets
        expected to be produced. Examples of what this callable could do
        include creating collections or runs (see the sketch in the Examples
        section below).
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied, the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and ``clobber`` is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a
        directory.
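
    Examples
    --------
    A minimal sketch of the expected call, assuming a pre-existing data
    repository and a `QuantumGraph` already loaded into ``qgraph``; the
    repository path, collection names, run name, and output location below
    are hypothetical::

        butler = Butler("/repo/main", collections=["HSC/defaults"])
        executionButler = buildExecutionButler(
            butler,
            qgraph,
            "/scratch/execution-butler",
            run="u/someone/test-run",
            clobber=True,
        )

    A ``butlerModifier`` callable can perform extra setup on the new
    repository before the expected outputs are registered; for example, a
    hypothetical callable that pre-registers an output run collection::

        def registerOutputRun(newButler: Butler) -> Butler:
            # Hypothetical modifier: ensure the run collection exists
            # before datasets are inserted into it.
            newButler.registry.registerRun("u/someone/test-run")
            return newButler

        executionButler = buildExecutionButler(
            butler,
            qgraph,
            "/scratch/execution-butler",
            run="u/someone/test-run",
            clobber=True,
            butlerModifier=registerOutputRun,
        )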

258 """ 

    # We know this must refer to a directory.
    outputLocation = ButlerURI(outputLocation, forceDirectory=True)

    # Do this first so we fail fast if the output location already exists.
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    exports, inserts = _accumulate(graph)
    yamlBuffer = _export(butler, collections, exports, inserts)

    newButler = _setupNewButler(butler, outputLocation, dirExists)

    return _import(yamlBuffer, newButler, inserts, run, butlerModifier)