Coverage for python/lsst/ctrl/mpexec/preExecInit.py: 11%


# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ["PreExecInit"]

# -------------------------------
# Imports of standard modules --
# -------------------------------
import itertools
import logging

# -----------------------------
# Imports for other modules --
# -----------------------------
from lsst.base import Packages
from lsst.pipe.base import PipelineDatasetTypes

_LOG = logging.getLogger(__name__)


class PreExecInit:
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be
    performed on butler and registry to prepare them for QuantumGraph
    execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible. If
        `False`, then any existing conflicting dataset will cause a butler
        exception to be raised.
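
    Examples
    --------
    A minimal usage sketch (assuming ``butler``, ``taskFactory``, and
    ``graph`` are constructed elsewhere; ``registerDatasetTypes=True``
    mirrors the ``--register-dataset-types`` option of ``pipetask run``)::

        preExecInit = PreExecInit(butler, taskFactory)
        preExecInit.initialize(graph, registerDatasetTypes=True)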

57 """ 

58 

59 def __init__(self, butler, taskFactory, extendRun=False): 

60 self.butler = butler 

61 self.taskFactory = taskFactory 

62 self.extendRun = extendRun 

63 if self.extendRun and self.butler.run is None: 

64 raise RuntimeError( 

65 "Cannot perform extendRun logic unless butler is initialized " 

66 "with a default output RUN collection." 

67 ) 

    def initialize(self, graph, saveInitOutputs=True, registerDatasetTypes=False, saveVersions=True):
        """Perform all initialization steps.

        Convenience method that executes all initialization steps. Instead
        of calling this method with all options, the individual methods can
        also be called separately.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must already be registered.
        saveVersions : `bool`, optional
            If ``False`` then do not save package versions even if
            ``saveInitOutputs`` is set to ``True``.
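
        Examples
        --------
        Calling this method with default options is equivalent to the
        following sequence of individual calls (a sketch, assuming
        ``preExecInit`` and ``graph`` exist)::

            preExecInit.initializeDatasetTypes(graph)
            preExecInit.saveInitOutputs(graph)
            preExecInit.saveConfigs(graph)
            preExecInit.savePackageVersions(graph)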

89 """ 

90 # register dataset types or check consistency 

91 self.initializeDatasetTypes(graph, registerDatasetTypes) 

92 

93 # Save task initialization data or check that saved data 

94 # is consistent with what tasks would save 

95 if saveInitOutputs: 

96 self.saveInitOutputs(graph) 

97 self.saveConfigs(graph) 

98 if saveVersions: 

99 self.savePackageVersions(graph) 

    def initializeDatasetTypes(self, graph, registerDatasetTypes=False):
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must already be registered.

        Raises
        ------
        ValueError
            Raised if an existing DatasetType is different from the
            DatasetType in the graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and a DatasetType
            does not exist in registry.
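
        Examples
        --------
        A sketch of the two modes (assuming ``preExecInit`` and ``graph``
        exist)::

            # Register any missing dataset types before execution; this is
            # what `pipetask run --register-dataset-types` triggers.
            preExecInit.initializeDatasetTypes(graph, registerDatasetTypes=True)

            # Check-only mode: raises KeyError for unregistered dataset
            # types and ValueError for definitions that do not match.
            preExecInit.initializeDatasetTypes(graph)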

123 """ 

124 pipeline = graph.taskGraph 

125 datasetTypes = PipelineDatasetTypes.fromPipeline( 

126 pipeline, registry=self.butler.registry, include_configs=True, include_packages=True 

127 ) 

128 for datasetType in itertools.chain( 

129 datasetTypes.initIntermediates, 

130 datasetTypes.initOutputs, 

131 datasetTypes.intermediates, 

132 datasetTypes.outputs, 

133 ): 

134 # Only composites are registered, no components, and by this point 

135 # the composite should already exist. 

136 if registerDatasetTypes and not datasetType.isComponent(): 

137 _LOG.debug("Registering DatasetType %s with registry", datasetType) 

138 # this is a no-op if it already exists and is consistent, 

139 # and it raises if it is inconsistent. 

140 self.butler.registry.registerDatasetType(datasetType) 

141 else: 

142 _LOG.debug("Checking DatasetType %s against registry", datasetType) 

143 try: 

144 expected = self.butler.registry.getDatasetType(datasetType.name) 

145 except KeyError: 

146 # Likely means that --register-dataset-types is forgotten. 

147 raise KeyError( 

148 f"Dataset type with name '{datasetType.name}' not found. Dataset types " 

149 "have to be registered with either `butler register-dataset-type` or " 

150 "passing `--register-dataset-types` option to `pipetask run`." 

151 ) from None 

152 if expected != datasetType: 

153 raise ValueError( 

154 f"DatasetType configuration does not match Registry: {datasetType} != {expected}" 

155 ) 

    def saveInitOutputs(self, graph):
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but the type of an existing
            object in butler is different from the new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exist.
            The content of a butler collection may be changed if this
            exception is raised.

        Notes
        -----
        If ``extendRun`` is `True` then existing datasets are not
        overwritten; instead we should check that their stored object is
        exactly the same as what we would save at this time. Comparing
        arbitrary types of objects is, of course, non-trivial. The current
        implementation only checks the existence of the datasets and their
        types against the types of objects produced by tasks. Ideally we
        would like to check that object data is identical too, but presently
        there is no generic way to compare objects. In the future we could
        potentially introduce some extensible mechanism for that.
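
        Examples
        --------
        A sketch (assuming ``preExecInit`` and ``graph`` exist). With
        ``extendRun=False`` the butler raises if an init-output dataset
        already exists; with ``extendRun=True`` existing datasets are only
        checked for a matching Python type::

            preExecInit.saveInitOutputs(graph)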

186 """ 

187 _LOG.debug("Will save InitOutputs for all tasks") 

188 for taskDef in graph.iterTaskGraph(): 

189 task = self.taskFactory.makeTask( 

190 taskDef.taskClass, taskDef.label, taskDef.config, None, self.butler 

191 ) 

192 for name in taskDef.connections.initOutputs: 

193 attribute = getattr(taskDef.connections, name) 

194 initOutputVar = getattr(task, name) 

195 objFromStore = None 

196 if self.extendRun: 

197 # check if it is there already 

198 _LOG.debug( 

199 "Retrieving InitOutputs for task=%s key=%s dsTypeName=%s", task, name, attribute.name 

200 ) 

201 try: 

202 objFromStore = self.butler.get(attribute.name, {}, collections=[self.butler.run]) 

203 # Types are supposed to be identical. 

204 # TODO: Check that object contents is identical too. 

205 if type(objFromStore) is not type(initOutputVar): 

206 raise TypeError( 

207 f"Stored initOutput object type {type(objFromStore)} " 

208 f"is different from task-generated type " 

209 f"{type(initOutputVar)} for task {taskDef}" 

210 ) 

211 except (LookupError, FileNotFoundError): 

212 # FileNotFoundError likely means execution butler 

213 # where refs do exist but datastore artifacts do not. 

214 pass 

215 if objFromStore is None: 

216 # butler will raise exception if dataset is already there 

217 _LOG.debug("Saving InitOutputs for task=%s key=%s", taskDef.label, name) 

218 self.butler.put(initOutputVar, attribute.name, {}) 

    def saveConfigs(self, graph):
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but an existing object in
            butler is different from the new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exist.
            The content of a butler collection should not be changed if this
            exception is raised.
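
        Examples
        --------
        A sketch (assuming ``preExecInit`` and ``graph`` exist). With
        ``extendRun=True`` an existing config dataset is compared to the
        new one via ``Config.compare`` instead of being overwritten::

            preExecInit.saveConfigs(graph)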

238 """ 

239 

240 def logConfigMismatch(msg): 

241 """Log messages about configuration mismatch.""" 

242 _LOG.fatal("Comparing configuration: %s", msg) 

243 

244 _LOG.debug("Will save Configs for all tasks") 

245 # start transaction to rollback any changes on exceptions 

246 with self.butler.transaction(): 

247 for taskDef in graph.taskGraph: 

248 configName = taskDef.configDatasetName 

249 

250 oldConfig = None 

251 if self.extendRun: 

252 try: 

253 oldConfig = self.butler.get(configName, {}, collections=[self.butler.run]) 

254 if not taskDef.config.compare(oldConfig, shortcut=False, output=logConfigMismatch): 

255 raise TypeError( 

256 f"Config does not match existing task config {configName!r} in butler; " 

257 "tasks configurations must be consistent within the same run collection" 

258 ) 

259 except (LookupError, FileNotFoundError): 

260 # FileNotFoundError likely means execution butler 

261 # where refs do exist but datastore artifacts do not. 

262 pass 

263 if oldConfig is None: 

264 # butler will raise exception if dataset is already there 

265 _LOG.debug("Saving Config for task=%s dataset type=%s", taskDef.label, configName) 

266 self.butler.put(taskDef.config, configName, {}) 

    def savePackageVersions(self, graph):
        """Write versions of software packages to butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but an existing object in
            butler is different from the new data.
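
        Examples
        --------
        A sketch (assuming ``preExecInit`` and ``graph`` exist). With
        ``extendRun=True`` the current package versions are compared
        against the stored ones, and packages missing from the stored set
        are appended::

            preExecInit.savePackageVersions(graph)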

281 """ 

282 packages = Packages.fromSystem() 

283 _LOG.debug("want to save packages: %s", packages) 

284 datasetType = PipelineDatasetTypes.packagesDatasetName 

285 dataId = {} 

286 oldPackages = None 

287 # start transaction to rollback any changes on exceptions 

288 with self.butler.transaction(): 

289 if self.extendRun: 

290 try: 

291 oldPackages = self.butler.get(datasetType, dataId, collections=[self.butler.run]) 

292 _LOG.debug("old packages: %s", oldPackages) 

293 except (LookupError, FileNotFoundError): 

294 # FileNotFoundError likely means execution butler where 

295 # refs do exist but datastore artifacts do not. 

296 pass 

297 if oldPackages is not None: 

298 # Note that because we can only detect python modules that have 

299 # been imported, the stored list of products may be more or 

300 # less complete than what we have now. What's important is 

301 # that the products that are in common have the same version. 

302 diff = packages.difference(oldPackages) 

303 if diff: 

304 versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff) 

305 raise TypeError(f"Package versions mismatch: ({versions_str})") 

306 else: 

307 _LOG.debug("new packages are consistent with old") 

308 # Update the old set of packages in case we have more packages 

309 # that haven't been persisted. 

310 extra = packages.extra(oldPackages) 

311 if extra: 

312 _LOG.debug("extra packages: %s", extra) 

313 oldPackages.update(packages) 

314 # have to remove existing dataset first, butler has no 

315 # replace option. 

316 ref = self.butler.registry.findDataset(datasetType, dataId, collections=[self.butler.run]) 

317 self.butler.pruneDatasets([ref], unstore=True, purge=True) 

318 self.butler.put(oldPackages, datasetType, dataId) 

319 else: 

320 self.butler.put(packages, datasetType, dataId)