Coverage for python/lsst/ctrl/mpexec/preExecInit.py: 9%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

98 statements  

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

# Public API of this module: only the PreExecInit class is exported.
__all__ = ['PreExecInit']

23 

24# ------------------------------- 

25# Imports of standard modules -- 

26# ------------------------------- 

27import logging 

28import itertools 

29 

30# ----------------------------- 

31# Imports for other modules -- 

32# ----------------------------- 

33from lsst.base import Packages 

34from lsst.pipe.base import PipelineDatasetTypes 

35 

# Module logger named after this module's dotted path with the top-level
# package (everything up to and including the first ".") stripped, e.g.
# "lsst.ctrl.mpexec.preExecInit" -> "ctrl.mpexec.preExecInit".  Note that if
# __name__ contained no dot, partition(".")[2] would be "" and this would
# return the root logger.
_LOG = logging.getLogger(__name__.partition(".")[2])

37 

38 

class PreExecInit:
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible. If
        `False`, then any existing conflicting dataset will cause a butler
        exception to be raised.

    Raises
    ------
    RuntimeError
        Raised if ``extendRun`` is `True` but ``butler`` was not initialized
        with a default output RUN collection.
    """
    def __init__(self, butler, taskFactory, extendRun=False):
        self.butler = butler
        self.taskFactory = taskFactory
        self.extendRun = extendRun
        # extendRun mode compares new data against datasets already stored in
        # ``butler.run``, so a default output RUN collection must exist.
        if self.extendRun and self.butler.run is None:
            raise RuntimeError(
                "Cannot perform extendRun logic unless butler is initialized "
                "with a default output RUN collection."
            )

    def initialize(self, graph, saveInitOutputs=True, registerDatasetTypes=False, saveVersions=True):
        """Perform all initialization steps.

        Convenience method to execute all initialization steps. Instead of
        calling this method and providing all options it is also possible to
        call methods individually.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.
        saveVersions : `bool`, optional
            If ``False`` then do not save package versions even if
            ``saveInitOutputs`` is set to ``True``.
        """
        # register dataset types or check consistency
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save.
        if saveInitOutputs:
            self.saveInitOutputs(graph)
            self.saveConfigs(graph)
            if saveVersions:
                self.savePackageVersions(graph)

    def initializeDatasetTypes(self, graph, registerDatasetTypes=False):
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.

        Raises
        ------
        ValueError
            Raised if existing DatasetType is different from DatasetType
            in a graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and DatasetType
            does not exist in registry.
        """
        pipeline = graph.taskGraph
        # Include the per-task config and the packages dataset types so they
        # are registered/checked along with the regular task outputs.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=self.butler.registry,
                                                         include_configs=True, include_packages=True)
        for datasetType in itertools.chain(datasetTypes.initIntermediates, datasetTypes.initOutputs,
                                           datasetTypes.intermediates, datasetTypes.outputs):
            # Only composites are registered, no components, and by this point
            # the composite should already exist.
            if registerDatasetTypes and not datasetType.isComponent():
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # this is a no-op if it already exists and is consistent,
                # and it raises if it is inconsistent.
                self.butler.registry.registerDatasetType(datasetType)
            else:
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                try:
                    expected = self.butler.registry.getDatasetType(datasetType.name)
                except KeyError:
                    # Likely means that --register-dataset-types is forgotten.
                    raise KeyError(f"Dataset type with name '{datasetType.name}' not found. Dataset types "
                                   "have to be registered with either `butler register-dataset-type` or "
                                   "passing `--register-dataset-types` option to `pipetask run`.") from None
                if expected != datasetType:
                    raise ValueError(f"DatasetType configuration does not match Registry: "
                                     f"{datasetType} != {expected}")

    def saveInitOutputs(self, graph):
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but type of existing object in
            butler is different from new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already
            exist. Content of a butler collection may be changed if
            exception is raised.

        Notes
        -----
        If ``extendRun`` is `True` then existing datasets are not
        overwritten, instead we should check that their stored object is
        exactly the same as what we would save at this time. Comparing
        arbitrary types of object is, of course, non-trivial. Current
        implementation only checks the existence of the datasets and their
        types against the types of objects produced by tasks. Ideally we
        would like to check that object data is identical too but presently
        there is no generic way to compare objects. In the future we can
        potentially introduce some extensible mechanism for that.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        for taskDef in graph.iterTaskGraph():
            # Instantiating the task is what produces the init-output objects.
            task = self.taskFactory.makeTask(taskDef.taskClass,
                                             taskDef.label,
                                             taskDef.config,
                                             None,
                                             self.butler)
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                initOutputVar = getattr(task, name)
                objFromStore = None
                if self.extendRun:
                    # check if it is there already
                    _LOG.debug("Retrieving InitOutputs for task=%s key=%s dsTypeName=%s",
                               task, name, attribute.name)
                    try:
                        objFromStore = self.butler.get(attribute.name, {}, collections=[self.butler.run])
                        # Types are supposed to be identical.
                        # TODO: Check that object contents is identical too.
                        if type(objFromStore) is not type(initOutputVar):
                            raise TypeError(f"Stored initOutput object type {type(objFromStore)} "
                                            f"is different from task-generated type "
                                            f"{type(initOutputVar)} for task {taskDef}")
                    except (LookupError, FileNotFoundError):
                        # FileNotFoundError likely means execution butler
                        # where refs do exist but datastore artifacts do not.
                        pass
                if objFromStore is None:
                    # butler will raise exception if dataset is already there
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", taskDef.label, name)
                    self.butler.put(initOutputVar, attribute.name, {})

    def saveConfigs(self, graph):
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but existing object in butler is
            different from new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exist.
            Content of a butler collection should not be changed if exception
            is raised.
        """
        def logConfigMismatch(msg):
            """Log messages about configuration mismatch.
            """
            # ``Logger.fatal`` is a deprecated alias; ``critical`` is the
            # documented spelling of the same level.
            _LOG.critical("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # start transaction to rollback any changes on exceptions
        with self.butler.transaction():
            for taskDef in graph.taskGraph:
                configName = taskDef.configDatasetName

                oldConfig = None
                if self.extendRun:
                    try:
                        oldConfig = self.butler.get(configName, {}, collections=[self.butler.run])
                        if not taskDef.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
                            raise TypeError(
                                f"Config does not match existing task config {configName!r} in butler; "
                                "tasks configurations must be consistent within the same run collection")
                    except (LookupError, FileNotFoundError):
                        # FileNotFoundError likely means execution butler
                        # where refs do exist but datastore artifacts do not.
                        pass
                if oldConfig is None:
                    # butler will raise exception if dataset is already there
                    _LOG.debug("Saving Config for task=%s dataset type=%s", taskDef.label, configName)
                    self.butler.put(taskDef.config, configName, {})

    def savePackageVersions(self, graph):
        """Write versions of software packages to butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but existing object in butler is
            different from new data.
        """
        packages = Packages.fromSystem()
        _LOG.debug("want to save packages: %s", packages)
        datasetType = PipelineDatasetTypes.packagesDatasetName
        dataId = {}
        oldPackages = None
        # start transaction to rollback any changes on exceptions
        with self.butler.transaction():
            if self.extendRun:
                try:
                    oldPackages = self.butler.get(datasetType, dataId, collections=[self.butler.run])
                    _LOG.debug("old packages: %s", oldPackages)
                except (LookupError, FileNotFoundError):
                    # FileNotFoundError likely means execution butler where
                    # refs do exist but datastore artifacts do not.
                    pass
            if oldPackages is not None:
                # Note that because we can only detect python modules that have
                # been imported, the stored list of products may be more or
                # less complete than what we have now. What's important is
                # that the products that are in common have the same version.
                diff = packages.difference(oldPackages)
                if diff:
                    versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
                    raise TypeError(f"Package versions mismatch: ({versions_str})")
                else:
                    _LOG.debug("new packages are consistent with old")
                # Update the old set of packages in case we have more packages
                # that haven't been persisted.
                extra = packages.extra(oldPackages)
                if extra:
                    _LOG.debug("extra packages: %s", extra)
                    oldPackages.update(packages)
                    # have to remove existing dataset first, butler has no
                    # replace option.
                    ref = self.butler.registry.findDataset(datasetType, dataId, collections=[self.butler.run])
                    self.butler.pruneDatasets([ref], unstore=True, purge=True)
                    self.butler.put(oldPackages, datasetType, dataId)
            else:
                self.butler.put(packages, datasetType, dataId)