Coverage for python/lsst/ctrl/mpexec/preExecInit.py: 10%

113 statements  

# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ["PreExecInit"]

# -------------------------------
# Imports of standard modules --
# -------------------------------
import logging

# -----------------------------
# Imports for other modules --
# -----------------------------
from lsst.base import Packages
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.pipe.base import PipelineDatasetTypes

_LOG = logging.getLogger(__name__)


class PreExecInit:
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible. If
        `False`, then any existing conflicting dataset will cause a butler
        exception to be raised.
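
    Examples
    --------
    A minimal usage sketch; ``butler``, ``taskFactory``, and ``qgraph`` are
    assumed to be pre-existing `~lsst.daf.butler.Butler`,
    `~lsst.pipe.base.TaskFactory`, and `~lsst.pipe.base.QuantumGraph`
    instances (the names are illustrative):

    >>> preExecInit = PreExecInit(butler, taskFactory, extendRun=False)
    >>> preExecInit.initialize(qgraph, registerDatasetTypes=True)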

    """

    def __init__(self, butler, taskFactory, extendRun=False):
        self.butler = butler
        self.taskFactory = taskFactory
        self.extendRun = extendRun
        if self.extendRun and self.butler.run is None:
            raise RuntimeError(
                "Cannot perform extendRun logic unless butler is initialized "
                "with a default output RUN collection."
            )

    def initialize(self, graph, saveInitOutputs=True, registerDatasetTypes=False, saveVersions=True):
        """Perform all initialization steps.

        Convenience method that executes all initialization steps. Instead of
        calling this method with all options, the individual methods can also
        be called separately.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must already be registered.
        saveVersions : `bool`, optional
            If ``False`` then do not save package versions even if
            ``saveInitOutputs`` is set to ``True``.
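
        Examples
        --------
        A sketch of the equivalent sequence of individual calls; ``qgraph``
        is an assumed `~lsst.pipe.base.QuantumGraph` and ``preExecInit`` an
        instance of this class:

        >>> preExecInit.initializeDatasetTypes(qgraph, registerDatasetTypes=True)
        >>> preExecInit.saveInitOutputs(qgraph)
        >>> preExecInit.saveConfigs(qgraph)
        >>> preExecInit.savePackageVersions(qgraph)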

        """
        # register dataset types or check consistency
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save.
        if saveInitOutputs:
            self.saveInitOutputs(graph)
            self.saveConfigs(graph)
            if saveVersions:
                self.savePackageVersions(graph)

    def initializeDatasetTypes(self, graph, registerDatasetTypes=False):
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must already be registered.

        Raises
        ------
        ValueError
            Raised if an existing DatasetType differs from the DatasetType
            in the graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and a DatasetType
            does not exist in registry.
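
        Examples
        --------
        A sketch of the default checking-only mode; ``preExecInit`` and
        ``qgraph`` are assumed instances. Dataset types must already have
        been registered, e.g. with ``butler register-dataset-type``,
        otherwise this raises `KeyError`:

        >>> preExecInit.initializeDatasetTypes(qgraph)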

        """
        pipeline = graph.taskGraph
        datasetTypes = PipelineDatasetTypes.fromPipeline(
            pipeline, registry=self.butler.registry, include_configs=True, include_packages=True
        )
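
        # Note: intermediates are paired with ``is_input=True`` below because
        # they are both produced and consumed within the pipeline, so storage
        # class compatibility must hold in both directions; pure outputs only
        # need to be compatible on "put".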

        for dataset_types, is_input in (
            (datasetTypes.initIntermediates, True),
            (datasetTypes.initOutputs, False),
            (datasetTypes.intermediates, True),
            (datasetTypes.outputs, False),
        ):
            self._register_output_dataset_types(registerDatasetTypes, dataset_types, is_input)

    def _register_output_dataset_types(self, registerDatasetTypes, datasetTypes, is_input):
        def _check_compatibility(datasetType, expected, is_input) -> bool:
            # These are output dataset types so check for compatibility on put.
            is_compatible = expected.is_compatible_with(datasetType)

            if is_input:
                # This dataset type is also used for input so must be
                # compatible on get as well.
                is_compatible = is_compatible and datasetType.is_compatible_with(expected)

            if is_compatible:
                _LOG.debug(
                    "The dataset type configurations differ (%s from task != %s from registry) "
                    "but the storage classes are compatible. Can continue.",
                    datasetType,
                    expected,
                )
            return is_compatible

        for datasetType in datasetTypes:
            # Only composites are registered, no components, and by this point
            # the composite should already exist.
            if registerDatasetTypes and not datasetType.isComponent():
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # this is a no-op if it already exists and is consistent,
                # and it raises if it is inconsistent.
                try:
                    self.butler.registry.registerDatasetType(datasetType)
                except ConflictingDefinitionError:
                    if not _check_compatibility(
                        datasetType, self.butler.registry.getDatasetType(datasetType.name), is_input
                    ):
                        raise
            else:
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                try:
                    expected = self.butler.registry.getDatasetType(datasetType.name)
                except KeyError:
                    # Likely means that --register-dataset-types was
                    # forgotten.
                    raise KeyError(
                        f"Dataset type with name '{datasetType.name}' not found. Dataset types "
                        "have to be registered with either `butler register-dataset-type` or "
                        "by passing the `--register-dataset-types` option to `pipetask run`."
                    ) from None
                if expected != datasetType:
                    if not _check_compatibility(datasetType, expected, is_input):
                        raise ValueError(
                            f"DatasetType configuration does not match Registry: {datasetType} != {expected}"
                        )

    def saveInitOutputs(self, graph):
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but the type of an existing
            object in butler is different from the new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exist.
            Content of a butler collection may be changed if an exception
            is raised.

        Notes
        -----
        If ``extendRun`` is `True` then existing datasets are not
        overwritten; instead we should check that their stored object is
        exactly the same as what we would save at this time. Comparing
        arbitrary types of object is, of course, non-trivial. The current
        implementation only checks the existence of the datasets and their
        types against the types of objects produced by tasks. Ideally we
        would like to check that object data is identical too, but presently
        there is no generic way to compare objects. In the future we can
        potentially introduce some extensible mechanism for that.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        for taskDef in graph.iterTaskGraph():
            task = self.taskFactory.makeTask(
                taskDef.taskClass, taskDef.label, taskDef.config, None, self.butler
            )
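            # Note: constructing the task above materializes its init-output
            # objects as attributes named after the init-output connections;
            # the loop below reads them back via ``getattr``.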

            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                initOutputVar = getattr(task, name)
                objFromStore = None
                if self.extendRun:
                    # check if it is there already
                    _LOG.debug(
                        "Retrieving InitOutputs for task=%s key=%s dsTypeName=%s", task, name, attribute.name
                    )
                    try:
                        objFromStore = self.butler.get(attribute.name, {}, collections=[self.butler.run])
                        # Types are supposed to be identical.
                        # TODO: Check that object contents are identical too.
                        if type(objFromStore) is not type(initOutputVar):
                            raise TypeError(
                                f"Stored initOutput object type {type(objFromStore)} "
                                f"is different from task-generated type "
                                f"{type(initOutputVar)} for task {taskDef}"
                            )
                    except (LookupError, FileNotFoundError):
                        # FileNotFoundError likely means execution butler
                        # where refs do exist but datastore artifacts do not.
                        pass
                if objFromStore is None:
                    # butler will raise exception if dataset is already there
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", taskDef.label, name)
                    self.butler.put(initOutputVar, attribute.name, {})

    def saveConfigs(self, graph):
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but an existing object in butler
            is different from the new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exist.
            Content of a butler collection should not be changed if an
            exception is raised.
        """

        def logConfigMismatch(msg):
            """Log messages about configuration mismatch."""
            _LOG.fatal("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # start transaction to rollback any changes on exceptions
        with self.butler.transaction():
            for taskDef in graph.taskGraph:
                configName = taskDef.configDatasetName

                oldConfig = None
                if self.extendRun:
                    try:
                        oldConfig = self.butler.get(configName, {}, collections=[self.butler.run])
                        if not taskDef.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
                            raise TypeError(
                                f"Config does not match existing task config {configName!r} in butler; "
                                "task configurations must be consistent within the same run collection"
                            )
                    except (LookupError, FileNotFoundError):
                        # FileNotFoundError likely means execution butler
                        # where refs do exist but datastore artifacts do not.
                        pass
                if oldConfig is None:
                    # butler will raise exception if dataset is already there
                    _LOG.debug("Saving Config for task=%s dataset type=%s", taskDef.label, configName)
                    self.butler.put(taskDef.config, configName, {})

    def savePackageVersions(self, graph):
        """Write versions of software packages to butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but existing object in butler is
            different from new data.
        """
        packages = Packages.fromSystem()
        _LOG.debug("want to save packages: %s", packages)
        datasetType = PipelineDatasetTypes.packagesDatasetName
        dataId = {}
        oldPackages = None
        # start transaction to rollback any changes on exceptions
        with self.butler.transaction():
            if self.extendRun:
                try:
                    oldPackages = self.butler.get(datasetType, dataId, collections=[self.butler.run])
                    _LOG.debug("old packages: %s", oldPackages)
                except (LookupError, FileNotFoundError):
                    # FileNotFoundError likely means execution butler where
                    # refs do exist but datastore artifacts do not.
                    pass
            if oldPackages is not None:
                # Note that because we can only detect python modules that
                # have been imported, the stored list of products may be more
                # or less complete than what we have now. What's important is
                # that the products that are in common have the same version.
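                # For illustration (hypothetical versions): if the stored set
                # recorded numpy 1.22 but the current environment has numpy
                # 1.23, ``difference`` reports the mismatch; a package present
                # only in the current environment appears in ``extra`` below
                # instead.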

                diff = packages.difference(oldPackages)
                if diff:
                    versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
                    raise TypeError(f"Package versions mismatch: ({versions_str})")
                else:
                    _LOG.debug("new packages are consistent with old")
                # Update the old set of packages in case we have more packages
                # that haven't been persisted.
                extra = packages.extra(oldPackages)
                if extra:
                    _LOG.debug("extra packages: %s", extra)
                    oldPackages.update(packages)
                    # have to remove existing dataset first, butler has no
                    # replace option.
                    ref = self.butler.registry.findDataset(datasetType, dataId, collections=[self.butler.run])
                    self.butler.pruneDatasets([ref], unstore=True, purge=True)
                    self.butler.put(oldPackages, datasetType, dataId)
            else:
                self.butler.put(packages, datasetType, dataId)