Coverage report for python/lsst/ctrl/mpexec/preExecInit.py: 9% of 131 statements

# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ["PreExecInit"]

# -------------------------------
# Imports of standard modules --
# -------------------------------
import logging

# -----------------------------
# Imports for other modules --
# -----------------------------
from lsst.daf.butler import DatasetType
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.pipe.base import PipelineDatasetTypes
from lsst.utils.packages import Packages

from .mock_task import MockButlerQuantumContext

_LOG = logging.getLogger(__name__)


class PreExecInit:
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible. If
        `False`, then any existing conflicting dataset will cause a butler
        exception to be raised.
    mock : `bool`, optional
        If `True` then also do initialization needed for pipeline mocking.
    """

    def __init__(self, butler, taskFactory, extendRun=False, mock=False):
        self.butler = butler
        self.taskFactory = taskFactory
        self.extendRun = extendRun
        self.mock = mock
        if self.extendRun and self.butler.run is None:
            raise RuntimeError(
                "Cannot perform extendRun logic unless butler is initialized "
                "with a default output RUN collection."
            )
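
    # A minimal usage sketch (illustrative only, not part of the original
    # module; the ``butler``, ``taskFactory``, and ``qgraph`` objects are
    # assumed to be supplied by the caller, e.g. by ``pipetask``):
    #
    #     preExecInit = PreExecInit(butler, taskFactory, extendRun=False)
    #     preExecInit.initialize(qgraph, registerDatasetTypes=True)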

    def initialize(self, graph, saveInitOutputs=True, registerDatasetTypes=False, saveVersions=True):
        """Perform all initialization steps.

        Convenience method to execute all initialization steps. Instead of
        calling this method and providing all options it is also possible to
        call the individual methods, as shown in the sketch after this
        method.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must already be registered.
        saveVersions : `bool`, optional
            If ``False`` then do not save package versions even if
            ``saveInitOutputs`` is set to ``True``.
        """
        # Register dataset types or check consistency.
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save.
        if saveInitOutputs:
            self.saveInitOutputs(graph)
            self.saveConfigs(graph)
            if saveVersions:
                self.savePackageVersions(graph)
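
    # Calling the individual steps directly, equivalent to
    # ``initialize(graph, saveVersions=False)`` (illustrative only):
    #
    #     preExecInit.initializeDatasetTypes(graph, registerDatasetTypes=True)
    #     preExecInit.saveInitOutputs(graph)
    #     preExecInit.saveConfigs(graph)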

    def initializeDatasetTypes(self, graph, registerDatasetTypes=False):
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must already be registered.

        Raises
        ------
        ValueError
            Raised if an existing DatasetType is different from a DatasetType
            in the graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and a DatasetType
            does not exist in registry.
        """
        pipeline = graph.taskGraph
        pipelineDatasetTypes = PipelineDatasetTypes.fromPipeline(
            pipeline, registry=self.butler.registry, include_configs=True, include_packages=True
        )

        for datasetTypes, is_input in (
            (pipelineDatasetTypes.initIntermediates, True),
            (pipelineDatasetTypes.initOutputs, False),
            (pipelineDatasetTypes.intermediates, True),
            (pipelineDatasetTypes.outputs, False),
        ):
            self._register_output_dataset_types(registerDatasetTypes, datasetTypes, is_input)

        if self.mock:
            # Register special mock dataset types; skip logs and metadata.
            skipDatasetTypes = {taskDef.metadataDatasetName for taskDef in pipeline}
            skipDatasetTypes |= {taskDef.logOutputDatasetName for taskDef in pipeline}
            for datasetTypes, is_input in (
                (pipelineDatasetTypes.intermediates, True),
                (pipelineDatasetTypes.outputs, False),
            ):
                mockDatasetTypes = []
                for datasetType in datasetTypes:
                    if not (datasetType.name in skipDatasetTypes or datasetType.isComponent()):
                        mockDatasetTypes.append(
                            DatasetType(
                                MockButlerQuantumContext.mockDatasetTypeName(datasetType.name),
                                datasetType.dimensions,
                                "StructuredDataDict",
                            )
                        )
                if mockDatasetTypes:
                    self._register_output_dataset_types(registerDatasetTypes, mockDatasetTypes, is_input)
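
    # Mock registration sketch: each eligible output gets a parallel dataset
    # type with a derived name and a ``StructuredDataDict`` storage class
    # (illustrative; the exact name mangling is defined by
    # MockButlerQuantumContext.mockDatasetTypeName, not by this module):
    #
    #     MockButlerQuantumContext.mockDatasetTypeName("deepCoadd")
    #     # -> a mock-prefixed name such as "_mock_deepCoadd"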

    def _register_output_dataset_types(self, registerDatasetTypes, datasetTypes, is_input):
        def _check_compatibility(datasetType, expected, is_input) -> bool:
            # These are output dataset types so check for compatibility on
            # put.
            is_compatible = expected.is_compatible_with(datasetType)

            if is_input:
                # This dataset type is also used for input so must be
                # compatible on get as well.
                is_compatible = is_compatible and datasetType.is_compatible_with(expected)

            if is_compatible:
                _LOG.debug(
                    "The dataset type configurations differ (%s from task != %s from registry) "
                    "but the storage classes are compatible. Can continue.",
                    datasetType,
                    expected,
                )
            return is_compatible

        missing_datasetTypes = set()
        for datasetType in datasetTypes:
            # Only composites are registered, no components, and by this point
            # the composite should already exist.
            if registerDatasetTypes and not datasetType.isComponent():
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # This is a no-op if it already exists and is consistent,
                # and it raises if it is inconsistent.
                try:
                    self.butler.registry.registerDatasetType(datasetType)
                except ConflictingDefinitionError:
                    if not _check_compatibility(
                        datasetType, self.butler.registry.getDatasetType(datasetType.name), is_input
                    ):
                        raise
            else:
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                try:
                    expected = self.butler.registry.getDatasetType(datasetType.name)
                except KeyError:
                    # Likely means that --register-dataset-types was
                    # forgotten.
                    missing_datasetTypes.add(datasetType.name)
                    continue
                if expected != datasetType:
                    if not _check_compatibility(datasetType, expected, is_input):
                        raise ValueError(
                            f"DatasetType configuration does not match Registry: {datasetType} != {expected}"
                        )

        if missing_datasetTypes:
            plural = "s" if len(missing_datasetTypes) != 1 else ""
            raise KeyError(
                f"Missing dataset type definition{plural}: {', '.join(missing_datasetTypes)}. "
                "Dataset types have to be registered with either `butler register-dataset-type` or "
                "by passing the `--register-dataset-types` option to `pipetask run`."
            )
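
    # Directionality of the compatibility check above (illustrative; assumes
    # ``task_type`` and ``registry_type`` are DatasetType objects that differ
    # only in storage class):
    #
    #     registry_type.is_compatible_with(task_type)  # required: put path
    #     task_type.is_compatible_with(registry_type)  # also required when
    #                                                  # the type is an input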

    def saveInitOutputs(self, graph):
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but the type of an existing
            object in butler is different from the new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exist.
            The content of a butler collection may be changed if this
            exception is raised.

        Notes
        -----
        If ``extendRun`` is `True` then existing datasets are not
        overwritten; instead we should check that their stored object is
        exactly the same as what we would save at this time. Comparing
        arbitrary types of object is, of course, non-trivial. The current
        implementation only checks the existence of the datasets and their
        types against the types of objects produced by tasks. Ideally we
        would like to check that the object data is identical too, but
        presently there is no generic way to compare objects. In the future
        we could potentially introduce some extensible mechanism for that.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        for taskDef in graph.iterTaskGraph():
            task = self.taskFactory.makeTask(
                taskDef.taskClass, taskDef.label, taskDef.config, None, self.butler
            )
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                initOutputVar = getattr(task, name)
                objFromStore = None
                if self.extendRun:
                    # Check whether it is there already.
                    _LOG.debug(
                        "Retrieving InitOutputs for task=%s key=%s dsTypeName=%s", task, name, attribute.name
                    )
                    try:
                        objFromStore = self.butler.get(attribute.name, {}, collections=[self.butler.run])
                        # Types are supposed to be identical.
                        # TODO: Check that object contents are identical too.
                        if type(objFromStore) is not type(initOutputVar):
                            raise TypeError(
                                f"Stored initOutput object type {type(objFromStore)} "
                                f"is different from task-generated type "
                                f"{type(initOutputVar)} for task {taskDef}"
                            )
                    except (LookupError, FileNotFoundError):
                        # FileNotFoundError likely means an execution butler
                        # where refs do exist but datastore artifacts do not.
                        pass
                if objFromStore is None:
                    # Butler will raise an exception if the dataset is
                    # already there.
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", taskDef.label, name)
                    self.butler.put(initOutputVar, attribute.name, {})
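
    # What the extendRun comparison above does and does not catch
    # (illustrative):
    #
    #     type(objFromStore) is type(initOutputVar)  # checked: types match
    #     objFromStore == initOutputVar              # NOT checked; see the
    #                                                # TODO in the code above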

    def saveConfigs(self, graph):
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but an existing object in
            butler is different from the new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exist.
            The content of a butler collection should not be changed if this
            exception is raised.
        """

        def logConfigMismatch(msg):
            """Log messages about configuration mismatch."""
            _LOG.fatal("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # Start a transaction to roll back any changes on exceptions.
        with self.butler.transaction():
            for taskDef in graph.taskGraph:
                configName = taskDef.configDatasetName

                oldConfig = None
                if self.extendRun:
                    try:
                        oldConfig = self.butler.get(configName, {}, collections=[self.butler.run])
                        if not taskDef.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
                            raise TypeError(
                                f"Config does not match existing task config {configName!r} in butler; "
                                "task configurations must be consistent within the same run collection"
                            )
                    except (LookupError, FileNotFoundError):
                        # FileNotFoundError likely means an execution butler
                        # where refs do exist but datastore artifacts do not.
                        pass
                if oldConfig is None:
                    # Butler will raise an exception if the dataset is
                    # already there.
                    _LOG.debug("Saving Config for task=%s dataset type=%s", taskDef.label, configName)
                    self.butler.put(taskDef.config, configName, {})
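
    # Per-task config dataset naming used above (illustrative; the name comes
    # from TaskDef.configDatasetName in lsst.pipe.base, conventionally the
    # task label with a "_config" suffix):
    #
    #     taskDef.configDatasetName  # e.g. "isr_config" for label "isr"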

    def savePackageVersions(self, graph):
        """Write versions of software packages to butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but an existing object in
            butler is different from the new data.
        """
        packages = Packages.fromSystem()
        _LOG.debug("want to save packages: %s", packages)
        datasetType = PipelineDatasetTypes.packagesDatasetName
        dataId = {}
        oldPackages = None
        # Start a transaction to roll back any changes on exceptions.
        with self.butler.transaction():
            if self.extendRun:
                try:
                    oldPackages = self.butler.get(datasetType, dataId, collections=[self.butler.run])
                    _LOG.debug("old packages: %s", oldPackages)
                except (LookupError, FileNotFoundError):
                    # FileNotFoundError likely means an execution butler where
                    # refs do exist but datastore artifacts do not.
                    pass
            if oldPackages is not None:
                # Note that because we can only detect python modules that
                # have been imported, the stored list of products may be more
                # or less complete than what we have now. What's important is
                # that the products that are in common have the same version.
                diff = packages.difference(oldPackages)
                if diff:
                    versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
                    raise TypeError(f"Package versions mismatch: ({versions_str})")
                else:
                    _LOG.debug("new packages are consistent with old")
                # Update the old set of packages in case we have more
                # packages that haven't been persisted.
                extra = packages.extra(oldPackages)
                if extra:
                    _LOG.debug("extra packages: %s", extra)
                    oldPackages.update(packages)
                    # Have to remove the existing dataset first; butler has
                    # no replace option.
                    ref = self.butler.registry.findDataset(datasetType, dataId, collections=[self.butler.run])
                    self.butler.pruneDatasets([ref], unstore=True, purge=True)
                    self.butler.put(oldPackages, datasetType, dataId)
            else:
                self.butler.put(packages, datasetType, dataId)

381 else: 

382 self.butler.put(packages, datasetType, dataId)