Coverage for python/lsst/ctrl/mpexec/preExecInit.py: 10%

137 statements  

coverage.py v6.5.0, created at 2022-11-19 02:01 -0800

# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["PreExecInit"]

# -------------------------------
# Imports of standard modules --
# -------------------------------
import logging
from typing import TYPE_CHECKING, Any, Iterable

# -----------------------------
# Imports for other modules --
# -----------------------------
from lsst.daf.butler import DatasetType
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.pipe.base import PipelineDatasetTypes
from lsst.utils.packages import Packages

from .mock_task import MockButlerQuantumContext

if TYPE_CHECKING:
    from lsst.daf.butler import Butler
    from lsst.pipe.base import QuantumGraph, TaskFactory

_LOG = logging.getLogger(__name__)

class PreExecInit:
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible. If
        `False`, then any existing conflicting dataset will cause a butler
        exception to be raised.
    mock : `bool`, optional
        If `True` then also do initialization needed for pipeline mocking.
    """
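    # A minimal usage sketch (hypothetical ``butler``, ``taskFactory`` and
    # ``qgraph`` objects supplied by the caller; an assumption about typical
    # use, not code from this module):
    #
    #     preExecInit = PreExecInit(butler, taskFactory, extendRun=False)
    #     preExecInit.initialize(qgraph, registerDatasetTypes=True)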

    def __init__(self, butler: Butler, taskFactory: TaskFactory, extendRun: bool = False, mock: bool = False):
        self.butler = butler
        self.taskFactory = taskFactory
        self.extendRun = extendRun
        self.mock = mock
        if self.extendRun and self.butler.run is None:
            raise RuntimeError(
                "Cannot perform extendRun logic unless butler is initialized "
                "with a default output RUN collection."
            )

    def initialize(
        self,
        graph: QuantumGraph,
        saveInitOutputs: bool = True,
        registerDatasetTypes: bool = False,
        saveVersions: bool = True,
    ) -> None:
        """Perform all initialization steps.

        Convenience method that executes all initialization steps. Instead of
        calling this method with all options, the individual methods can also
        be called one at a time (see the sketch following this method).

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If `True` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If `True` then register dataset types in registry, otherwise
            they must already be registered.
        saveVersions : `bool`, optional
            If `False` then do not save package versions even if
            ``saveInitOutputs`` is set to `True`.
        """
        # Register dataset types or check consistency.
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save.
        if saveInitOutputs:
            self.saveInitOutputs(graph)
            self.saveConfigs(graph)
            if saveVersions:
                self.savePackageVersions(graph)
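    # Step-by-step equivalent of ``initialize`` (same hypothetical objects as
    # in the class-level sketch above):
    #
    #     preExecInit.initializeDatasetTypes(qgraph, registerDatasetTypes=True)
    #     preExecInit.saveInitOutputs(qgraph)
    #     preExecInit.saveConfigs(qgraph)
    #     preExecInit.savePackageVersions(qgraph)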

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If `True` then register dataset types in registry, otherwise
            they must already be registered.

        Raises
        ------
        ValueError
            Raised if an existing DatasetType differs from the DatasetType
            in the graph.
        KeyError
            Raised if ``registerDatasetTypes`` is `False` and a DatasetType
            does not exist in registry.
        """
        pipeline = graph.taskGraph
        pipelineDatasetTypes = PipelineDatasetTypes.fromPipeline(
            pipeline, registry=self.butler.registry, include_configs=True, include_packages=True
        )

        for datasetTypes, is_input in (
            (pipelineDatasetTypes.initIntermediates, True),
            (pipelineDatasetTypes.initOutputs, False),
            (pipelineDatasetTypes.intermediates, True),
            (pipelineDatasetTypes.outputs, False),
        ):
            self._register_output_dataset_types(registerDatasetTypes, datasetTypes, is_input)

        if self.mock:
            # Register special mock dataset types; skip logs and metadata.
            skipDatasetTypes = {taskDef.metadataDatasetName for taskDef in pipeline}
            skipDatasetTypes |= {taskDef.logOutputDatasetName for taskDef in pipeline}
            for datasetTypes, is_input in (
                (pipelineDatasetTypes.intermediates, True),
                (pipelineDatasetTypes.outputs, False),
            ):
                mockDatasetTypes = []
                for datasetType in datasetTypes:
                    if not (datasetType.name in skipDatasetTypes or datasetType.isComponent()):
                        mockDatasetTypes.append(
                            DatasetType(
                                MockButlerQuantumContext.mockDatasetTypeName(datasetType.name),
                                datasetType.dimensions,
                                "StructuredDataDict",
                            )
                        )
                if mockDatasetTypes:
                    self._register_output_dataset_types(registerDatasetTypes, mockDatasetTypes, is_input)
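    # Mock registration sketch (illustrative): for an output dataset type
    # named "calexp", the block above registers a parallel type named by
    # MockButlerQuantumContext.mockDatasetTypeName("calexp") with the same
    # dimensions but a "StructuredDataDict" storage class; the exact name
    # mangling lives in mock_task, not here.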

175 

176 def _register_output_dataset_types( 

177 self, registerDatasetTypes: bool, datasetTypes: Iterable[DatasetType], is_input: bool 

178 ) -> None: 

179 def _check_compatibility(datasetType: DatasetType, expected: DatasetType, is_input: bool) -> bool: 

180 # These are output dataset types so check for compatibility on put. 

181 is_compatible = expected.is_compatible_with(datasetType) 

182 

183 if is_input: 

184 # This dataset type is also used for input so must be 

185 # compatible on get as ell. 

186 is_compatible = is_compatible and datasetType.is_compatible_with(expected) 

187 

188 if is_compatible: 

189 _LOG.debug( 

190 "The dataset type configurations differ (%s from task != %s from registry) " 

191 "but the storage classes are compatible. Can continue.", 

192 datasetType, 

193 expected, 

194 ) 

195 return is_compatible 

196 

197 missing_datasetTypes = set() 

198 for datasetType in datasetTypes: 

199 # Only composites are registered, no components, and by this point 

200 # the composite should already exist. 

201 if registerDatasetTypes and not datasetType.isComponent(): 

202 _LOG.debug("Registering DatasetType %s with registry", datasetType) 

203 # this is a no-op if it already exists and is consistent, 

204 # and it raises if it is inconsistent. 

205 try: 

206 self.butler.registry.registerDatasetType(datasetType) 

207 except ConflictingDefinitionError: 

208 if not _check_compatibility( 

209 datasetType, self.butler.registry.getDatasetType(datasetType.name), is_input 

210 ): 

211 raise 

212 else: 

213 _LOG.debug("Checking DatasetType %s against registry", datasetType) 

214 try: 

215 expected = self.butler.registry.getDatasetType(datasetType.name) 

216 except KeyError: 

217 # Likely means that --register-dataset-types is forgotten. 

218 missing_datasetTypes.add(datasetType.name) 

219 continue 

220 if expected != datasetType: 

221 if not _check_compatibility(datasetType, expected, is_input): 

222 raise ValueError( 

223 f"DatasetType configuration does not match Registry: {datasetType} != {expected}" 

224 ) 

225 

226 if missing_datasetTypes: 

227 plural = "s" if len(missing_datasetTypes) != 1 else "" 

228 raise KeyError( 

229 f"Missing dataset type definition{plural}: {', '.join(missing_datasetTypes)}. " 

230 "Dataset types have to be registered with either `butler register-dataset-type` or " 

231 "passing `--register-dataset-types` option to `pipetask run`." 

232 ) 
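    # Note on the compatibility rule above: it is directional. A pure output
    # only needs the task's storage class to be convertible to the registry
    # definition (checked on `put`); an intermediate is also read back, so
    # conversion must work in both directions.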

    def saveInitOutputs(self, graph: QuantumGraph) -> None:
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but the type of an existing
            object in butler is different from the new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exist.
            Content of a butler collection may be changed if an exception
            is raised.

        Notes
        -----
        If ``extendRun`` is `True` then existing datasets are not
        overwritten; instead we should check that their stored object is
        exactly the same as what we would save at this time. Comparing
        arbitrary types of object is, of course, non-trivial. The current
        implementation only checks the existence of the datasets and their
        types against the types of objects produced by tasks. Ideally we
        would like to check that object data is identical too, but presently
        there is no generic way to compare objects. In the future we can
        potentially introduce some extensible mechanism for that.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        for taskDef in graph.iterTaskGraph():
            task = self.taskFactory.makeTask(
                taskDef.taskClass, taskDef.label, taskDef.config, None, self.butler
            )
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                initOutputVar = getattr(task, name)
                objFromStore = None
                if self.extendRun:
                    # Check whether it is there already.
                    _LOG.debug(
                        "Retrieving InitOutputs for task=%s key=%s dsTypeName=%s", task, name, attribute.name
                    )
                    try:
                        objFromStore = self.butler.get(attribute.name, {}, collections=[self.butler.run])
                        # Types are supposed to be identical.
                        # TODO: Check that object contents are identical too.
                        if type(objFromStore) is not type(initOutputVar):
                            raise TypeError(
                                f"Stored initOutput object type {type(objFromStore)} "
                                "is different from task-generated type "
                                f"{type(initOutputVar)} for task {taskDef}"
                            )
                    except (LookupError, FileNotFoundError):
                        # FileNotFoundError likely means execution butler
                        # where refs do exist but datastore artifacts do not.
                        pass
                if objFromStore is None:
                    # Butler will raise an exception if the dataset is
                    # already there.
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", taskDef.label, name)
                    self.butler.put(initOutputVar, attribute.name, {})

    def saveConfigs(self, graph: QuantumGraph) -> None:
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but an existing object in butler
            is different from the new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exist.
            Content of a butler collection should not be changed if an
            exception is raised.
        """

        def logConfigMismatch(msg: str) -> None:
            """Log messages about configuration mismatch."""
            _LOG.fatal("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # Start transaction to rollback any changes on exceptions.
        with self.butler.transaction():
            for taskDef in graph.taskGraph:
                configName = taskDef.configDatasetName

                oldConfig = None
                if self.extendRun:
                    try:
                        oldConfig = self.butler.get(configName, {}, collections=[self.butler.run])
                        if not taskDef.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
                            raise TypeError(
                                f"Config does not match existing task config {configName!r} in butler; "
                                "task configurations must be consistent within the same run collection"
                            )
                    except (LookupError, FileNotFoundError):
                        # FileNotFoundError likely means execution butler
                        # where refs do exist but datastore artifacts do not.
                        pass
                if oldConfig is None:
                    # Butler will raise an exception if the dataset is
                    # already there.
                    _LOG.debug("Saving Config for task=%s dataset type=%s", taskDef.label, configName)
                    self.butler.put(taskDef.config, configName, {})

345 def savePackageVersions(self, graph: QuantumGraph) -> None: 

346 """Write versions of software packages to butler. 

347 

348 Parameters 

349 ---------- 

350 graph : `~lsst.pipe.base.QuantumGraph` 

351 Execution graph. 

352 

353 Raises 

354 ------ 

355 TypeError 

356 Raised if ``extendRun`` is `True` but existing object in butler is 

357 different from new data. 

358 """ 

359 packages = Packages.fromSystem() 

360 _LOG.debug("want to save packages: %s", packages) 

361 datasetType = PipelineDatasetTypes.packagesDatasetName 

362 dataId: dict[str, Any] = {} 

363 oldPackages = None 

364 # start transaction to rollback any changes on exceptions 

365 with self.butler.transaction(): 

366 if self.extendRun: 

367 try: 

368 oldPackages = self.butler.get(datasetType, dataId, collections=[self.butler.run]) 

369 _LOG.debug("old packages: %s", oldPackages) 

370 except (LookupError, FileNotFoundError): 

371 # FileNotFoundError likely means execution butler where 

372 # refs do exist but datastore artifacts do not. 

373 pass 

374 if oldPackages is not None: 

375 # Note that because we can only detect python modules that have 

376 # been imported, the stored list of products may be more or 

377 # less complete than what we have now. What's important is 

378 # that the products that are in common have the same version. 

379 diff = packages.difference(oldPackages) 

380 if diff: 

381 versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff) 

382 raise TypeError(f"Package versions mismatch: ({versions_str})") 

383 else: 

384 _LOG.debug("new packages are consistent with old") 

385 # Update the old set of packages in case we have more packages 

386 # that haven't been persisted. 

387 extra = packages.extra(oldPackages) 

388 if extra: 

389 _LOG.debug("extra packages: %s", extra) 

390 oldPackages.update(packages) 

391 # have to remove existing dataset first, butler has no 

392 # replace option. 

393 ref = self.butler.registry.findDataset(datasetType, dataId, collections=[self.butler.run]) 

394 assert ref is not None, "Expecting to get dataset ref which is not None." 

395 self.butler.pruneDatasets([ref], unstore=True, purge=True) 

396 self.butler.put(oldPackages, datasetType, dataId) 

397 else: 

398 self.butler.put(packages, datasetType, dataId)