Coverage report for python/lsst/ctrl/mpexec/singleQuantumExecutor.py: 10% of 189 statements covered (generated by coverage.py v7.3.2 at 2023-12-12 12:21 +0000).

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28__all__ = ["SingleQuantumExecutor"] 

29 

30# ------------------------------- 

31# Imports of standard modules -- 

32# ------------------------------- 

33import logging 

34import sys 

35import time 

36from collections import defaultdict 

37from collections.abc import Callable 

38from itertools import chain 

39from typing import Any, cast 

40 

41from lsst.daf.butler import ( 

42 Butler, 

43 CollectionType, 

44 DatasetRef, 

45 DatasetType, 

46 LimitedButler, 

47 NamedKeyDict, 

48 Quantum, 

49) 

50from lsst.daf.butler.registry.wildcards import CollectionWildcard 

51from lsst.pipe.base import ( 

52 AdjustQuantumHelper, 

53 ExecutionResources, 

54 Instrument, 

55 InvalidQuantumError, 

56 NoWorkFound, 

57 PipelineTask, 

58 QuantumContext, 

59 RepeatableQuantumError, 

60 TaskDef, 

61 TaskFactory, 

62) 

63 

64# During metadata transition phase, determine metadata class by 

65# asking pipe_base 

66from lsst.pipe.base.task import _TASK_FULL_METADATA_TYPE, _TASK_METADATA_TYPE 

67from lsst.utils.timer import logInfo 

68 

69# ----------------------------- 

70# Imports for other modules -- 

71# ----------------------------- 

72from .log_capture import LogCapture 

73from .quantumGraphExecutor import QuantumExecutor 

74from .reports import QuantumReport 

75 

76# ---------------------------------- 

77# Local non-exported definitions -- 

78# ---------------------------------- 

79 

80_LOG = logging.getLogger(__name__) 

81 

82 

class SingleQuantumExecutor(QuantumExecutor):
    """Executor class which runs one Quantum at a time.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler` or `None`
        Data butler, `None` means that Quantum-backed butler should be used
        instead.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Instance of a task factory.
    skipExistingIn
        Expressions representing the collections to search for existing
        output datasets. See :ref:`daf_butler_ordered_collection_searches`
        for allowed types. This class only checks for the presence of butler
        output run in the list of collections. If the output run is present
        in the list then the quanta whose complete outputs exist in the output
        run will be skipped. `None` or empty string/sequence disables skipping.
    clobberOutputs : `bool`, optional
        If `True`, then outputs from a quantum that exist in output run
        collection will be removed prior to executing a quantum. If
        ``skipExistingIn`` contains output run, then only partial outputs from
        a quantum will be removed. Only used when ``butler`` is not `None`.
    enableLsstDebug : `bool`, optional
        Enable debugging with ``lsstDebug`` facility for a task.
    exitOnKnownError : `bool`, optional
        If `True`, call `sys.exit` with the appropriate exit code for special
        known exceptions, after printing a traceback, instead of letting the
        exception propagate up to calling. This is always the behavior for
        InvalidQuantumError.
    limited_butler_factory : `Callable`, optional
        A method that creates a `~lsst.daf.butler.LimitedButler` instance
        for a given Quantum. This parameter must be defined if ``butler`` is
        `None`. If ``butler`` is not `None` then this parameter is ignored.
    resources : `~lsst.pipe.base.ExecutionResources`, optional
        The resources available to this quantum when executing.
    """

    def __init__(
        self,
        butler: Butler | None,
        taskFactory: TaskFactory,
        skipExistingIn: Any = None,
        clobberOutputs: bool = False,
        enableLsstDebug: bool = False,
        exitOnKnownError: bool = False,
        limited_butler_factory: Callable[[Quantum], LimitedButler] | None = None,
        resources: ExecutionResources | None = None,
    ):
        self.butler = butler
        self.taskFactory = taskFactory
        self.enableLsstDebug = enableLsstDebug
        self.clobberOutputs = clobberOutputs
        self.exitOnKnownError = exitOnKnownError
        self.limited_butler_factory = limited_butler_factory
        self.report: QuantumReport | None = None
        self.resources = resources

        if self.butler is None:
            assert limited_butler_factory is not None, "limited_butler_factory is needed when butler is None"

        # Determine whether the butler output run is named (directly or after
        # flattening) in skipExistingIn.
        # TODO: This duplicates logic in GraphBuilder, would be nice to have
        # better abstraction for this some day.
        self.skipExisting = False
        if self.butler is not None and skipExistingIn:
            wildcard = CollectionWildcard.from_expression(skipExistingIn)
            # As an optimization, look at the explicit name list first.
            if self.butler.run in wildcard.strings:
                self.skipExisting = True
            else:
                # Need to flatten the expression via the registry and recheck.
                flattened = self.butler.registry.queryCollections(
                    skipExistingIn,
                    collectionTypes=CollectionType.RUN,
                )
                self.skipExisting = self.butler.run in flattened

157 

158 def execute(self, taskDef: TaskDef, quantum: Quantum) -> Quantum: 

159 # Docstring inherited from QuantumExecutor.execute 

160 assert quantum.dataId is not None, "Quantum DataId cannot be None" 

161 

162 if self.butler is not None: 

163 self.butler.registry.refresh() 

164 

165 # Catch any exception and make a report based on that. 

166 try: 

167 result = self._execute(taskDef, quantum) 

168 self.report = QuantumReport(dataId=quantum.dataId, taskLabel=taskDef.label) 

169 return result 

170 except Exception as exc: 

171 self.report = QuantumReport.from_exception( 

172 exception=exc, 

173 dataId=quantum.dataId, 

174 taskLabel=taskDef.label, 

175 ) 

176 raise 

177 

178 def _execute(self, taskDef: TaskDef, quantum: Quantum) -> Quantum: 

179 """Execute the quantum. 

180 

181 Internal implementation of `execute()`. 

182 """ 

183 startTime = time.time() 

184 

185 # Make a limited butler instance if needed (which should be QBB if full 

186 # butler is not defined). 

187 limited_butler: LimitedButler 

188 if self.butler is not None: 

189 limited_butler = self.butler 

190 else: 

191 # We check this in constructor, but mypy needs this check here. 

192 assert self.limited_butler_factory is not None 

193 limited_butler = self.limited_butler_factory(quantum) 

194 

195 if self.butler is not None: 

196 log_capture = LogCapture.from_full(self.butler) 

197 else: 

198 log_capture = LogCapture.from_limited(limited_butler) 

199 with log_capture.capture_logging(taskDef, quantum) as captureLog: 

200 # Save detailed resource usage before task start to metadata. 

201 quantumMetadata = _TASK_METADATA_TYPE() 

202 logInfo(None, "prep", metadata=quantumMetadata) # type: ignore[arg-type] 

203 

204 _LOG.info("Preparing execution of quantum for label=%s dataId=%s.", taskDef.label, quantum.dataId) 

205 

206 # check whether to skip or delete old outputs, if it returns True 

207 # or raises an exception do not try to store logs, as they may be 

208 # already in butler. 

209 captureLog.store = False 

210 if self.checkExistingOutputs(quantum, taskDef, limited_butler): 

211 _LOG.info( 

212 "Skipping already-successful quantum for label=%s dataId=%s.", 

213 taskDef.label, 

214 quantum.dataId, 

215 ) 

216 return quantum 

217 captureLog.store = True 

218 

219 try: 

220 quantum = self.updatedQuantumInputs(quantum, taskDef, limited_butler) 

221 except NoWorkFound as exc: 

222 _LOG.info( 

223 "Nothing to do for task '%s' on quantum %s; saving metadata and skipping: %s", 

224 taskDef.label, 

225 quantum.dataId, 

226 str(exc), 

227 ) 

228 # Make empty metadata that looks something like what a 

229 # do-nothing task would write (but we don't bother with empty 

230 # nested PropertySets for subtasks). This is slightly 

231 # duplicative with logic in pipe_base that we can't easily call 

232 # from here; we'll fix this on DM-29761. 

233 logInfo(None, "end", metadata=quantumMetadata) # type: ignore[arg-type] 

234 fullMetadata = _TASK_FULL_METADATA_TYPE() 

235 fullMetadata[taskDef.label] = _TASK_METADATA_TYPE() 

236 fullMetadata["quantum"] = quantumMetadata 

237 self.writeMetadata(quantum, fullMetadata, taskDef, limited_butler) 

238 return quantum 

239 

240 # enable lsstDebug debugging 

241 if self.enableLsstDebug: 

242 try: 

243 _LOG.debug("Will try to import debug.py") 

244 import debug # type: ignore # noqa:F401 

245 except ImportError: 

246 _LOG.warn("No 'debug' module found.") 

247 

248 # initialize global state 

249 self.initGlobals(quantum) 

250 

251 # Ensure that we are executing a frozen config 

252 taskDef.config.freeze() 

253 logInfo(None, "init", metadata=quantumMetadata) # type: ignore[arg-type] 

254 init_input_refs = list(quantum.initInputs.values()) 

255 

256 _LOG.info( 

257 "Constructing task and executing quantum for label=%s dataId=%s.", 

258 taskDef.label, 

259 quantum.dataId, 

260 ) 

261 task = self.taskFactory.makeTask(taskDef, limited_butler, init_input_refs) 

262 logInfo(None, "start", metadata=quantumMetadata) # type: ignore[arg-type] 

263 try: 

264 self.runQuantum(task, quantum, taskDef, limited_butler) 

265 except Exception as e: 

266 _LOG.error( 

267 "Execution of task '%s' on quantum %s failed. Exception %s: %s", 

268 taskDef.label, 

269 quantum.dataId, 

270 e.__class__.__name__, 

271 str(e), 

272 ) 

273 raise 

274 logInfo(None, "end", metadata=quantumMetadata) # type: ignore[arg-type] 

275 fullMetadata = task.getFullMetadata() 

276 fullMetadata["quantum"] = quantumMetadata 

277 self.writeMetadata(quantum, fullMetadata, taskDef, limited_butler) 

278 stopTime = time.time() 

279 _LOG.info( 

280 "Execution of task '%s' on quantum %s took %.3f seconds", 

281 taskDef.label, 

282 quantum.dataId, 

283 stopTime - startTime, 

284 ) 

285 return quantum 

286 

287 def checkExistingOutputs(self, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler) -> bool: 

288 """Decide whether this quantum needs to be executed. 

289 

290 If only partial outputs exist then they are removed if 

291 ``clobberOutputs`` is True, otherwise an exception is raised. 

292 

293 Parameters 

294 ---------- 

295 quantum : `~lsst.daf.butler.Quantum` 

296 Quantum to check for existing outputs 

297 taskDef : `~lsst.pipe.base.TaskDef` 

298 Task definition structure. 

299 

300 Returns 

301 ------- 

302 exist : `bool` 

303 `True` if ``self.skipExisting`` is defined, and a previous 

304 execution of this quanta appears to have completed successfully 

305 (either because metadata was written or all datasets were written). 

306 `False` otherwise. 

307 

308 Raises 

309 ------ 

310 RuntimeError 

311 Raised if some outputs exist and some not. 

312 """ 

313 if not self.butler: 

314 # Skip/prune logic only works for full butler. 

315 return False 

316 

317 if self.skipExisting: 

318 _LOG.debug( 

319 "Checking existence of metadata from previous execution of label=%s dataId=%s.", 

320 taskDef.label, 

321 quantum.dataId, 

322 ) 

323 # Metadata output exists; this is sufficient to assume the previous 

324 # run was successful and should be skipped. 

325 [metadata_ref] = quantum.outputs[taskDef.metadataDatasetName] 

326 if metadata_ref is not None: 

327 if limited_butler.stored(metadata_ref): 

328 return True 

329 

330 # Find and prune (partial) outputs if `self.clobberOutputs` is set. 

331 _LOG.debug( 

332 "Looking for existing outputs in the way for label=%s dataId=%s.", taskDef.label, quantum.dataId 

333 ) 

334 ref_dict = self.butler.stored_many(chain.from_iterable(quantum.outputs.values())) 

335 existingRefs = [ref for ref, exists in ref_dict.items() if exists] 

336 missingRefs = [ref for ref, exists in ref_dict.items() if not exists] 

337 if existingRefs: 

338 if not missingRefs: 

339 # Full outputs exist. 

340 if self.skipExisting: 

341 return True 

342 elif self.clobberOutputs: 

343 _LOG.info("Removing complete outputs for quantum %s: %s", quantum, existingRefs) 

344 self.butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True) 

345 else: 

346 raise RuntimeError( 

347 f"Complete outputs exists for a quantum {quantum} " 

348 "and neither clobberOutputs nor skipExisting is set: " 

349 f"collection={self.butler.run} existingRefs={existingRefs}" 

350 ) 

351 else: 

352 # Partial outputs from a failed quantum. 

353 _LOG.debug( 

354 "Partial outputs exist for quantum %s collection=%s existingRefs=%s missingRefs=%s", 

355 quantum, 

356 self.butler.run, 

357 existingRefs, 

358 missingRefs, 

359 ) 

360 if self.clobberOutputs: 

361 # only prune 

362 _LOG.info("Removing partial outputs for task %s: %s", taskDef, existingRefs) 

363 self.butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True) 

364 return False 

365 else: 

366 raise RuntimeError( 

367 "Registry inconsistency while checking for existing quantum outputs:" 

368 f" quantum={quantum} collection={self.butler.run} existingRefs={existingRefs}" 

369 f" missingRefs={missingRefs}" 

370 ) 

371 

372 # By default always execute. 

373 return False 

374 

375 def updatedQuantumInputs( 

376 self, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler 

377 ) -> Quantum: 

378 """Update quantum with extra information, returns a new updated 

379 Quantum. 

380 

381 Some methods may require input DatasetRefs to have non-None 

382 ``dataset_id``, but in case of intermediate dataset it may not be 

383 filled during QuantumGraph construction. This method will retrieve 

384 missing info from registry. 

385 

386 Parameters 

387 ---------- 

388 quantum : `~lsst.daf.butler.Quantum` 

389 Single Quantum instance. 

390 taskDef : `~lsst.pipe.base.TaskDef` 

391 Task definition structure. 

392 

393 Returns 

394 ------- 

395 update : `~lsst.daf.butler.Quantum` 

396 Updated Quantum instance 

397 """ 

398 anyChanges = False 

399 updatedInputs: defaultdict[DatasetType, list] = defaultdict(list) 

400 for key, refsForDatasetType in quantum.inputs.items(): 

401 _LOG.debug( 

402 "Checking existence of input '%s' for label=%s dataId=%s.", 

403 key.name, 

404 taskDef.label, 

405 quantum.dataId, 

406 ) 

407 newRefsForDatasetType = updatedInputs[key] 

408 stored = limited_butler.stored_many(refsForDatasetType) 

409 for ref in refsForDatasetType: 

410 if stored[ref]: 

411 newRefsForDatasetType.append(ref) 

412 else: 

413 # This should only happen if a predicted intermediate was 

414 # not actually produced upstream, but 

415 # datastore misconfigurations can unfortunately also land 

416 # us here. 

417 _LOG.info("No dataset artifact found for %s", ref) 

418 continue 

419 if len(newRefsForDatasetType) != len(refsForDatasetType): 

420 anyChanges = True 

421 # If we removed any input datasets, let the task check if it has enough 

422 # to proceed and/or prune related datasets that it also doesn't 

423 # need/produce anymore. It will raise NoWorkFound if it can't run, 

424 # which we'll let propagate up. This is exactly what we run during QG 

425 # generation, because a task shouldn't care whether an input is missing 

426 # because some previous task didn't produce it, or because it just 

427 # wasn't there during QG generation. 

428 namedUpdatedInputs = NamedKeyDict[DatasetType, list[DatasetRef]](updatedInputs.items()) 

429 helper = AdjustQuantumHelper(namedUpdatedInputs, quantum.outputs) 

430 if anyChanges: 

431 _LOG.debug("Running adjustQuantum for label=%s dataId=%s.", taskDef.label, quantum.dataId) 

432 assert quantum.dataId is not None, "Quantum DataId cannot be None" 

433 helper.adjust_in_place(taskDef.connections, label=taskDef.label, data_id=quantum.dataId) 

434 return Quantum( 

435 taskName=quantum.taskName, 

436 taskClass=quantum.taskClass, 

437 dataId=quantum.dataId, 

438 initInputs=quantum.initInputs, 

439 inputs=helper.inputs, 

440 outputs=helper.outputs, 

441 ) 

442 

443 def runQuantum( 

444 self, task: PipelineTask, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler 

445 ) -> None: 

446 """Execute task on a single quantum. 

447 

448 Parameters 

449 ---------- 

450 task : `~lsst.pipe.base.PipelineTask` 

451 Task object. 

452 quantum : `~lsst.daf.butler.Quantum` 

453 Single Quantum instance. 

454 taskDef : `~lsst.pipe.base.TaskDef` 

455 Task definition structure. 

456 """ 

457 # Create a butler that operates in the context of a quantum 

458 butlerQC = QuantumContext(limited_butler, quantum, resources=self.resources) 

459 

460 # Get the input and output references for the task 

461 inputRefs, outputRefs = taskDef.connections.buildDatasetRefs(quantum) 

462 

463 # Call task runQuantum() method. Catch a few known failure modes and 

464 # translate them into specific 

465 try: 

466 task.runQuantum(butlerQC, inputRefs, outputRefs) 

467 except NoWorkFound as err: 

468 # Not an error, just an early exit. 

469 _LOG.info("Task '%s' on quantum %s exited early: %s", taskDef.label, quantum.dataId, str(err)) 

470 pass 

471 except RepeatableQuantumError as err: 

472 if self.exitOnKnownError: 

473 _LOG.warning("Caught repeatable quantum error for %s (%s):", taskDef, quantum.dataId) 

474 _LOG.warning(err, exc_info=True) 

475 sys.exit(err.EXIT_CODE) 

476 else: 

477 raise 

478 except InvalidQuantumError as err: 

479 _LOG.fatal("Invalid quantum error for %s (%s): %s", taskDef, quantum.dataId) 

480 _LOG.fatal(err, exc_info=True) 

481 sys.exit(err.EXIT_CODE) 

482 

483 def writeMetadata( 

484 self, quantum: Quantum, metadata: Any, taskDef: TaskDef, limited_butler: LimitedButler 

485 ) -> None: 

486 # DatasetRef has to be in the Quantum outputs, can lookup by name 

487 try: 

488 [ref] = quantum.outputs[taskDef.metadataDatasetName] 

489 except LookupError as exc: 

490 raise InvalidQuantumError( 

491 f"Quantum outputs is missing metadata dataset type {taskDef.metadataDatasetName};" 

492 " this could happen due to inconsistent options between QuantumGraph generation" 

493 " and execution" 

494 ) from exc 

495 limited_butler.put(metadata, ref) 

496 

497 def initGlobals(self, quantum: Quantum) -> None: 

498 """Initialize global state needed for task execution. 

499 

500 Parameters 

501 ---------- 

502 quantum : `~lsst.daf.butler.Quantum` 

503 Single Quantum instance. 

504 

505 Notes 

506 ----- 

507 There is an issue with initializing filters singleton which is done 

508 by instrument, to avoid requiring tasks to do it in runQuantum() 

509 we do it here when any dataId has an instrument dimension. Also for 

510 now we only allow single instrument, verify that all instrument 

511 names in all dataIds are identical. 

512 

513 This will need revision when filter singleton disappears. 

514 """ 

515 # can only work for full butler 

516 if self.butler is None: 

517 return 

518 oneInstrument = None 

519 for datasetRefs in chain(quantum.inputs.values(), quantum.outputs.values()): 

520 for datasetRef in datasetRefs: 

521 dataId = datasetRef.dataId 

522 instrument = cast(str, dataId.get("instrument")) 

523 if instrument is not None: 

524 if oneInstrument is not None: 

525 assert ( # type: ignore 

526 instrument == oneInstrument 

527 ), "Currently require that only one instrument is used per graph" 

528 else: 

529 oneInstrument = instrument 

530 Instrument.fromName(instrument, self.butler.registry) 

531 

532 def getReport(self) -> QuantumReport | None: 

533 # Docstring inherited from base class 

534 if self.report is None: 

535 raise RuntimeError("getReport() called before execute()") 

536 return self.report