Coverage for python/lsst/ctrl/mpexec/singleQuantumExecutor.py: 11%

187 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-03-28 03:02 -0700

# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively.  If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

27 

__all__ = ["SingleQuantumExecutor"]

# -------------------------------
#  Imports of standard modules --
# -------------------------------
import logging
import sys
import time
from collections import defaultdict
from collections.abc import Callable
from itertools import chain
from typing import Any, cast

from lsst.daf.butler import (
    Butler,
    CollectionType,
    DatasetRef,
    DatasetType,
    LimitedButler,
    NamedKeyDict,
    Quantum,
)
from lsst.daf.butler.registry.wildcards import CollectionWildcard
from lsst.pipe.base import (
    AdjustQuantumHelper,
    ExecutionResources,
    Instrument,
    InvalidQuantumError,
    NoWorkFound,
    PipelineTask,
    QuantumContext,
    RepeatableQuantumError,
    TaskDef,
    TaskFactory,
)

# During metadata transition phase, determine metadata class by
# asking pipe_base.
from lsst.pipe.base.task import _TASK_FULL_METADATA_TYPE, _TASK_METADATA_TYPE
from lsst.utils.timer import logInfo

# -----------------------------
#  Imports for other modules --
# -----------------------------
from .log_capture import LogCapture
from .quantumGraphExecutor import QuantumExecutor
from .reports import QuantumReport

# ----------------------------------
#  Local non-exported definitions --
# ----------------------------------

# Module-level logger used by SingleQuantumExecutor below.
_LOG = logging.getLogger(__name__)

81 

82 

class SingleQuantumExecutor(QuantumExecutor):
    """Executor class which runs one Quantum at a time.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler` or `None`
        Data butler, `None` means that Quantum-backed butler should be used
        instead.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Instance of a task factory.
    skipExistingIn : `~typing.Any`
        Expressions representing the collections to search for existing
        output datasets. See :ref:`daf_butler_ordered_collection_searches`
        for allowed types. This class only checks for the presence of butler
        output run in the list of collections. If the output run is present
        in the list then the quanta whose complete outputs exist in the output
        run will be skipped. `None` or empty string/sequence disables skipping.
    clobberOutputs : `bool`, optional
        If `True`, then outputs from a quantum that exist in output run
        collection will be removed prior to executing a quantum. If
        ``skipExistingIn`` contains output run, then only partial outputs from
        a quantum will be removed. Only used when ``butler`` is not `None`.
    enableLsstDebug : `bool`, optional
        Enable debugging with ``lsstDebug`` facility for a task.
    exitOnKnownError : `bool`, optional
        If `True`, call `sys.exit` with the appropriate exit code for special
        known exceptions, after printing a traceback, instead of letting the
        exception propagate up to calling. This is always the behavior for
        InvalidQuantumError.
    limited_butler_factory : `Callable`, optional
        A method that creates a `~lsst.daf.butler.LimitedButler` instance
        for a given Quantum. This parameter must be defined if ``butler`` is
        `None`. If ``butler`` is not `None` then this parameter is ignored.
    resources : `~lsst.pipe.base.ExecutionResources`, optional
        The resources available to this quantum when executing.
    skipExisting : `bool`, optional
        If `True`, skip quanta whose metadata datasets are already stored.
        Unlike ``skipExistingIn``, this works with limited butlers as well as
        full butlers. Always set to `True` if ``skipExistingIn`` matches
        ``butler.run``.
    """

    def __init__(
        self,
        butler: Butler | None,
        taskFactory: TaskFactory,
        skipExistingIn: Any = None,
        clobberOutputs: bool = False,
        enableLsstDebug: bool = False,
        exitOnKnownError: bool = False,
        limited_butler_factory: Callable[[Quantum], LimitedButler] | None = None,
        resources: ExecutionResources | None = None,
        skipExisting: bool = False,
    ):
        self.butler = butler
        self.taskFactory = taskFactory
        self.enableLsstDebug = enableLsstDebug
        self.clobberOutputs = clobberOutputs
        self.exitOnKnownError = exitOnKnownError
        self.limited_butler_factory = limited_butler_factory
        # Report describing the most recent execute() call; None until then.
        self.report: QuantumReport | None = None
        self.resources = resources

        if self.butler is None:
            assert limited_butler_factory is not None, "limited_butler_factory is needed when butler is None"

        # Find whether output run is in skipExistingIn.
        # TODO: This duplicates logic in GraphBuilder, would be nice to have
        # better abstraction for this some day.
        self.skipExisting = skipExisting
        if self.butler is not None and skipExistingIn:
            skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
            # As optimization check in the explicit list of names first.
            self.skipExisting = self.butler.run in skip_collections_wildcard.strings
            if not self.skipExisting:
                # Need to flatten the expression and check again.
                self.skipExisting = self.butler.run in self.butler.registry.queryCollections(
                    skipExistingIn,
                    collectionTypes=CollectionType.RUN,
                )

    def execute(self, taskDef: TaskDef, quantum: Quantum) -> Quantum:
        # Docstring inherited from QuantumExecutor.execute
        assert quantum.dataId is not None, "Quantum DataId cannot be None"

        if self.butler is not None:
            self.butler.registry.refresh()

        # Catch any exception and make a report based on that.
        try:
            result = self._execute(taskDef, quantum)
            self.report = QuantumReport(dataId=quantum.dataId, taskLabel=taskDef.label)
            return result
        except Exception as exc:
            self.report = QuantumReport.from_exception(
                exception=exc,
                dataId=quantum.dataId,
                taskLabel=taskDef.label,
            )
            raise

    def _execute(self, taskDef: TaskDef, quantum: Quantum) -> Quantum:
        """Execute the quantum.

        Internal implementation of `execute()`.
        """
        startTime = time.time()

        # Make a limited butler instance if needed (which should be QBB if full
        # butler is not defined).
        limited_butler: LimitedButler
        if self.butler is not None:
            limited_butler = self.butler
        else:
            # We check this in constructor, but mypy needs this check here.
            assert self.limited_butler_factory is not None
            limited_butler = self.limited_butler_factory(quantum)

        if self.butler is not None:
            log_capture = LogCapture.from_full(self.butler)
        else:
            log_capture = LogCapture.from_limited(limited_butler)
        with log_capture.capture_logging(taskDef, quantum) as captureLog:
            # Save detailed resource usage before task start to metadata.
            quantumMetadata = _TASK_METADATA_TYPE()
            logInfo(None, "prep", metadata=quantumMetadata)  # type: ignore[arg-type]

            _LOG.info("Preparing execution of quantum for label=%s dataId=%s.", taskDef.label, quantum.dataId)

            # Check whether to skip or delete old outputs; if it returns True
            # or raises an exception do not try to store logs, as they may be
            # already in butler.
            captureLog.store = False
            if self.checkExistingOutputs(quantum, taskDef, limited_butler):
                _LOG.info(
                    "Skipping already-successful quantum for label=%s dataId=%s.",
                    taskDef.label,
                    quantum.dataId,
                )
                return quantum
            captureLog.store = True

            try:
                quantum = self.updatedQuantumInputs(quantum, taskDef, limited_butler)
            except NoWorkFound as exc:
                _LOG.info(
                    "Nothing to do for task '%s' on quantum %s; saving metadata and skipping: %s",
                    taskDef.label,
                    quantum.dataId,
                    str(exc),
                )
                # Make empty metadata that looks something like what a
                # do-nothing task would write (but we don't bother with empty
                # nested PropertySets for subtasks). This is slightly
                # duplicative with logic in pipe_base that we can't easily call
                # from here; we'll fix this on DM-29761.
                logInfo(None, "end", metadata=quantumMetadata)  # type: ignore[arg-type]
                fullMetadata = _TASK_FULL_METADATA_TYPE()
                fullMetadata[taskDef.label] = _TASK_METADATA_TYPE()
                fullMetadata["quantum"] = quantumMetadata
                self.writeMetadata(quantum, fullMetadata, taskDef, limited_butler)
                return quantum

            # Enable lsstDebug debugging.
            if self.enableLsstDebug:
                try:
                    _LOG.debug("Will try to import debug.py")
                    import debug  # type: ignore # noqa:F401
                except ImportError:
                    # Fix: Logger.warn is a deprecated alias; use warning().
                    _LOG.warning("No 'debug' module found.")

            # Initialize global state.
            self.initGlobals(quantum)

            # Ensure that we are executing a frozen config.
            taskDef.config.freeze()
            logInfo(None, "init", metadata=quantumMetadata)  # type: ignore[arg-type]
            init_input_refs = list(quantum.initInputs.values())

            _LOG.info(
                "Constructing task and executing quantum for label=%s dataId=%s.",
                taskDef.label,
                quantum.dataId,
            )
            task = self.taskFactory.makeTask(taskDef, limited_butler, init_input_refs)
            logInfo(None, "start", metadata=quantumMetadata)  # type: ignore[arg-type]
            try:
                self.runQuantum(task, quantum, taskDef, limited_butler)
            except Exception as e:
                _LOG.error(
                    "Execution of task '%s' on quantum %s failed. Exception %s: %s",
                    taskDef.label,
                    quantum.dataId,
                    e.__class__.__name__,
                    str(e),
                )
                raise
            logInfo(None, "end", metadata=quantumMetadata)  # type: ignore[arg-type]
            fullMetadata = task.getFullMetadata()
            fullMetadata["quantum"] = quantumMetadata
            self.writeMetadata(quantum, fullMetadata, taskDef, limited_butler)
            # NOTE(review): the timing log is emitted inside the capture
            # context so it is stored with the quantum's logs — assumed from
            # the original's statement order; confirm against upstream.
            stopTime = time.time()
            _LOG.info(
                "Execution of task '%s' on quantum %s took %.3f seconds",
                taskDef.label,
                quantum.dataId,
                stopTime - startTime,
            )
        return quantum

    def checkExistingOutputs(self, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler) -> bool:
        """Decide whether this quantum needs to be executed.

        If only partial outputs exist then they are removed if
        ``clobberOutputs`` is True, otherwise an exception is raised.

        The ``LimitedButler`` is used for everything, and should be set to
        ``self.butler`` if no separate ``LimitedButler`` is available.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum to check for existing outputs.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler to use for querying.

        Returns
        -------
        exist : `bool`
            `True` if ``self.skipExisting`` is defined, and a previous
            execution of this quanta appears to have completed successfully
            (either because metadata was written or all datasets were written).
            `False` otherwise.

        Raises
        ------
        RuntimeError
            Raised if some outputs exist and some not.
        """
        if self.skipExisting:
            _LOG.debug(
                "Checking existence of metadata from previous execution of label=%s dataId=%s.",
                taskDef.label,
                quantum.dataId,
            )
            # Metadata output exists; this is sufficient to assume the previous
            # run was successful and should be skipped.
            [metadata_ref] = quantum.outputs[taskDef.metadataDatasetName]
            if metadata_ref is not None:
                if limited_butler.stored(metadata_ref):
                    return True

        # Find and prune (partial) outputs if `self.clobberOutputs` is set.
        _LOG.debug(
            "Looking for existing outputs in the way for label=%s dataId=%s.", taskDef.label, quantum.dataId
        )
        ref_dict = limited_butler.stored_many(chain.from_iterable(quantum.outputs.values()))
        existingRefs = [ref for ref, exists in ref_dict.items() if exists]
        missingRefs = [ref for ref, exists in ref_dict.items() if not exists]
        if existingRefs:
            if not missingRefs:
                # Full outputs exist.
                if self.skipExisting:
                    return True
                elif self.clobberOutputs:
                    _LOG.info("Removing complete outputs for quantum %s: %s", quantum, existingRefs)
                    limited_butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True)
                else:
                    raise RuntimeError(
                        f"Complete outputs exists for a quantum {quantum} "
                        "and neither clobberOutputs nor skipExisting is set: "
                        f"existingRefs={existingRefs}"
                    )
            else:
                # Partial outputs from a failed quantum.
                _LOG.debug(
                    "Partial outputs exist for quantum %s existingRefs=%s missingRefs=%s",
                    quantum,
                    existingRefs,
                    missingRefs,
                )
                if self.clobberOutputs:
                    # Only prune, do not skip.
                    _LOG.info("Removing partial outputs for task %s: %s", taskDef, existingRefs)
                    limited_butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True)
                    return False
                else:
                    raise RuntimeError(
                        "Registry inconsistency while checking for existing quantum outputs:"
                        f" quantum={quantum} existingRefs={existingRefs}"
                        f" missingRefs={missingRefs}"
                    )

        # By default always execute.
        return False

    def updatedQuantumInputs(
        self, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler
    ) -> Quantum:
        """Update quantum with extra information, returns a new updated
        Quantum.

        Some methods may require input DatasetRefs to have non-None
        ``dataset_id``, but in case of intermediate dataset it may not be
        filled during QuantumGraph construction. This method will retrieve
        missing info from registry.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler to use for querying.

        Returns
        -------
        update : `~lsst.daf.butler.Quantum`
            Updated Quantum instance.
        """
        anyChanges = False
        updatedInputs: defaultdict[DatasetType, list] = defaultdict(list)
        for key, refsForDatasetType in quantum.inputs.items():
            _LOG.debug(
                "Checking existence of input '%s' for label=%s dataId=%s.",
                key.name,
                taskDef.label,
                quantum.dataId,
            )
            newRefsForDatasetType = updatedInputs[key]
            stored = limited_butler.stored_many(refsForDatasetType)
            for ref in refsForDatasetType:
                if stored[ref]:
                    newRefsForDatasetType.append(ref)
                else:
                    # This should only happen if a predicted intermediate was
                    # not actually produced upstream, but
                    # datastore misconfigurations can unfortunately also land
                    # us here.
                    _LOG.info("No dataset artifact found for %s", ref)
            if len(newRefsForDatasetType) != len(refsForDatasetType):
                anyChanges = True
        # If we removed any input datasets, let the task check if it has enough
        # to proceed and/or prune related datasets that it also doesn't
        # need/produce anymore. It will raise NoWorkFound if it can't run,
        # which we'll let propagate up. This is exactly what we run during QG
        # generation, because a task shouldn't care whether an input is missing
        # because some previous task didn't produce it, or because it just
        # wasn't there during QG generation.
        namedUpdatedInputs = NamedKeyDict[DatasetType, list[DatasetRef]](updatedInputs.items())
        helper = AdjustQuantumHelper(namedUpdatedInputs, quantum.outputs)
        if anyChanges:
            _LOG.debug("Running adjustQuantum for label=%s dataId=%s.", taskDef.label, quantum.dataId)
            assert quantum.dataId is not None, "Quantum DataId cannot be None"
            helper.adjust_in_place(taskDef.connections, label=taskDef.label, data_id=quantum.dataId)
        return Quantum(
            taskName=quantum.taskName,
            taskClass=quantum.taskClass,
            dataId=quantum.dataId,
            initInputs=quantum.initInputs,
            inputs=helper.inputs,
            outputs=helper.outputs,
        )

    def runQuantum(
        self, task: PipelineTask, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler
    ) -> None:
        """Execute task on a single quantum.

        Parameters
        ----------
        task : `~lsst.pipe.base.PipelineTask`
            Task object.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler to use for dataset I/O.
        """
        # Create a butler that operates in the context of a quantum.
        butlerQC = QuantumContext(limited_butler, quantum, resources=self.resources)

        # Get the input and output references for the task.
        inputRefs, outputRefs = taskDef.connections.buildDatasetRefs(quantum)

        # Call task runQuantum() method. Catch a few known failure modes and
        # translate them into specific exit behavior.
        try:
            task.runQuantum(butlerQC, inputRefs, outputRefs)
        except NoWorkFound as err:
            # Not an error, just an early exit.
            _LOG.info("Task '%s' on quantum %s exited early: %s", taskDef.label, quantum.dataId, str(err))
        except RepeatableQuantumError as err:
            if self.exitOnKnownError:
                _LOG.warning("Caught repeatable quantum error for %s (%s):", taskDef, quantum.dataId)
                _LOG.warning(err, exc_info=True)
                sys.exit(err.EXIT_CODE)
            else:
                raise
        except InvalidQuantumError as err:
            # Bug fix: the original format string had three %s placeholders
            # but only two arguments, which itself raised a logging error here.
            # The exception text is logged with a traceback on the next line.
            _LOG.fatal("Invalid quantum error for %s (%s):", taskDef, quantum.dataId)
            _LOG.fatal(err, exc_info=True)
            sys.exit(err.EXIT_CODE)

    def writeMetadata(
        self, quantum: Quantum, metadata: Any, taskDef: TaskDef, limited_butler: LimitedButler
    ) -> None:
        """Store task metadata for the quantum.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum whose metadata output dataset is written.
        metadata : `~typing.Any`
            Metadata object to store.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler to use for dataset I/O.

        Raises
        ------
        InvalidQuantumError
            Raised if the quantum outputs do not contain the metadata dataset
            type.
        """
        # DatasetRef has to be in the Quantum outputs, can lookup by name.
        try:
            [ref] = quantum.outputs[taskDef.metadataDatasetName]
        except LookupError as exc:
            raise InvalidQuantumError(
                f"Quantum outputs is missing metadata dataset type {taskDef.metadataDatasetName};"
                " this could happen due to inconsistent options between QuantumGraph generation"
                " and execution"
            ) from exc
        limited_butler.put(metadata, ref)

    def initGlobals(self, quantum: Quantum) -> None:
        """Initialize global state needed for task execution.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.

        Notes
        -----
        There is an issue with initializing filters singleton which is done
        by instrument, to avoid requiring tasks to do it in runQuantum()
        we do it here when any dataId has an instrument dimension. Also for
        now we only allow single instrument, verify that all instrument
        names in all dataIds are identical.

        This will need revision when filter singleton disappears.
        """
        # Can only work for full butler.
        if self.butler is None:
            return
        oneInstrument = None
        for datasetRefs in chain(quantum.inputs.values(), quantum.outputs.values()):
            for datasetRef in datasetRefs:
                dataId = datasetRef.dataId
                instrument = cast(str, dataId.get("instrument"))
                if instrument is not None:
                    if oneInstrument is not None:
                        assert (  # type: ignore
                            instrument == oneInstrument
                        ), "Currently require that only one instrument is used per graph"
                    else:
                        oneInstrument = instrument
                        Instrument.fromName(instrument, self.butler.registry)

    def getReport(self) -> QuantumReport | None:
        # Docstring inherited from base class
        if self.report is None:
            raise RuntimeError("getReport() called before execute()")
        return self.report