Coverage for python/lsst/ctrl/mpexec/singleQuantumExecutor.py: 10%

189 statements  

« prev     ^ index     » next       coverage.py v7.3.0, created at 2023-09-01 09:30 +0000

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22__all__ = ["SingleQuantumExecutor"] 

23 

24# ------------------------------- 

25# Imports of standard modules -- 

26# ------------------------------- 

27import logging 

28import sys 

29import time 

30from collections import defaultdict 

31from collections.abc import Callable 

32from itertools import chain 

33from typing import Any 

34 

35from lsst.daf.butler import ( 

36 Butler, 

37 CollectionType, 

38 DatasetRef, 

39 DatasetType, 

40 LimitedButler, 

41 NamedKeyDict, 

42 Quantum, 

43) 

44from lsst.daf.butler.registry.wildcards import CollectionWildcard 

45from lsst.pipe.base import ( 

46 AdjustQuantumHelper, 

47 ExecutionResources, 

48 Instrument, 

49 InvalidQuantumError, 

50 NoWorkFound, 

51 PipelineTask, 

52 QuantumContext, 

53 RepeatableQuantumError, 

54 TaskDef, 

55 TaskFactory, 

56) 

57 

58# During metadata transition phase, determine metadata class by 

59# asking pipe_base 

60from lsst.pipe.base.task import _TASK_FULL_METADATA_TYPE, _TASK_METADATA_TYPE 

61from lsst.utils.timer import logInfo 

62 

63# ----------------------------- 

64# Imports for other modules -- 

65# ----------------------------- 

66from .log_capture import LogCapture 

67from .quantumGraphExecutor import QuantumExecutor 

68from .reports import QuantumReport 

69 

70# ---------------------------------- 

71# Local non-exported definitions -- 

72# ---------------------------------- 

73 

74_LOG = logging.getLogger(__name__) 

75 

76 

class SingleQuantumExecutor(QuantumExecutor):
    """Executor class which runs one Quantum at a time.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler` or `None`
        Data butler; `None` means that a Quantum-backed butler should be used
        instead.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Instance of a task factory.
    skipExistingIn
        Expressions representing the collections to search for existing
        output datasets. See :ref:`daf_butler_ordered_collection_searches`
        for allowed types. This class only checks for the presence of the
        butler output run in the list of collections. If the output run is
        present in the list then the quanta whose complete outputs exist in
        the output run will be skipped. `None` or an empty string/sequence
        disables skipping.
    clobberOutputs : `bool`, optional
        If `True`, then outputs from a quantum that exist in the output run
        collection will be removed prior to executing a quantum. If
        ``skipExistingIn`` contains the output run, then only partial outputs
        from a quantum will be removed. Only used when ``butler`` is not
        `None`.
    enableLsstDebug : `bool`, optional
        Enable debugging with the ``lsstDebug`` facility for a task.
    exitOnKnownError : `bool`, optional
        If `True`, call `sys.exit` with the appropriate exit code for special
        known exceptions, after printing a traceback, instead of letting the
        exception propagate up to the caller. This is always the behavior for
        InvalidQuantumError.
    limited_butler_factory : `Callable`, optional
        A method that creates a `~lsst.daf.butler.LimitedButler` instance
        for a given Quantum. This parameter must be defined if ``butler`` is
        `None`. If ``butler`` is not `None` then this parameter is ignored.
    resources : `~lsst.pipe.base.ExecutionResources`, optional
        The resources available to this quantum when executing.
    """

    def __init__(
        self,
        butler: Butler | None,
        taskFactory: TaskFactory,
        skipExistingIn: Any = None,
        clobberOutputs: bool = False,
        enableLsstDebug: bool = False,
        exitOnKnownError: bool = False,
        limited_butler_factory: Callable[[Quantum], LimitedButler] | None = None,
        resources: ExecutionResources | None = None,
    ):
        self.butler = butler
        self.taskFactory = taskFactory
        self.enableLsstDebug = enableLsstDebug
        self.clobberOutputs = clobberOutputs
        self.exitOnKnownError = exitOnKnownError
        self.limited_butler_factory = limited_butler_factory
        self.resources = resources
        # Filled in by execute() and handed back through getReport().
        self.report: QuantumReport | None = None

        if butler is None:
            assert limited_butler_factory is not None, "limited_butler_factory is needed when butler is None"

        # Find whether output run is in skipExistingIn.
        # TODO: This duplicates logic in GraphBuilder, would be nice to have
        # better abstraction for this some day.
        self.skipExisting = False
        if butler is not None and skipExistingIn:
            wildcard = CollectionWildcard.from_expression(skipExistingIn)
            # As an optimization, consult the explicit list of names first.
            if butler.run in wildcard.strings:
                self.skipExisting = True
            else:
                # Flatten the expression via the registry and check again.
                self.skipExisting = butler.run in butler.registry.queryCollections(
                    skipExistingIn,
                    collectionTypes=CollectionType.RUN,
                )

151 

152 def execute(self, taskDef: TaskDef, quantum: Quantum) -> Quantum: 

153 # Docstring inherited from QuantumExecutor.execute 

154 assert quantum.dataId is not None, "Quantum DataId cannot be None" 

155 

156 if self.butler is not None: 

157 self.butler.registry.refresh() 

158 

159 # Catch any exception and make a report based on that. 

160 try: 

161 result = self._execute(taskDef, quantum) 

162 self.report = QuantumReport(dataId=quantum.dataId, taskLabel=taskDef.label) 

163 return result 

164 except Exception as exc: 

165 self.report = QuantumReport.from_exception( 

166 exception=exc, 

167 dataId=quantum.dataId, 

168 taskLabel=taskDef.label, 

169 ) 

170 raise 

171 

172 def _execute(self, taskDef: TaskDef, quantum: Quantum) -> Quantum: 

173 """Execute the quantum. 

174 

175 Internal implementation of `execute()`. 

176 """ 

177 startTime = time.time() 

178 

179 # Make a limited butler instance if needed (which should be QBB if full 

180 # butler is not defined). 

181 limited_butler: LimitedButler 

182 if self.butler is not None: 

183 limited_butler = self.butler 

184 else: 

185 # We check this in constructor, but mypy needs this check here. 

186 assert self.limited_butler_factory is not None 

187 limited_butler = self.limited_butler_factory(quantum) 

188 

189 if self.butler is not None: 

190 log_capture = LogCapture.from_full(self.butler) 

191 else: 

192 log_capture = LogCapture.from_limited(limited_butler) 

193 with log_capture.capture_logging(taskDef, quantum) as captureLog: 

194 # Save detailed resource usage before task start to metadata. 

195 quantumMetadata = _TASK_METADATA_TYPE() 

196 logInfo(None, "prep", metadata=quantumMetadata) # type: ignore[arg-type] 

197 

198 _LOG.info("Preparing execution of quantum for label=%s dataId=%s.", taskDef.label, quantum.dataId) 

199 

200 # check whether to skip or delete old outputs, if it returns True 

201 # or raises an exception do not try to store logs, as they may be 

202 # already in butler. 

203 captureLog.store = False 

204 if self.checkExistingOutputs(quantum, taskDef, limited_butler): 

205 _LOG.info( 

206 "Skipping already-successful quantum for label=%s dataId=%s.", 

207 taskDef.label, 

208 quantum.dataId, 

209 ) 

210 return quantum 

211 captureLog.store = True 

212 

213 try: 

214 quantum = self.updatedQuantumInputs(quantum, taskDef, limited_butler) 

215 except NoWorkFound as exc: 

216 _LOG.info( 

217 "Nothing to do for task '%s' on quantum %s; saving metadata and skipping: %s", 

218 taskDef.label, 

219 quantum.dataId, 

220 str(exc), 

221 ) 

222 # Make empty metadata that looks something like what a 

223 # do-nothing task would write (but we don't bother with empty 

224 # nested PropertySets for subtasks). This is slightly 

225 # duplicative with logic in pipe_base that we can't easily call 

226 # from here; we'll fix this on DM-29761. 

227 logInfo(None, "end", metadata=quantumMetadata) # type: ignore[arg-type] 

228 fullMetadata = _TASK_FULL_METADATA_TYPE() 

229 fullMetadata[taskDef.label] = _TASK_METADATA_TYPE() 

230 fullMetadata["quantum"] = quantumMetadata 

231 self.writeMetadata(quantum, fullMetadata, taskDef, limited_butler) 

232 return quantum 

233 

234 # enable lsstDebug debugging 

235 if self.enableLsstDebug: 

236 try: 

237 _LOG.debug("Will try to import debug.py") 

238 import debug # type: ignore # noqa:F401 

239 except ImportError: 

240 _LOG.warn("No 'debug' module found.") 

241 

242 # initialize global state 

243 self.initGlobals(quantum) 

244 

245 # Ensure that we are executing a frozen config 

246 taskDef.config.freeze() 

247 logInfo(None, "init", metadata=quantumMetadata) # type: ignore[arg-type] 

248 init_input_refs = list(quantum.initInputs.values()) 

249 

250 _LOG.info( 

251 "Constructing task and executing quantum for label=%s dataId=%s.", 

252 taskDef.label, 

253 quantum.dataId, 

254 ) 

255 task = self.taskFactory.makeTask(taskDef, limited_butler, init_input_refs) 

256 logInfo(None, "start", metadata=quantumMetadata) # type: ignore[arg-type] 

257 try: 

258 self.runQuantum(task, quantum, taskDef, limited_butler) 

259 except Exception as e: 

260 _LOG.error( 

261 "Execution of task '%s' on quantum %s failed. Exception %s: %s", 

262 taskDef.label, 

263 quantum.dataId, 

264 e.__class__.__name__, 

265 str(e), 

266 ) 

267 raise 

268 logInfo(None, "end", metadata=quantumMetadata) # type: ignore[arg-type] 

269 fullMetadata = task.getFullMetadata() 

270 fullMetadata["quantum"] = quantumMetadata 

271 self.writeMetadata(quantum, fullMetadata, taskDef, limited_butler) 

272 stopTime = time.time() 

273 _LOG.info( 

274 "Execution of task '%s' on quantum %s took %.3f seconds", 

275 taskDef.label, 

276 quantum.dataId, 

277 stopTime - startTime, 

278 ) 

279 return quantum 

280 

    def checkExistingOutputs(self, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler) -> bool:
        """Decide whether this quantum needs to be executed.

        If only partial outputs exist then they are removed if
        ``clobberOutputs`` is True, otherwise an exception is raised.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum to check for existing outputs
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler used to check for the stored metadata dataset; the full
            butler (``self.butler``) is used for all other existence checks
            and pruning.

        Returns
        -------
        exist : `bool`
            `True` if ``self.skipExisting`` is defined, and a previous
            execution of this quanta appears to have completed successfully
            (either because metadata was written or all datasets were written).
            `False` otherwise.

        Raises
        ------
        RuntimeError
            Raised if some outputs exist and some not, and clobbering is
            disabled; also raised if complete outputs exist but neither
            ``clobberOutputs`` nor ``skipExisting`` is set.
        """
        if not self.butler:
            # Skip/prune logic only works for full butler.
            return False

        if self.skipExisting:
            _LOG.debug(
                "Checking existence of metadata from previous execution of label=%s dataId=%s.",
                taskDef.label,
                quantum.dataId,
            )
            # Metadata output exists; this is sufficient to assume the previous
            # run was successful and should be skipped.
            [metadata_ref] = quantum.outputs[taskDef.metadataDatasetName]
            if metadata_ref is not None:
                if limited_butler.stored(metadata_ref):
                    return True

        # Find and prune (partial) outputs if `self.clobberOutputs` is set.
        _LOG.debug(
            "Looking for existing outputs in the way for label=%s dataId=%s.", taskDef.label, quantum.dataId
        )
        # Map every predicted output ref to whether an artifact exists for it.
        ref_dict = self.butler.stored_many(chain.from_iterable(quantum.outputs.values()))
        existingRefs = [ref for ref, exists in ref_dict.items() if exists]
        missingRefs = [ref for ref, exists in ref_dict.items() if not exists]
        if existingRefs:
            if not missingRefs:
                # Full outputs exist.
                if self.skipExisting:
                    return True
                elif self.clobberOutputs:
                    _LOG.info("Removing complete outputs for quantum %s: %s", quantum, existingRefs)
                    self.butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True)
                else:
                    raise RuntimeError(
                        f"Complete outputs exists for a quantum {quantum} "
                        "and neither clobberOutputs nor skipExisting is set: "
                        f"collection={self.butler.run} existingRefs={existingRefs}"
                    )
            else:
                # Partial outputs from a failed quantum.
                _LOG.debug(
                    "Partial outputs exist for quantum %s collection=%s existingRefs=%s missingRefs=%s",
                    quantum,
                    self.butler.run,
                    existingRefs,
                    missingRefs,
                )
                if self.clobberOutputs:
                    # only prune
                    _LOG.info("Removing partial outputs for task %s: %s", taskDef, existingRefs)
                    self.butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True)
                    return False
                else:
                    raise RuntimeError(
                        "Registry inconsistency while checking for existing quantum outputs:"
                        f" quantum={quantum} collection={self.butler.run} existingRefs={existingRefs}"
                        f" missingRefs={missingRefs}"
                    )

        # By default always execute.
        return False

368 

369 def updatedQuantumInputs( 

370 self, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler 

371 ) -> Quantum: 

372 """Update quantum with extra information, returns a new updated 

373 Quantum. 

374 

375 Some methods may require input DatasetRefs to have non-None 

376 ``dataset_id``, but in case of intermediate dataset it may not be 

377 filled during QuantumGraph construction. This method will retrieve 

378 missing info from registry. 

379 

380 Parameters 

381 ---------- 

382 quantum : `~lsst.daf.butler.Quantum` 

383 Single Quantum instance. 

384 taskDef : `~lsst.pipe.base.TaskDef` 

385 Task definition structure. 

386 

387 Returns 

388 ------- 

389 update : `~lsst.daf.butler.Quantum` 

390 Updated Quantum instance 

391 """ 

392 anyChanges = False 

393 updatedInputs: defaultdict[DatasetType, list] = defaultdict(list) 

394 for key, refsForDatasetType in quantum.inputs.items(): 

395 _LOG.debug( 

396 "Checking existence of input '%s' for label=%s dataId=%s.", 

397 key.name, 

398 taskDef.label, 

399 quantum.dataId, 

400 ) 

401 newRefsForDatasetType = updatedInputs[key] 

402 stored = limited_butler.stored_many(refsForDatasetType) 

403 for ref in refsForDatasetType: 

404 if stored[ref]: 

405 newRefsForDatasetType.append(ref) 

406 else: 

407 # This should only happen if a predicted intermediate was 

408 # not actually produced upstream, but 

409 # datastore misconfigurations can unfortunately also land 

410 # us here. 

411 _LOG.info("No dataset artifact found for %s", ref) 

412 continue 

413 if len(newRefsForDatasetType) != len(refsForDatasetType): 

414 anyChanges = True 

415 # If we removed any input datasets, let the task check if it has enough 

416 # to proceed and/or prune related datasets that it also doesn't 

417 # need/produce anymore. It will raise NoWorkFound if it can't run, 

418 # which we'll let propagate up. This is exactly what we run during QG 

419 # generation, because a task shouldn't care whether an input is missing 

420 # because some previous task didn't produce it, or because it just 

421 # wasn't there during QG generation. 

422 namedUpdatedInputs = NamedKeyDict[DatasetType, list[DatasetRef]](updatedInputs.items()) 

423 helper = AdjustQuantumHelper(namedUpdatedInputs, quantum.outputs) 

424 if anyChanges: 

425 _LOG.debug("Running adjustQuantum for label=%s dataId=%s.", taskDef.label, quantum.dataId) 

426 assert quantum.dataId is not None, "Quantum DataId cannot be None" 

427 helper.adjust_in_place(taskDef.connections, label=taskDef.label, data_id=quantum.dataId) 

428 return Quantum( 

429 taskName=quantum.taskName, 

430 taskClass=quantum.taskClass, 

431 dataId=quantum.dataId, 

432 initInputs=quantum.initInputs, 

433 inputs=helper.inputs, 

434 outputs=helper.outputs, 

435 ) 

436 

437 def runQuantum( 

438 self, task: PipelineTask, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler 

439 ) -> None: 

440 """Execute task on a single quantum. 

441 

442 Parameters 

443 ---------- 

444 task : `~lsst.pipe.base.PipelineTask` 

445 Task object. 

446 quantum : `~lsst.daf.butler.Quantum` 

447 Single Quantum instance. 

448 taskDef : `~lsst.pipe.base.TaskDef` 

449 Task definition structure. 

450 """ 

451 # Create a butler that operates in the context of a quantum 

452 butlerQC = QuantumContext(limited_butler, quantum, resources=self.resources) 

453 

454 # Get the input and output references for the task 

455 inputRefs, outputRefs = taskDef.connections.buildDatasetRefs(quantum) 

456 

457 # Call task runQuantum() method. Catch a few known failure modes and 

458 # translate them into specific 

459 try: 

460 task.runQuantum(butlerQC, inputRefs, outputRefs) 

461 except NoWorkFound as err: 

462 # Not an error, just an early exit. 

463 _LOG.info("Task '%s' on quantum %s exited early: %s", taskDef.label, quantum.dataId, str(err)) 

464 pass 

465 except RepeatableQuantumError as err: 

466 if self.exitOnKnownError: 

467 _LOG.warning("Caught repeatable quantum error for %s (%s):", taskDef, quantum.dataId) 

468 _LOG.warning(err, exc_info=True) 

469 sys.exit(err.EXIT_CODE) 

470 else: 

471 raise 

472 except InvalidQuantumError as err: 

473 _LOG.fatal("Invalid quantum error for %s (%s): %s", taskDef, quantum.dataId) 

474 _LOG.fatal(err, exc_info=True) 

475 sys.exit(err.EXIT_CODE) 

476 

477 def writeMetadata( 

478 self, quantum: Quantum, metadata: Any, taskDef: TaskDef, limited_butler: LimitedButler 

479 ) -> None: 

480 # DatasetRef has to be in the Quantum outputs, can lookup by name 

481 try: 

482 [ref] = quantum.outputs[taskDef.metadataDatasetName] 

483 except LookupError as exc: 

484 raise InvalidQuantumError( 

485 f"Quantum outputs is missing metadata dataset type {taskDef.metadataDatasetName};" 

486 " this could happen due to inconsistent options between QuantumGraph generation" 

487 " and execution" 

488 ) from exc 

489 limited_butler.put(metadata, ref) 

490 

491 def initGlobals(self, quantum: Quantum) -> None: 

492 """Initialize global state needed for task execution. 

493 

494 Parameters 

495 ---------- 

496 quantum : `~lsst.daf.butler.Quantum` 

497 Single Quantum instance. 

498 

499 Notes 

500 ----- 

501 There is an issue with initializing filters singleton which is done 

502 by instrument, to avoid requiring tasks to do it in runQuantum() 

503 we do it here when any dataId has an instrument dimension. Also for 

504 now we only allow single instrument, verify that all instrument 

505 names in all dataIds are identical. 

506 

507 This will need revision when filter singleton disappears. 

508 """ 

509 # can only work for full butler 

510 if self.butler is None: 

511 return 

512 oneInstrument = None 

513 for datasetRefs in chain(quantum.inputs.values(), quantum.outputs.values()): 

514 for datasetRef in datasetRefs: 

515 dataId = datasetRef.dataId 

516 instrument = dataId.get("instrument") 

517 if instrument is not None: 

518 if oneInstrument is not None: 

519 assert ( # type: ignore 

520 instrument == oneInstrument 

521 ), "Currently require that only one instrument is used per graph" 

522 else: 

523 oneInstrument = instrument 

524 Instrument.fromName(instrument, self.butler.registry) 

525 

526 def getReport(self) -> QuantumReport | None: 

527 # Docstring inherited from base class 

528 if self.report is None: 

529 raise RuntimeError("getReport() called before execute()") 

530 return self.report