Coverage for python/lsst/ctrl/mpexec/singleQuantumExecutor.py: 10%

191 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-14 19:56 +0000

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

# Public API of this module.
__all__ = ["SingleQuantumExecutor"]

23 

24# ------------------------------- 

25# Imports of standard modules -- 

26# ------------------------------- 

27import logging 

28import sys 

29import time 

30from collections import defaultdict 

31from collections.abc import Callable 

32from itertools import chain 

33from typing import Any 

34 

35from lsst.daf.butler import ( 

36 Butler, 

37 CollectionType, 

38 DatasetRef, 

39 DatasetType, 

40 LimitedButler, 

41 NamedKeyDict, 

42 Quantum, 

43) 

44from lsst.daf.butler.registry.wildcards import CollectionWildcard 

45from lsst.pipe.base import ( 

46 AdjustQuantumHelper, 

47 ExecutionResources, 

48 Instrument, 

49 InvalidQuantumError, 

50 NoWorkFound, 

51 PipelineTask, 

52 QuantumContext, 

53 RepeatableQuantumError, 

54 TaskDef, 

55 TaskFactory, 

56) 

57 

58# During metadata transition phase, determine metadata class by 

59# asking pipe_base 

60from lsst.pipe.base.task import _TASK_FULL_METADATA_TYPE, _TASK_METADATA_TYPE 

61from lsst.utils.timer import logInfo 

62 

63# ----------------------------- 

64# Imports for other modules -- 

65# ----------------------------- 

66from .log_capture import LogCapture 

67from .quantumGraphExecutor import QuantumExecutor 

68from .reports import QuantumReport 

69 

70# ---------------------------------- 

71# Local non-exported definitions -- 

72# ---------------------------------- 

73 

# Module-level logger used for execution progress and diagnostics.
_LOG = logging.getLogger(__name__)

75 

76 

class SingleQuantumExecutor(QuantumExecutor):
    """Executor class which runs one Quantum at a time.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler` or `None`
        Data butler, `None` means that Quantum-backed butler should be used
        instead.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Instance of a task factory.
    skipExistingIn
        Expressions representing the collections to search for existing
        output datasets. See :ref:`daf_butler_ordered_collection_searches`
        for allowed types. This class only checks for the presence of butler
        output run in the list of collections. If the output run is present
        in the list then the quanta whose complete outputs exist in the output
        run will be skipped. `None` or empty string/sequence disables skipping.
    clobberOutputs : `bool`, optional
        If `True`, then outputs from a quantum that exist in output run
        collection will be removed prior to executing a quantum. If
        ``skipExistingIn`` contains output run, then only partial outputs from
        a quantum will be removed. Only used when ``butler`` is not `None`.
    enableLsstDebug : `bool`, optional
        Enable debugging with ``lsstDebug`` facility for a task.
    exitOnKnownError : `bool`, optional
        If `True`, call `sys.exit` with the appropriate exit code for special
        known exceptions, after printing a traceback, instead of letting the
        exception propagate up to the caller. This is always the behavior for
        InvalidQuantumError.
    limited_butler_factory : `Callable`, optional
        A method that creates a `~lsst.daf.butler.LimitedButler` instance
        for a given Quantum. This parameter must be defined if ``butler`` is
        `None`. If ``butler`` is not `None` then this parameter is ignored.
    resources : `~lsst.pipe.base.ExecutionResources`, optional
        The resources available to this quantum when executing.
    """

    def __init__(
        self,
        butler: Butler | None,
        taskFactory: TaskFactory,
        skipExistingIn: Any = None,
        clobberOutputs: bool = False,
        enableLsstDebug: bool = False,
        exitOnKnownError: bool = False,
        limited_butler_factory: Callable[[Quantum], LimitedButler] | None = None,
        resources: ExecutionResources | None = None,
    ):
        self.butler = butler
        self.taskFactory = taskFactory
        self.enableLsstDebug = enableLsstDebug
        self.clobberOutputs = clobberOutputs
        self.exitOnKnownError = exitOnKnownError
        self.limited_butler_factory = limited_butler_factory
        self.report: QuantumReport | None = None
        self.resources = resources

        if self.butler is None:
            assert limited_butler_factory is not None, "limited_butler_factory is needed when butler is None"

        # Find whether output run is in skipExistingIn.
        # TODO: This duplicates logic in GraphBuilder, would be nice to have
        # better abstraction for this some day.
        self.skipExisting = False
        if self.butler is not None and skipExistingIn:
            skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
            # As optimization check in the explicit list of names first
            self.skipExisting = self.butler.run in skip_collections_wildcard.strings
            if not self.skipExisting:
                # need to flatten it and check again
                self.skipExisting = self.butler.run in self.butler.registry.queryCollections(
                    skipExistingIn,
                    collectionTypes=CollectionType.RUN,
                )

    def execute(self, taskDef: TaskDef, quantum: Quantum) -> Quantum:
        # Docstring inherited from QuantumExecutor.execute
        assert quantum.dataId is not None, "Quantum DataId cannot be None"

        if self.butler is not None:
            self.butler.registry.refresh()

        # Catch any exception and make a report based on that.
        try:
            result = self._execute(taskDef, quantum)
            self.report = QuantumReport(dataId=quantum.dataId, taskLabel=taskDef.label)
            return result
        except Exception as exc:
            self.report = QuantumReport.from_exception(
                exception=exc,
                dataId=quantum.dataId,
                taskLabel=taskDef.label,
            )
            raise

    def _execute(self, taskDef: TaskDef, quantum: Quantum) -> Quantum:
        """Execute the quantum.

        Internal implementation of `execute()`.
        """
        startTime = time.time()

        # Make a limited butler instance if needed (which should be QBB if full
        # butler is not defined).
        limited_butler: LimitedButler
        if self.butler is not None:
            limited_butler = self.butler
        else:
            # We check this in constructor, but mypy needs this check here.
            assert self.limited_butler_factory is not None
            limited_butler = self.limited_butler_factory(quantum)

        if self.butler is not None:
            log_capture = LogCapture.from_full(self.butler)
        else:
            log_capture = LogCapture.from_limited(limited_butler)
        with log_capture.capture_logging(taskDef, quantum) as captureLog:
            # Save detailed resource usage before task start to metadata.
            quantumMetadata = _TASK_METADATA_TYPE()
            logInfo(None, "prep", metadata=quantumMetadata)  # type: ignore[arg-type]

            # check whether to skip or delete old outputs, if it returns True
            # or raises an exception do not try to store logs, as they may be
            # already in butler.
            captureLog.store = False
            if self.checkExistingOutputs(quantum, taskDef, limited_butler):
                _LOG.info(
                    "Skipping already-successful quantum for label=%s dataId=%s.",
                    taskDef.label,
                    quantum.dataId,
                )
                return quantum
            captureLog.store = True

            try:
                quantum = self.updatedQuantumInputs(quantum, taskDef, limited_butler)
            except NoWorkFound as exc:
                _LOG.info(
                    "Nothing to do for task '%s' on quantum %s; saving metadata and skipping: %s",
                    taskDef.label,
                    quantum.dataId,
                    str(exc),
                )
                # Make empty metadata that looks something like what a
                # do-nothing task would write (but we don't bother with empty
                # nested PropertySets for subtasks).  This is slightly
                # duplicative with logic in pipe_base that we can't easily call
                # from here; we'll fix this on DM-29761.
                logInfo(None, "end", metadata=quantumMetadata)  # type: ignore[arg-type]
                fullMetadata = _TASK_FULL_METADATA_TYPE()
                fullMetadata[taskDef.label] = _TASK_METADATA_TYPE()
                fullMetadata["quantum"] = quantumMetadata
                self.writeMetadata(quantum, fullMetadata, taskDef, limited_butler)
                return quantum

            # enable lsstDebug debugging
            if self.enableLsstDebug:
                try:
                    _LOG.debug("Will try to import debug.py")
                    import debug  # type: ignore # noqa:F401
                except ImportError:
                    # FIX: Logger.warn is a deprecated alias of Logger.warning.
                    _LOG.warning("No 'debug' module found.")

            # initialize global state
            self.initGlobals(quantum)

            # Ensure that we are executing a frozen config
            taskDef.config.freeze()
            logInfo(None, "init", metadata=quantumMetadata)  # type: ignore[arg-type]
            init_input_refs = list(quantum.initInputs.values())
            task = self.taskFactory.makeTask(taskDef, limited_butler, init_input_refs)
            logInfo(None, "start", metadata=quantumMetadata)  # type: ignore[arg-type]
            try:
                self.runQuantum(task, quantum, taskDef, limited_butler)
            except Exception as e:
                _LOG.error(
                    "Execution of task '%s' on quantum %s failed. Exception %s: %s",
                    taskDef.label,
                    quantum.dataId,
                    e.__class__.__name__,
                    str(e),
                )
                raise
            logInfo(None, "end", metadata=quantumMetadata)  # type: ignore[arg-type]
            fullMetadata = task.getFullMetadata()
            fullMetadata["quantum"] = quantumMetadata
            self.writeMetadata(quantum, fullMetadata, taskDef, limited_butler)
            stopTime = time.time()
            _LOG.info(
                "Execution of task '%s' on quantum %s took %.3f seconds",
                taskDef.label,
                quantum.dataId,
                stopTime - startTime,
            )
        return quantum

    def checkExistingOutputs(self, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler) -> bool:
        """Decide whether this quantum needs to be executed.

        If only partial outputs exist then they are removed if
        ``clobberOutputs`` is True, otherwise an exception is raised.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum to check for existing outputs.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler used to check for stored datasets.

        Returns
        -------
        exist : `bool`
            `True` if ``self.skipExisting`` is defined, and a previous
            execution of this quanta appears to have completed successfully
            (either because metadata was written or all datasets were written).
            `False` otherwise.

        Raises
        ------
        RuntimeError
            Raised if some outputs exist and some not.
        """
        if not self.butler:
            # Skip/prune logic only works for full butler.
            return False

        if self.skipExisting:
            # Metadata output exists; this is sufficient to assume the previous
            # run was successful and should be skipped.
            [metadata_ref] = quantum.outputs[taskDef.metadataDatasetName]
            if metadata_ref is not None:
                if limited_butler.stored(metadata_ref):
                    return True

        # Find and prune (partial) outputs if `self.clobberOutputs` is set.
        ref_dict = self.butler.stored_many(chain.from_iterable(quantum.outputs.values()))
        existingRefs = [ref for ref, exists in ref_dict.items() if exists]
        missingRefs = [ref for ref, exists in ref_dict.items() if not exists]
        if existingRefs:
            if not missingRefs:
                # Full outputs exist.
                if self.skipExisting:
                    return True
                elif self.clobberOutputs:
                    _LOG.info("Removing complete outputs for quantum %s: %s", quantum, existingRefs)
                    self.butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True)
                else:
                    raise RuntimeError(
                        f"Complete outputs exists for a quantum {quantum} "
                        "and neither clobberOutputs nor skipExisting is set: "
                        f"collection={self.butler.run} existingRefs={existingRefs}"
                    )
            else:
                # Partial outputs from a failed quantum.
                _LOG.debug(
                    "Partial outputs exist for quantum %s collection=%s existingRefs=%s missingRefs=%s",
                    quantum,
                    self.butler.run,
                    existingRefs,
                    missingRefs,
                )
                if self.clobberOutputs:
                    # only prune
                    _LOG.info("Removing partial outputs for task %s: %s", taskDef, existingRefs)
                    self.butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True)
                    return False
                else:
                    raise RuntimeError(
                        "Registry inconsistency while checking for existing quantum outputs:"
                        f" quantum={quantum} collection={self.butler.run} existingRefs={existingRefs}"
                        f" missingRefs={missingRefs}"
                    )

        # By default always execute.
        return False

    def updatedQuantumInputs(
        self, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler
    ) -> Quantum:
        """Update quantum with extra information, returns a new updated
        Quantum.

        Some methods may require input DatasetRefs to have non-None
        ``dataset_id``, but in case of intermediate dataset it may not be
        filled during QuantumGraph construction. This method will retrieve
        missing info from registry.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler used to check which inputs are actually stored.

        Returns
        -------
        update : `~lsst.daf.butler.Quantum`
            Updated Quantum instance.
        """
        anyChanges = False
        updatedInputs: defaultdict[DatasetType, list] = defaultdict(list)
        for key, refsForDatasetType in quantum.inputs.items():
            newRefsForDatasetType = updatedInputs[key]
            stored = limited_butler.stored_many(refsForDatasetType)
            for ref in refsForDatasetType:
                # Inputs may already be resolved even if they do not exist, but
                # we have to re-resolve them because IDs are ignored on output.
                # Check datastore for existence first to cover calibration
                # dataset types, as they would need a timespan for findDataset.
                resolvedRef: DatasetRef | None
                if stored[ref]:
                    resolvedRef = ref
                elif self.butler is not None:
                    # This branch is for mock execution only which does not
                    # generate actual outputs, only adds datasets to registry.
                    resolvedRef = self.butler.registry.findDataset(ref.datasetType, ref.dataId)
                    if resolvedRef is None:
                        _LOG.info("No dataset found for %s", ref)
                        continue
                    else:
                        _LOG.debug("Updated dataset ID for %s", ref)
                else:
                    # QBB with missing intermediate
                    _LOG.info("No dataset found for %s", ref)
                    continue

                if (ref_stored := stored.get(resolvedRef)) or (
                    ref_stored is None and limited_butler.stored(resolvedRef)
                ):
                    # We need to ask datastore if the dataset actually exists
                    # because the Registry of a local "execution butler"
                    # cannot know this (because we prepopulate it with all of
                    # the datasets that might be created). Either we have
                    # already checked and know the answer, or the resolved
                    # ref differed from the original and we have to ask
                    # explicitly for that.
                    newRefsForDatasetType.append(resolvedRef)

            if len(newRefsForDatasetType) != len(refsForDatasetType):
                anyChanges = True
        # If we removed any input datasets, let the task check if it has enough
        # to proceed and/or prune related datasets that it also doesn't
        # need/produce anymore.  It will raise NoWorkFound if it can't run,
        # which we'll let propagate up.  This is exactly what we run during QG
        # generation, because a task shouldn't care whether an input is missing
        # because some previous task didn't produce it, or because it just
        # wasn't there during QG generation.
        namedUpdatedInputs = NamedKeyDict[DatasetType, list[DatasetRef]](updatedInputs.items())
        helper = AdjustQuantumHelper(namedUpdatedInputs, quantum.outputs)
        if anyChanges:
            assert quantum.dataId is not None, "Quantum DataId cannot be None"
            helper.adjust_in_place(taskDef.connections, label=taskDef.label, data_id=quantum.dataId)
        return Quantum(
            taskName=quantum.taskName,
            taskClass=quantum.taskClass,
            dataId=quantum.dataId,
            initInputs=quantum.initInputs,
            inputs=helper.inputs,
            outputs=helper.outputs,
        )

    def runQuantum(
        self, task: PipelineTask, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler
    ) -> None:
        """Execute task on a single quantum.

        Parameters
        ----------
        task : `~lsst.pipe.base.PipelineTask`
            Task object.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler providing dataset I/O for the task.
        """
        # Create a butler that operates in the context of a quantum
        butlerQC = QuantumContext(limited_butler, quantum, resources=self.resources)

        # Get the input and output references for the task
        inputRefs, outputRefs = taskDef.connections.buildDatasetRefs(quantum)

        # Call task runQuantum() method. Catch a few known failure modes and
        # translate them into specific
        try:
            task.runQuantum(butlerQC, inputRefs, outputRefs)
        except NoWorkFound as err:
            # Not an error, just an early exit.
            _LOG.info("Task '%s' on quantum %s exited early: %s", taskDef.label, quantum.dataId, str(err))
        except RepeatableQuantumError as err:
            if self.exitOnKnownError:
                _LOG.warning("Caught repeatable quantum error for %s (%s):", taskDef, quantum.dataId)
                _LOG.warning(err, exc_info=True)
                sys.exit(err.EXIT_CODE)
            else:
                raise
        except InvalidQuantumError as err:
            # FIX: the format string previously had three %s placeholders but
            # only two arguments, garbling this fatal-path log message; the
            # exception itself is logged with a traceback on the next line.
            _LOG.fatal("Invalid quantum error for %s (%s):", taskDef, quantum.dataId)
            _LOG.fatal(err, exc_info=True)
            sys.exit(err.EXIT_CODE)

    def writeMetadata(
        self, quantum: Quantum, metadata: Any, taskDef: TaskDef, limited_butler: LimitedButler
    ) -> None:
        """Store task metadata as a quantum output dataset.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        metadata : `Any`
            Metadata object to store.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler used to store the metadata dataset.

        Raises
        ------
        InvalidQuantumError
            Raised if the quantum outputs do not include the task's metadata
            dataset type.
        """
        # DatasetRef has to be in the Quantum outputs, can lookup by name
        try:
            [ref] = quantum.outputs[taskDef.metadataDatasetName]
        except LookupError as exc:
            raise InvalidQuantumError(
                f"Quantum outputs is missing metadata dataset type {taskDef.metadataDatasetName};"
                " this could happen due to inconsistent options between QuantumGraph generation"
                " and execution"
            ) from exc
        limited_butler.put(metadata, ref)

    def initGlobals(self, quantum: Quantum) -> None:
        """Initialize global state needed for task execution.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.

        Notes
        -----
        There is an issue with initializing filters singleton which is done
        by instrument, to avoid requiring tasks to do it in runQuantum()
        we do it here when any dataId has an instrument dimension. Also for
        now we only allow single instrument, verify that all instrument
        names in all dataIds are identical.

        This will need revision when filter singleton disappears.
        """
        # can only work for full butler
        if self.butler is None:
            return
        oneInstrument = None
        for datasetRefs in chain(quantum.inputs.values(), quantum.outputs.values()):
            for datasetRef in datasetRefs:
                dataId = datasetRef.dataId
                instrument = dataId.get("instrument")
                if instrument is not None:
                    if oneInstrument is not None:
                        assert (  # type: ignore
                            instrument == oneInstrument
                        ), "Currently require that only one instrument is used per graph"
                    else:
                        oneInstrument = instrument
                        Instrument.fromName(instrument, self.butler.registry)

    def getReport(self) -> QuantumReport | None:
        # Docstring inherited from base class
        if self.report is None:
            raise RuntimeError("getReport() called before execute()")
        return self.report