Coverage for python/lsst/ctrl/mpexec/singleQuantumExecutor.py: 10%

190 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-09 02:48 -0700

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22__all__ = ["SingleQuantumExecutor"] 

23 

24# ------------------------------- 

25# Imports of standard modules -- 

26# ------------------------------- 

27import logging 

28import sys 

29import time 

30from collections import defaultdict 

31from collections.abc import Callable 

32from itertools import chain 

33from typing import Any, Optional 

34 

35from lsst.daf.butler import ( 

36 Butler, 

37 CollectionType, 

38 DatasetRef, 

39 DatasetType, 

40 LimitedButler, 

41 NamedKeyDict, 

42 Quantum, 

43) 

44from lsst.daf.butler.registry.wildcards import CollectionWildcard 

45from lsst.pipe.base import ( 

46 AdjustQuantumHelper, 

47 ButlerQuantumContext, 

48 Instrument, 

49 InvalidQuantumError, 

50 NoWorkFound, 

51 PipelineTask, 

52 RepeatableQuantumError, 

53 TaskDef, 

54 TaskFactory, 

55) 

56 

57# During metadata transition phase, determine metadata class by 

58# asking pipe_base 

59from lsst.pipe.base.task import _TASK_FULL_METADATA_TYPE, _TASK_METADATA_TYPE 

60from lsst.utils.timer import logInfo 

61 

62# ----------------------------- 

63# Imports for other modules -- 

64# ----------------------------- 

65from .log_capture import LogCapture 

66from .quantumGraphExecutor import QuantumExecutor 

67from .reports import QuantumReport 

68 

69# ---------------------------------- 

70# Local non-exported definitions -- 

71# ---------------------------------- 

72 

73_LOG = logging.getLogger(__name__) 

74 

75 

class SingleQuantumExecutor(QuantumExecutor):
    """Executor class which runs one Quantum at a time.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler` or `None`
        Data butler, `None` means that Quantum-backed butler should be used
        instead (a ``limited_butler_factory`` must then be provided).
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Instance of a task factory used to construct the task for each
        quantum.
    skipExistingIn
        Expressions representing the collections to search for existing
        output datasets. See :ref:`daf_butler_ordered_collection_searches`
        for allowed types. This class only checks for the presence of butler
        output run in the list of collections. If the output run is present
        in the list then the quanta whose complete outputs exist in the output
        run will be skipped. `None` or empty string/sequence disables
        skipping.
    clobberOutputs : `bool`, optional
        If `True`, then outputs from a quantum that exist in output run
        collection will be removed prior to executing a quantum. If
        ``skipExistingIn`` contains output run, then only partial outputs from
        a quantum will be removed. Only used when ``butler`` is not `None`.
    enableLsstDebug : `bool`, optional
        Enable debugging with ``lsstDebug`` facility for a task.
    exitOnKnownError : `bool`, optional
        If `True`, call `sys.exit` with the appropriate exit code for special
        known exceptions, after printing a traceback, instead of letting the
        exception propagate up to calling. This is always the behavior for
        InvalidQuantumError.
    limited_butler_factory : `Callable`, optional
        A method that creates a `~lsst.daf.butler.LimitedButler` instance
        for a given Quantum. This parameter must be defined if ``butler`` is
        `None`. If ``butler`` is not `None` then this parameter is ignored.
    """

111 def __init__( 

112 self, 

113 butler: Butler | None, 

114 taskFactory: TaskFactory, 

115 skipExistingIn: Any = None, 

116 clobberOutputs: bool = False, 

117 enableLsstDebug: bool = False, 

118 exitOnKnownError: bool = False, 

119 limited_butler_factory: Callable[[Quantum], LimitedButler] | None = None, 

120 ): 

121 self.butler = butler 

122 self.taskFactory = taskFactory 

123 self.enableLsstDebug = enableLsstDebug 

124 self.clobberOutputs = clobberOutputs 

125 self.exitOnKnownError = exitOnKnownError 

126 self.limited_butler_factory = limited_butler_factory 

127 self.report: Optional[QuantumReport] = None 

128 

129 if self.butler is None: 

130 assert limited_butler_factory is not None, "limited_butler_factory is needed when butler is None" 

131 

132 # Find whether output run is in skipExistingIn. 

133 # TODO: This duplicates logic in GraphBuilder, would be nice to have 

134 # better abstraction for this some day. 

135 self.skipExisting = False 

136 if self.butler is not None and skipExistingIn: 

137 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn) 

138 # As optimization check in the explicit list of names first 

139 self.skipExisting = self.butler.run in skip_collections_wildcard.strings 

140 if not self.skipExisting: 

141 # need to flatten it and check again 

142 self.skipExisting = self.butler.run in self.butler.registry.queryCollections( 

143 skipExistingIn, 

144 collectionTypes=CollectionType.RUN, 

145 ) 

146 

147 def execute(self, taskDef: TaskDef, quantum: Quantum) -> Quantum: 

148 # Docstring inherited from QuantumExecutor.execute 

149 assert quantum.dataId is not None, "Quantum DataId cannot be None" 

150 

151 if self.butler is not None: 

152 self.butler.registry.refresh() 

153 

154 # Catch any exception and make a report based on that. 

155 try: 

156 result = self._execute(taskDef, quantum) 

157 self.report = QuantumReport(dataId=quantum.dataId, taskLabel=taskDef.label) 

158 return result 

159 except Exception as exc: 

160 self.report = QuantumReport.from_exception( 

161 exception=exc, 

162 dataId=quantum.dataId, 

163 taskLabel=taskDef.label, 

164 ) 

165 raise 

166 

167 def _execute(self, taskDef: TaskDef, quantum: Quantum) -> Quantum: 

168 """Internal implementation of execute()""" 

169 startTime = time.time() 

170 

171 # Make a limited butler instance if needed (which should be QBB if full 

172 # butler is not defined). 

173 limited_butler: LimitedButler 

174 if self.butler is not None: 

175 limited_butler = self.butler 

176 else: 

177 # We check this in constructor, but mypy needs this check here. 

178 assert self.limited_butler_factory is not None 

179 limited_butler = self.limited_butler_factory(quantum) 

180 

181 if self.butler is not None: 

182 log_capture = LogCapture.from_full(self.butler) 

183 else: 

184 log_capture = LogCapture.from_limited(limited_butler) 

185 with log_capture.capture_logging(taskDef, quantum) as captureLog: 

186 # Save detailed resource usage before task start to metadata. 

187 quantumMetadata = _TASK_METADATA_TYPE() 

188 logInfo(None, "prep", metadata=quantumMetadata) # type: ignore[arg-type] 

189 

190 # check whether to skip or delete old outputs, if it returns True 

191 # or raises an exception do not try to store logs, as they may be 

192 # already in butler. 

193 captureLog.store = False 

194 if self.checkExistingOutputs(quantum, taskDef, limited_butler): 

195 _LOG.info( 

196 "Skipping already-successful quantum for label=%s dataId=%s.", 

197 taskDef.label, 

198 quantum.dataId, 

199 ) 

200 return quantum 

201 captureLog.store = True 

202 

203 try: 

204 quantum = self.updatedQuantumInputs(quantum, taskDef, limited_butler) 

205 except NoWorkFound as exc: 

206 _LOG.info( 

207 "Nothing to do for task '%s' on quantum %s; saving metadata and skipping: %s", 

208 taskDef.label, 

209 quantum.dataId, 

210 str(exc), 

211 ) 

212 # Make empty metadata that looks something like what a 

213 # do-nothing task would write (but we don't bother with empty 

214 # nested PropertySets for subtasks). This is slightly 

215 # duplicative with logic in pipe_base that we can't easily call 

216 # from here; we'll fix this on DM-29761. 

217 logInfo(None, "end", metadata=quantumMetadata) # type: ignore[arg-type] 

218 fullMetadata = _TASK_FULL_METADATA_TYPE() 

219 fullMetadata[taskDef.label] = _TASK_METADATA_TYPE() 

220 fullMetadata["quantum"] = quantumMetadata 

221 self.writeMetadata(quantum, fullMetadata, taskDef, limited_butler) 

222 return quantum 

223 

224 # enable lsstDebug debugging 

225 if self.enableLsstDebug: 

226 try: 

227 _LOG.debug("Will try to import debug.py") 

228 import debug # type: ignore # noqa:F401 

229 except ImportError: 

230 _LOG.warn("No 'debug' module found.") 

231 

232 # initialize global state 

233 self.initGlobals(quantum) 

234 

235 # Ensure that we are executing a frozen config 

236 taskDef.config.freeze() 

237 logInfo(None, "init", metadata=quantumMetadata) # type: ignore[arg-type] 

238 init_input_refs = list(quantum.initInputs.values()) 

239 task = self.taskFactory.makeTask(taskDef, limited_butler, init_input_refs) 

240 logInfo(None, "start", metadata=quantumMetadata) # type: ignore[arg-type] 

241 try: 

242 self.runQuantum(task, quantum, taskDef, limited_butler) 

243 except Exception as e: 

244 _LOG.error( 

245 "Execution of task '%s' on quantum %s failed. Exception %s: %s", 

246 taskDef.label, 

247 quantum.dataId, 

248 e.__class__.__name__, 

249 str(e), 

250 ) 

251 raise 

252 logInfo(None, "end", metadata=quantumMetadata) # type: ignore[arg-type] 

253 fullMetadata = task.getFullMetadata() 

254 fullMetadata["quantum"] = quantumMetadata 

255 self.writeMetadata(quantum, fullMetadata, taskDef, limited_butler) 

256 stopTime = time.time() 

257 _LOG.info( 

258 "Execution of task '%s' on quantum %s took %.3f seconds", 

259 taskDef.label, 

260 quantum.dataId, 

261 stopTime - startTime, 

262 ) 

263 return quantum 

264 

    def checkExistingOutputs(self, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler) -> bool:
        """Decide whether this quantum needs to be executed.

        If only partial outputs exist then they are removed if
        ``clobberOutputs`` is True, otherwise an exception is raised.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum to check for existing outputs.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler used to check for the stored metadata dataset.

        Returns
        -------
        exist : `bool`
            `True` if ``self.skipExisting`` is defined, and a previous
            execution of this quanta appears to have completed successfully
            (either because metadata was written or all datasets were written).
            `False` otherwise.

        Raises
        ------
        RuntimeError
            Raised if some outputs exist and some not (and clobbering is
            disabled), or if complete outputs exist but neither skipping nor
            clobbering is enabled.
        """
        if not self.butler:
            # Skip/prune logic only works for full butler.
            return False

        if self.skipExisting:
            # Metadata output exists; this is sufficient to assume the previous
            # run was successful and should be skipped.
            [metadata_ref] = quantum.outputs[taskDef.metadataDatasetName]
            # NOTE(review): list unpacking above can never produce None, so
            # this check looks redundant — confirm before removing.
            if metadata_ref is not None:
                if limited_butler.stored(metadata_ref):
                    return True

        # Find and prune (partial) outputs if `self.clobberOutputs` is set.
        # One bulk existence query covering every predicted output ref.
        ref_dict = self.butler.stored_many(chain.from_iterable(quantum.outputs.values()))
        existingRefs = [ref for ref, exists in ref_dict.items() if exists]
        missingRefs = [ref for ref, exists in ref_dict.items() if not exists]
        if existingRefs:
            if not missingRefs:
                # Full outputs exist.
                if self.skipExisting:
                    return True
                elif self.clobberOutputs:
                    _LOG.info("Removing complete outputs for quantum %s: %s", quantum, existingRefs)
                    self.butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True)
                else:
                    raise RuntimeError(
                        f"Complete outputs exists for a quantum {quantum} "
                        "and neither clobberOutputs nor skipExisting is set: "
                        f"collection={self.butler.run} existingRefs={existingRefs}"
                    )
            else:
                # Partial outputs from a failed quantum.
                _LOG.debug(
                    "Partial outputs exist for quantum %s collection=%s existingRefs=%s missingRefs=%s",
                    quantum,
                    self.butler.run,
                    existingRefs,
                    missingRefs,
                )
                if self.clobberOutputs:
                    # only prune
                    _LOG.info("Removing partial outputs for task %s: %s", taskDef, existingRefs)
                    self.butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True)
                    return False
                else:
                    raise RuntimeError(
                        "Registry inconsistency while checking for existing quantum outputs:"
                        f" quantum={quantum} collection={self.butler.run} existingRefs={existingRefs}"
                        f" missingRefs={missingRefs}"
                    )

        # By default always execute.
        return False

    def updatedQuantumInputs(
        self, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler
    ) -> Quantum:
        """Update quantum with extra information, returns a new updated
        Quantum.

        Some methods may require input DatasetRefs to have non-None
        ``dataset_id``, but in case of intermediate dataset it may not be
        filled during QuantumGraph construction. This method will retrieve
        missing info from registry.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler used to check datastore existence of the input refs.

        Returns
        -------
        update : `~lsst.daf.butler.Quantum`
            Updated Quantum instance.

        Notes
        -----
        If any inputs are found to be missing, the task's
        ``adjust_in_place`` hook runs and may raise `NoWorkFound`, which is
        allowed to propagate to the caller.
        """
        anyChanges = False
        updatedInputs: defaultdict[DatasetType, list] = defaultdict(list)
        for key, refsForDatasetType in quantum.inputs.items():
            newRefsForDatasetType = updatedInputs[key]
            # Bulk existence check: one datastore query per dataset type.
            stored = limited_butler.stored_many(refsForDatasetType)
            for ref in refsForDatasetType:
                # Inputs may already be resolved even if they do not exist, but
                # we have to re-resolve them because IDs are ignored on output.
                # Check datastore for existence first to cover calibration
                # dataset types, as they would need a timespan for findDataset.
                resolvedRef: DatasetRef | None
                if stored[ref]:
                    resolvedRef = ref
                elif self.butler is not None:
                    # This branch is for mock execution only which does not
                    # generate actual outputs, only adds datasets to registry.
                    resolvedRef = self.butler.registry.findDataset(ref.datasetType, ref.dataId)
                    if resolvedRef is None:
                        _LOG.info("No dataset found for %s", ref)
                        continue
                    else:
                        _LOG.debug("Updated dataset ID for %s", ref)
                else:
                    # QBB with missing intermediate: nothing to re-resolve
                    # against, so the input is simply dropped.
                    _LOG.info("No dataset found for %s", ref)
                    continue

                if (ref_stored := stored.get(resolvedRef)) or (
                    ref_stored is None and limited_butler.stored(resolvedRef)
                ):
                    # We need to ask datastore if the dataset actually exists
                    # because the Registry of a local "execution butler"
                    # cannot know this (because we prepopulate it with all of
                    # the datasets that might be created). Either we have
                    # already checked and know the answer, or the resolved
                    # ref differed from the original and we have to ask
                    # explicitly for that.
                    newRefsForDatasetType.append(resolvedRef)

            if len(newRefsForDatasetType) != len(refsForDatasetType):
                anyChanges = True
        # If we removed any input datasets, let the task check if it has enough
        # to proceed and/or prune related datasets that it also doesn't
        # need/produce anymore. It will raise NoWorkFound if it can't run,
        # which we'll let propagate up. This is exactly what we run during QG
        # generation, because a task shouldn't care whether an input is missing
        # because some previous task didn't produce it, or because it just
        # wasn't there during QG generation.
        namedUpdatedInputs = NamedKeyDict[DatasetType, list[DatasetRef]](updatedInputs.items())
        helper = AdjustQuantumHelper(namedUpdatedInputs, quantum.outputs)
        if anyChanges:
            assert quantum.dataId is not None, "Quantum DataId cannot be None"
            helper.adjust_in_place(taskDef.connections, label=taskDef.label, data_id=quantum.dataId)
        return Quantum(
            taskName=quantum.taskName,
            taskClass=quantum.taskClass,
            dataId=quantum.dataId,
            initInputs=quantum.initInputs,
            inputs=helper.inputs,
            outputs=helper.outputs,
        )

430 def runQuantum( 

431 self, task: PipelineTask, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler 

432 ) -> None: 

433 """Execute task on a single quantum. 

434 

435 Parameters 

436 ---------- 

437 task : `~lsst.pipe.base.PipelineTask` 

438 Task object. 

439 quantum : `~lsst.daf.butler.Quantum` 

440 Single Quantum instance. 

441 taskDef : `~lsst.pipe.base.TaskDef` 

442 Task definition structure. 

443 """ 

444 # Create a butler that operates in the context of a quantum 

445 butlerQC = ButlerQuantumContext(limited_butler, quantum) 

446 

447 # Get the input and output references for the task 

448 inputRefs, outputRefs = taskDef.connections.buildDatasetRefs(quantum) 

449 

450 # Call task runQuantum() method. Catch a few known failure modes and 

451 # translate them into specific 

452 try: 

453 task.runQuantum(butlerQC, inputRefs, outputRefs) 

454 except NoWorkFound as err: 

455 # Not an error, just an early exit. 

456 _LOG.info("Task '%s' on quantum %s exited early: %s", taskDef.label, quantum.dataId, str(err)) 

457 pass 

458 except RepeatableQuantumError as err: 

459 if self.exitOnKnownError: 

460 _LOG.warning("Caught repeatable quantum error for %s (%s):", taskDef, quantum.dataId) 

461 _LOG.warning(err, exc_info=True) 

462 sys.exit(err.EXIT_CODE) 

463 else: 

464 raise 

465 except InvalidQuantumError as err: 

466 _LOG.fatal("Invalid quantum error for %s (%s): %s", taskDef, quantum.dataId) 

467 _LOG.fatal(err, exc_info=True) 

468 sys.exit(err.EXIT_CODE) 

469 

470 def writeMetadata( 

471 self, quantum: Quantum, metadata: Any, taskDef: TaskDef, limited_butler: LimitedButler 

472 ) -> None: 

473 # DatasetRef has to be in the Quantum outputs, can lookup by name 

474 try: 

475 [ref] = quantum.outputs[taskDef.metadataDatasetName] 

476 except LookupError as exc: 

477 raise InvalidQuantumError( 

478 f"Quantum outputs is missing metadata dataset type {taskDef.metadataDatasetName};" 

479 " this could happen due to inconsistent options between QuantumGraph generation" 

480 " and execution" 

481 ) from exc 

482 limited_butler.put(metadata, ref) 

483 

484 def initGlobals(self, quantum: Quantum) -> None: 

485 """Initialize global state needed for task execution. 

486 

487 Parameters 

488 ---------- 

489 quantum : `~lsst.daf.butler.Quantum` 

490 Single Quantum instance. 

491 

492 Notes 

493 ----- 

494 There is an issue with initializing filters singleton which is done 

495 by instrument, to avoid requiring tasks to do it in runQuantum() 

496 we do it here when any dataId has an instrument dimension. Also for 

497 now we only allow single instrument, verify that all instrument 

498 names in all dataIds are identical. 

499 

500 This will need revision when filter singleton disappears. 

501 """ 

502 # can only work for full butler 

503 if self.butler is None: 

504 return 

505 oneInstrument = None 

506 for datasetRefs in chain(quantum.inputs.values(), quantum.outputs.values()): 

507 for datasetRef in datasetRefs: 

508 dataId = datasetRef.dataId 

509 instrument = dataId.get("instrument") 

510 if instrument is not None: 

511 if oneInstrument is not None: 

512 assert ( # type: ignore 

513 instrument == oneInstrument 

514 ), "Currently require that only one instrument is used per graph" 

515 else: 

516 oneInstrument = instrument 

517 Instrument.fromName(instrument, self.butler.registry) 

518 

519 def getReport(self) -> Optional[QuantumReport]: 

520 # Docstring inherited from base class 

521 if self.report is None: 

522 raise RuntimeError("getReport() called before execute()") 

523 return self.report