Coverage for python/lsst/ctrl/mpexec/singleQuantumExecutor.py: 10%

190 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-14 02:12 -0700

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

# Public API of this module: only the executor class is exported.
__all__ = ["SingleQuantumExecutor"]

23 

24# ------------------------------- 

25# Imports of standard modules -- 

26# ------------------------------- 

27import logging 

28import sys 

29import time 

30from collections import defaultdict 

31from collections.abc import Callable 

32from itertools import chain 

33from typing import Any 

34 

35from lsst.daf.butler import ( 

36 Butler, 

37 CollectionType, 

38 DatasetRef, 

39 DatasetType, 

40 LimitedButler, 

41 NamedKeyDict, 

42 Quantum, 

43) 

44from lsst.daf.butler.registry.wildcards import CollectionWildcard 

45from lsst.pipe.base import ( 

46 AdjustQuantumHelper, 

47 ButlerQuantumContext, 

48 Instrument, 

49 InvalidQuantumError, 

50 NoWorkFound, 

51 PipelineTask, 

52 RepeatableQuantumError, 

53 TaskDef, 

54 TaskFactory, 

55) 

56 

57# During metadata transition phase, determine metadata class by 

58# asking pipe_base 

59from lsst.pipe.base.task import _TASK_FULL_METADATA_TYPE, _TASK_METADATA_TYPE 

60from lsst.utils.timer import logInfo 

61 

62# ----------------------------- 

63# Imports for other modules -- 

64# ----------------------------- 

65from .log_capture import LogCapture 

66from .quantumGraphExecutor import QuantumExecutor 

67from .reports import QuantumReport 

68 

69# ---------------------------------- 

70# Local non-exported definitions -- 

71# ---------------------------------- 

72 

# Module-level logger; used throughout SingleQuantumExecutor for progress,
# skip/clobber decisions, and error reporting.
_LOG = logging.getLogger(__name__)

74 

75 

class SingleQuantumExecutor(QuantumExecutor):
    """Executor class which runs one Quantum at a time.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler` or `None`
        Data butler, `None` means that Quantum-backed butler should be used
        instead.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Instance of a task factory.
    skipExistingIn
        Expressions representing the collections to search for existing
        output datasets. See :ref:`daf_butler_ordered_collection_searches`
        for allowed types. This class only checks for the presence of butler
        output run in the list of collections. If the output run is present
        in the list then the quanta whose complete outputs exist in the output
        run will be skipped. `None` or empty string/sequence disables skipping.
    clobberOutputs : `bool`, optional
        If `True`, then outputs from a quantum that exist in output run
        collection will be removed prior to executing a quantum. If
        ``skipExistingIn`` contains output run, then only partial outputs from
        a quantum will be removed. Only used when ``butler`` is not `None`.
    enableLsstDebug : `bool`, optional
        Enable debugging with ``lsstDebug`` facility for a task.
    exitOnKnownError : `bool`, optional
        If `True`, call `sys.exit` with the appropriate exit code for special
        known exceptions, after printing a traceback, instead of letting the
        exception propagate up to calling. This is always the behavior for
        InvalidQuantumError.
    limited_butler_factory : `Callable`, optional
        A method that creates a `~lsst.daf.butler.LimitedButler` instance
        for a given Quantum. This parameter must be defined if ``butler`` is
        `None`. If ``butler`` is not `None` then this parameter is ignored.
    """

    def __init__(
        self,
        butler: Butler | None,
        taskFactory: TaskFactory,
        skipExistingIn: Any = None,
        clobberOutputs: bool = False,
        enableLsstDebug: bool = False,
        exitOnKnownError: bool = False,
        limited_butler_factory: Callable[[Quantum], LimitedButler] | None = None,
    ):
        self.butler = butler
        self.taskFactory = taskFactory
        self.enableLsstDebug = enableLsstDebug
        self.clobberOutputs = clobberOutputs
        self.exitOnKnownError = exitOnKnownError
        self.limited_butler_factory = limited_butler_factory
        # Report of the most recent execute() call; None until execute() runs.
        self.report: QuantumReport | None = None

        if self.butler is None:
            assert limited_butler_factory is not None, "limited_butler_factory is needed when butler is None"

        # Find whether output run is in skipExistingIn.
        # TODO: This duplicates logic in GraphBuilder, would be nice to have
        # better abstraction for this some day.
        self.skipExisting = False
        if self.butler is not None and skipExistingIn:
            skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
            # As optimization check in the explicit list of names first.
            self.skipExisting = self.butler.run in skip_collections_wildcard.strings
            if not self.skipExisting:
                # Need to flatten the expression and check again.
                self.skipExisting = self.butler.run in self.butler.registry.queryCollections(
                    skipExistingIn,
                    collectionTypes=CollectionType.RUN,
                )

    def execute(self, taskDef: TaskDef, quantum: Quantum) -> Quantum:
        # Docstring inherited from QuantumExecutor.execute
        assert quantum.dataId is not None, "Quantum DataId cannot be None"

        if self.butler is not None:
            self.butler.registry.refresh()

        # Catch any exception and make a report based on that.
        try:
            result = self._execute(taskDef, quantum)
            self.report = QuantumReport(dataId=quantum.dataId, taskLabel=taskDef.label)
            return result
        except Exception as exc:
            self.report = QuantumReport.from_exception(
                exception=exc,
                dataId=quantum.dataId,
                taskLabel=taskDef.label,
            )
            raise

    def _execute(self, taskDef: TaskDef, quantum: Quantum) -> Quantum:
        """Execute the quantum.

        Internal implementation of `execute()`.
        """
        startTime = time.time()

        # Make a limited butler instance if needed (which should be QBB if full
        # butler is not defined).
        limited_butler: LimitedButler
        if self.butler is not None:
            limited_butler = self.butler
        else:
            # We check this in constructor, but mypy needs this check here.
            assert self.limited_butler_factory is not None
            limited_butler = self.limited_butler_factory(quantum)

        if self.butler is not None:
            log_capture = LogCapture.from_full(self.butler)
        else:
            log_capture = LogCapture.from_limited(limited_butler)
        with log_capture.capture_logging(taskDef, quantum) as captureLog:
            # Save detailed resource usage before task start to metadata.
            quantumMetadata = _TASK_METADATA_TYPE()
            logInfo(None, "prep", metadata=quantumMetadata)  # type: ignore[arg-type]

            # Check whether to skip or delete old outputs; if it returns True
            # or raises an exception do not try to store logs, as they may be
            # already in butler.
            captureLog.store = False
            if self.checkExistingOutputs(quantum, taskDef, limited_butler):
                _LOG.info(
                    "Skipping already-successful quantum for label=%s dataId=%s.",
                    taskDef.label,
                    quantum.dataId,
                )
                return quantum
            captureLog.store = True

            try:
                quantum = self.updatedQuantumInputs(quantum, taskDef, limited_butler)
            except NoWorkFound as exc:
                _LOG.info(
                    "Nothing to do for task '%s' on quantum %s; saving metadata and skipping: %s",
                    taskDef.label,
                    quantum.dataId,
                    str(exc),
                )
                # Make empty metadata that looks something like what a
                # do-nothing task would write (but we don't bother with empty
                # nested PropertySets for subtasks).  This is slightly
                # duplicative with logic in pipe_base that we can't easily call
                # from here; we'll fix this on DM-29761.
                logInfo(None, "end", metadata=quantumMetadata)  # type: ignore[arg-type]
                fullMetadata = _TASK_FULL_METADATA_TYPE()
                fullMetadata[taskDef.label] = _TASK_METADATA_TYPE()
                fullMetadata["quantum"] = quantumMetadata
                self.writeMetadata(quantum, fullMetadata, taskDef, limited_butler)
                return quantum

            # Enable lsstDebug debugging.
            if self.enableLsstDebug:
                try:
                    _LOG.debug("Will try to import debug.py")
                    import debug  # type: ignore # noqa:F401
                except ImportError:
                    # Logger.warn is deprecated; use warning().
                    _LOG.warning("No 'debug' module found.")

            # Initialize global state.
            self.initGlobals(quantum)

            # Ensure that we are executing a frozen config.
            taskDef.config.freeze()
            logInfo(None, "init", metadata=quantumMetadata)  # type: ignore[arg-type]
            init_input_refs = list(quantum.initInputs.values())
            task = self.taskFactory.makeTask(taskDef, limited_butler, init_input_refs)
            logInfo(None, "start", metadata=quantumMetadata)  # type: ignore[arg-type]
            try:
                self.runQuantum(task, quantum, taskDef, limited_butler)
            except Exception as e:
                _LOG.error(
                    "Execution of task '%s' on quantum %s failed. Exception %s: %s",
                    taskDef.label,
                    quantum.dataId,
                    e.__class__.__name__,
                    str(e),
                )
                raise
            logInfo(None, "end", metadata=quantumMetadata)  # type: ignore[arg-type]
            fullMetadata = task.getFullMetadata()
            fullMetadata["quantum"] = quantumMetadata
            self.writeMetadata(quantum, fullMetadata, taskDef, limited_butler)
            stopTime = time.time()
            _LOG.info(
                "Execution of task '%s' on quantum %s took %.3f seconds",
                taskDef.label,
                quantum.dataId,
                stopTime - startTime,
            )
        return quantum

    def checkExistingOutputs(self, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler) -> bool:
        """Decide whether this quantum needs to be executed.

        If only partial outputs exist then they are removed if
        ``clobberOutputs`` is True, otherwise an exception is raised.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum to check for existing outputs.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler used to check for stored metadata datasets.

        Returns
        -------
        exist : `bool`
            `True` if ``self.skipExisting`` is defined, and a previous
            execution of this quanta appears to have completed successfully
            (either because metadata was written or all datasets were written).
            `False` otherwise.

        Raises
        ------
        RuntimeError
            Raised if some outputs exist and some not.
        """
        if not self.butler:
            # Skip/prune logic only works for full butler.
            return False

        if self.skipExisting:
            # Metadata output exists; this is sufficient to assume the previous
            # run was successful and should be skipped.
            [metadata_ref] = quantum.outputs[taskDef.metadataDatasetName]
            if metadata_ref is not None:
                if limited_butler.stored(metadata_ref):
                    return True

        # Find and prune (partial) outputs if `self.clobberOutputs` is set.
        ref_dict = self.butler.stored_many(chain.from_iterable(quantum.outputs.values()))
        existingRefs = [ref for ref, exists in ref_dict.items() if exists]
        missingRefs = [ref for ref, exists in ref_dict.items() if not exists]
        if existingRefs:
            if not missingRefs:
                # Full outputs exist.
                if self.skipExisting:
                    return True
                elif self.clobberOutputs:
                    _LOG.info("Removing complete outputs for quantum %s: %s", quantum, existingRefs)
                    self.butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True)
                else:
                    raise RuntimeError(
                        f"Complete outputs exists for a quantum {quantum} "
                        "and neither clobberOutputs nor skipExisting is set: "
                        f"collection={self.butler.run} existingRefs={existingRefs}"
                    )
            else:
                # Partial outputs from a failed quantum.
                _LOG.debug(
                    "Partial outputs exist for quantum %s collection=%s existingRefs=%s missingRefs=%s",
                    quantum,
                    self.butler.run,
                    existingRefs,
                    missingRefs,
                )
                if self.clobberOutputs:
                    # Only prune the partial outputs; quantum still executes.
                    _LOG.info("Removing partial outputs for task %s: %s", taskDef, existingRefs)
                    self.butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True)
                    return False
                else:
                    raise RuntimeError(
                        "Registry inconsistency while checking for existing quantum outputs:"
                        f" quantum={quantum} collection={self.butler.run} existingRefs={existingRefs}"
                        f" missingRefs={missingRefs}"
                    )

        # By default always execute.
        return False

    def updatedQuantumInputs(
        self, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler
    ) -> Quantum:
        """Update quantum with extra information, returns a new updated
        Quantum.

        Some methods may require input DatasetRefs to have non-None
        ``dataset_id``, but in case of intermediate dataset it may not be
        filled during QuantumGraph construction. This method will retrieve
        missing info from registry.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler used to check for stored input datasets.

        Returns
        -------
        update : `~lsst.daf.butler.Quantum`
            Updated Quantum instance.
        """
        anyChanges = False
        updatedInputs: defaultdict[DatasetType, list] = defaultdict(list)
        for key, refsForDatasetType in quantum.inputs.items():
            newRefsForDatasetType = updatedInputs[key]
            stored = limited_butler.stored_many(refsForDatasetType)
            for ref in refsForDatasetType:
                # Inputs may already be resolved even if they do not exist, but
                # we have to re-resolve them because IDs are ignored on output.
                # Check datastore for existence first to cover calibration
                # dataset types, as they would need a timespan for findDataset.
                resolvedRef: DatasetRef | None
                if stored[ref]:
                    resolvedRef = ref
                elif self.butler is not None:
                    # This branch is for mock execution only which does not
                    # generate actual outputs, only adds datasets to registry.
                    resolvedRef = self.butler.registry.findDataset(ref.datasetType, ref.dataId)
                    if resolvedRef is None:
                        _LOG.info("No dataset found for %s", ref)
                        continue
                    else:
                        _LOG.debug("Updated dataset ID for %s", ref)
                else:
                    # QBB with missing intermediate.
                    _LOG.info("No dataset found for %s", ref)
                    continue

                if (ref_stored := stored.get(resolvedRef)) or (
                    ref_stored is None and limited_butler.stored(resolvedRef)
                ):
                    # We need to ask datastore if the dataset actually exists
                    # because the Registry of a local "execution butler"
                    # cannot know this (because we prepopulate it with all of
                    # the datasets that might be created). Either we have
                    # already checked and know the answer, or the resolved
                    # ref differed from the original and we have to ask
                    # explicitly for that.
                    newRefsForDatasetType.append(resolvedRef)

            if len(newRefsForDatasetType) != len(refsForDatasetType):
                anyChanges = True
        # If we removed any input datasets, let the task check if it has enough
        # to proceed and/or prune related datasets that it also doesn't
        # need/produce anymore.  It will raise NoWorkFound if it can't run,
        # which we'll let propagate up.  This is exactly what we run during QG
        # generation, because a task shouldn't care whether an input is missing
        # because some previous task didn't produce it, or because it just
        # wasn't there during QG generation.
        namedUpdatedInputs = NamedKeyDict[DatasetType, list[DatasetRef]](updatedInputs.items())
        helper = AdjustQuantumHelper(namedUpdatedInputs, quantum.outputs)
        if anyChanges:
            assert quantum.dataId is not None, "Quantum DataId cannot be None"
            helper.adjust_in_place(taskDef.connections, label=taskDef.label, data_id=quantum.dataId)
        return Quantum(
            taskName=quantum.taskName,
            taskClass=quantum.taskClass,
            dataId=quantum.dataId,
            initInputs=quantum.initInputs,
            inputs=helper.inputs,
            outputs=helper.outputs,
        )

    def runQuantum(
        self, task: PipelineTask, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler
    ) -> None:
        """Execute task on a single quantum.

        Parameters
        ----------
        task : `~lsst.pipe.base.PipelineTask`
            Task object.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler backing the quantum context the task runs against.
        """
        # Create a butler that operates in the context of a quantum.
        butlerQC = ButlerQuantumContext(limited_butler, quantum)

        # Get the input and output references for the task.
        inputRefs, outputRefs = taskDef.connections.buildDatasetRefs(quantum)

        # Call task runQuantum() method.  Catch a few known failure modes and
        # translate them into specific exit behavior.
        try:
            task.runQuantum(butlerQC, inputRefs, outputRefs)
        except NoWorkFound as err:
            # Not an error, just an early exit.
            _LOG.info("Task '%s' on quantum %s exited early: %s", taskDef.label, quantum.dataId, str(err))
        except RepeatableQuantumError as err:
            if self.exitOnKnownError:
                _LOG.warning("Caught repeatable quantum error for %s (%s):", taskDef, quantum.dataId)
                _LOG.warning(err, exc_info=True)
                sys.exit(err.EXIT_CODE)
            else:
                raise
        except InvalidQuantumError as err:
            # Note: format string must match the argument count; the exception
            # itself is logged separately with its traceback below.
            _LOG.fatal("Invalid quantum error for %s (%s):", taskDef, quantum.dataId)
            _LOG.fatal(err, exc_info=True)
            sys.exit(err.EXIT_CODE)

    def writeMetadata(
        self, quantum: Quantum, metadata: Any, taskDef: TaskDef, limited_butler: LimitedButler
    ) -> None:
        """Store task metadata in the butler.

        The metadata DatasetRef has to be in the Quantum outputs; it is looked
        up by the task's metadata dataset type name.
        """
        try:
            [ref] = quantum.outputs[taskDef.metadataDatasetName]
        except LookupError as exc:
            raise InvalidQuantumError(
                f"Quantum outputs is missing metadata dataset type {taskDef.metadataDatasetName};"
                " this could happen due to inconsistent options between QuantumGraph generation"
                " and execution"
            ) from exc
        limited_butler.put(metadata, ref)

    def initGlobals(self, quantum: Quantum) -> None:
        """Initialize global state needed for task execution.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.

        Notes
        -----
        There is an issue with initializing filters singleton which is done
        by instrument, to avoid requiring tasks to do it in runQuantum()
        we do it here when any dataId has an instrument dimension. Also for
        now we only allow single instrument, verify that all instrument
        names in all dataIds are identical.

        This will need revision when filter singleton disappears.
        """
        # Can only work for full butler.
        if self.butler is None:
            return
        oneInstrument = None
        for datasetRefs in chain(quantum.inputs.values(), quantum.outputs.values()):
            for datasetRef in datasetRefs:
                dataId = datasetRef.dataId
                instrument = dataId.get("instrument")
                if instrument is not None:
                    if oneInstrument is not None:
                        assert (  # type: ignore
                            instrument == oneInstrument
                        ), "Currently require that only one instrument is used per graph"
                    else:
                        oneInstrument = instrument
                        Instrument.fromName(instrument, self.butler.registry)

    def getReport(self) -> QuantumReport | None:
        # Docstring inherited from base class
        if self.report is None:
            raise RuntimeError("getReport() called before execute()")
        return self.report