Coverage for python/lsst/ctrl/mpexec/singleQuantumExecutor.py: 13%

# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ["SingleQuantumExecutor"]

# -------------------------------
# Imports of standard modules --
# -------------------------------
import logging
import os
import shutil
import sys
import tempfile
import time
from collections import defaultdict
from contextlib import contextmanager
from itertools import chain
from logging import FileHandler
from typing import List

from lsst.daf.butler import DatasetRef, DatasetType, FileDataset, NamedKeyDict, Quantum
from lsst.daf.butler.core.logging import ButlerLogRecordHandler, ButlerLogRecords, ButlerMDC, JsonLogFormatter
from lsst.obs.base import Instrument
from lsst.pipe.base import (
    AdjustQuantumHelper,
    ButlerQuantumContext,
    InvalidQuantumError,
    NoWorkFound,
    RepeatableQuantumError,
)

# During metadata transition phase, determine metadata class by
# asking pipe_base.
from lsst.pipe.base.task import _TASK_FULL_METADATA_TYPE, _TASK_METADATA_TYPE
from lsst.utils.timer import logInfo

# -----------------------------
# Imports for other modules --
# -----------------------------
from .quantumGraphExecutor import QuantumExecutor

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__)


class _LogCaptureFlag:
    """Simple flag to enable/disable log-to-butler saving."""

    store: bool = True
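
# Usage sketch (illustrative only, not additional API): ``captureLogging``
# below yields a ``_LogCaptureFlag`` so the caller can veto storing logs in
# the butler, exactly as ``SingleQuantumExecutor.execute`` does while
# checking for pre-existing outputs:
#
#     with self.captureLogging(taskDef, quantum, butler) as captureLog:
#         captureLog.store = False  # logs may already be in the butler
#         ...
#         captureLog.store = True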



class SingleQuantumExecutor(QuantumExecutor):
    """Executor class which runs one Quantum at a time.

    Parameters
    ----------
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Instance of a task factory.
    skipExistingIn : `list` [ `str` ], optional
        Accepts a list of collections; if all Quantum outputs already exist
        in the specified list of collections then that Quantum will not be
        rerun.
    clobberOutputs : `bool`, optional
        If `True`, then existing outputs in the output run collection will be
        overwritten. If ``skipExistingIn`` is defined, only outputs from
        failed quanta will be overwritten.
    enableLsstDebug : `bool`, optional
        Enable debugging with the ``lsstDebug`` facility for a task.
    exitOnKnownError : `bool`, optional
        If `True`, call `sys.exit` with the appropriate exit code for special
        known exceptions, after printing a traceback, instead of letting the
        exception propagate up to the caller. This is always the behavior for
        `InvalidQuantumError`.
    """

    stream_json_logs = True
    """If `True` each log record is written to a temporary file and ingested
    when the quantum completes. If `False` the records are accumulated in
    memory and stored in butler on quantum completion."""
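
    # Minimal usage sketch (``taskFactory``, ``taskDef``, ``quantum`` and
    # ``butler`` are assumed to be supplied by the caller, e.g. a
    # QuantumGraph-level executor; names are illustrative only):
    #
    #     executor = SingleQuantumExecutor(taskFactory, skipExistingIn=["u/user/run"])
    #     executor.execute(taskDef, quantum, butler)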


    def __init__(
        self,
        taskFactory,
        skipExistingIn=None,
        clobberOutputs=False,
        enableLsstDebug=False,
        exitOnKnownError=False,
    ):
        self.taskFactory = taskFactory
        self.skipExistingIn = skipExistingIn
        self.enableLsstDebug = enableLsstDebug
        self.clobberOutputs = clobberOutputs
        self.exitOnKnownError = exitOnKnownError
        self.log_handler = None

    def execute(self, taskDef, quantum, butler):
        # Docstring inherited from QuantumExecutor.execute.
        startTime = time.time()

        with self.captureLogging(taskDef, quantum, butler) as captureLog:

            # Save detailed resource usage before task start to metadata.
            quantumMetadata = _TASK_METADATA_TYPE()
            logInfo(None, "prep", metadata=quantumMetadata)

            taskClass, label, config = taskDef.taskClass, taskDef.label, taskDef.config

            # Check whether to skip or delete old outputs. If this returns
            # `True` or raises an exception, do not try to store logs, as
            # they may already be in the butler.
            captureLog.store = False
            if self.checkExistingOutputs(quantum, butler, taskDef):
                _LOG.info(
                    "Skipping already-successful quantum for label=%s dataId=%s.", label, quantum.dataId
                )
                return
            captureLog.store = True

            try:
                quantum = self.updatedQuantumInputs(quantum, butler, taskDef)
            except NoWorkFound as exc:
                _LOG.info(
                    "Nothing to do for task '%s' on quantum %s; saving metadata and skipping: %s",
                    taskDef.label,
                    quantum.dataId,
                    str(exc),
                )
                # Make empty metadata that looks something like what a
                # do-nothing task would write (but we don't bother with
                # empty nested PropertySets for subtasks). This is slightly
                # duplicative with logic in pipe_base that we can't easily
                # call from here; we'll fix this on DM-29761.
                logInfo(None, "end", metadata=quantumMetadata)
                fullMetadata = _TASK_FULL_METADATA_TYPE()
                fullMetadata[taskDef.label] = _TASK_METADATA_TYPE()
                fullMetadata["quantum"] = quantumMetadata
                self.writeMetadata(quantum, fullMetadata, taskDef, butler)
                return

            # Enable lsstDebug debugging.
            if self.enableLsstDebug:
                try:
                    _LOG.debug("Will try to import debug.py")
                    import debug  # noqa:F401
                except ImportError:
                    _LOG.warning("No 'debug' module found.")

            # Initialize global state.
            self.initGlobals(quantum, butler)

            # Ensure that we are executing a frozen config.
            config.freeze()
            logInfo(None, "init", metadata=quantumMetadata)
            task = self.makeTask(taskClass, label, config, butler)
            logInfo(None, "start", metadata=quantumMetadata)
            try:
                self.runQuantum(task, quantum, taskDef, butler)
            except Exception as e:
                _LOG.exception(
                    "Execution of task '%s' on quantum %s failed. Exception %s: %s",
                    taskDef.label,
                    quantum.dataId,
                    e.__class__.__name__,
                    str(e),
                )
                raise
            logInfo(None, "end", metadata=quantumMetadata)
            fullMetadata = task.getFullMetadata()
            fullMetadata["quantum"] = quantumMetadata
            self.writeMetadata(quantum, fullMetadata, taskDef, butler)
            stopTime = time.time()
            _LOG.info(
                "Execution of task '%s' on quantum %s took %.3f seconds",
                taskDef.label,
                quantum.dataId,
                stopTime - startTime,
            )
        return quantum


    @contextmanager
    def captureLogging(self, taskDef, quantum, butler):
        """Configure logging system to capture logs for execution of this task.

        Parameters
        ----------
        taskDef : `lsst.pipe.base.TaskDef`
            The task definition.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Butler to write logs to.

        Notes
        -----
        Expected to be used as a context manager to ensure that logging
        records are inserted into the butler once the quantum has been
        executed:

        .. code-block:: py

            with self.captureLogging(taskDef, quantum, butler):
                # Run quantum and capture logs.

        This method can also set up logging to attach task- or
        quantum-specific information to log messages. Potentially this can
        take into account some info from task configuration as well.
        """
        # Add a handler to the root logger to capture execution log output.
        # How does it get removed reliably?
        tmpdir = None
        if taskDef.logOutputDatasetName is not None:
            # Either accumulate into ButlerLogRecords or stream
            # JSON records to file and ingest that.
            if self.stream_json_logs:
                # Create the log file in a temporary directory rather than
                # creating a temporary file. This is necessary because
                # temporary files are created with restrictive permissions
                # and during file ingest these permissions persist in the
                # datastore. Using a temp directory allows us to create
                # a file with umask default permissions.
                tmpdir = tempfile.mkdtemp(prefix="butler-temp-logs-")

                # Construct a file to receive the log records and "touch" it.
                log_file = os.path.join(tmpdir, f"butler-log-{taskDef.label}.json")
                with open(log_file, "w"):
                    pass
                self.log_handler = FileHandler(log_file)
                self.log_handler.setFormatter(JsonLogFormatter())
            else:
                self.log_handler = ButlerLogRecordHandler()

            logging.getLogger().addHandler(self.log_handler)

        # Include the quantum dataId and task label in the MDC.
        label = taskDef.label
        if quantum.dataId:
            label += f":{quantum.dataId}"

        ctx = _LogCaptureFlag()
        try:
            with ButlerMDC.set_mdc({"LABEL": label, "RUN": butler.run}):
                yield ctx
        finally:
            # Ensure that the logs are stored in butler.
            self.writeLogRecords(quantum, taskDef, butler, ctx.store)
            if tmpdir:
                shutil.rmtree(tmpdir, ignore_errors=True)


    def checkExistingOutputs(self, quantum, butler, taskDef):
        """Decide whether this quantum needs to be executed.

        If only partial outputs exist then they are removed if
        ``clobberOutputs`` is `True`, otherwise an exception is raised.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum to check for existing outputs.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.

        Returns
        -------
        exist : `bool`
            `True` if ``self.skipExistingIn`` is defined and a previous
            execution of this quantum appears to have completed successfully
            (either because metadata was written or all datasets were
            written). `False` otherwise.

        Raises
        ------
        RuntimeError
            Raised if some outputs exist and some do not.
        """

        if self.skipExistingIn and taskDef.metadataDatasetName is not None:
            # Metadata output exists; this is sufficient to assume the
            # previous run was successful and should be skipped.
            ref = butler.registry.findDataset(
                taskDef.metadataDatasetName, quantum.dataId, collections=self.skipExistingIn
            )
            if ref is not None:
                if butler.datastore.exists(ref):
                    return True

        # Previously we always checked for existing outputs in `butler.run`;
        # now the logic is more complicated, as we only want to skip a
        # quantum whose outputs exist in `self.skipExistingIn`, while pruning
        # should only be done for outputs existing in `butler.run`.

        def findOutputs(collections):
            """Find quantum outputs in specified collections."""
            existingRefs = []
            missingRefs = []
            for datasetRefs in quantum.outputs.values():
                for datasetRef in datasetRefs:
                    ref = butler.registry.findDataset(
                        datasetRef.datasetType, datasetRef.dataId, collections=collections
                    )
                    if ref is not None and butler.datastore.exists(ref):
                        existingRefs.append(ref)
                    else:
                        missingRefs.append(datasetRef)
            return existingRefs, missingRefs

        existingRefs, missingRefs = findOutputs(self.skipExistingIn)
        if self.skipExistingIn:
            if existingRefs and not missingRefs:
                # Everything is already there.
                return True

        # If we are to re-run the quantum then prune datasets that exist in
        # the output run collection, but only if `self.clobberOutputs` is set.
        if existingRefs:
            existingRefs, missingRefs = findOutputs(butler.run)
            if existingRefs and missingRefs:
                _LOG.debug(
                    "Partial outputs exist for task %s dataId=%s collection=%s "
                    "existingRefs=%s missingRefs=%s",
                    taskDef,
                    quantum.dataId,
                    butler.run,
                    existingRefs,
                    missingRefs,
                )
                if self.clobberOutputs:
                    # Only prune.
                    _LOG.info("Removing partial outputs for task %s: %s", taskDef, existingRefs)
                    # Do not purge registry records if this looks like
                    # an execution butler. This ensures that the UUID
                    # of the dataset doesn't change.
                    if butler._allow_put_of_predefined_dataset:
                        purge = False
                        disassociate = False
                    else:
                        purge = True
                        disassociate = True
                    butler.pruneDatasets(existingRefs, disassociate=disassociate, unstore=True, purge=purge)
                    return False
                else:
                    raise RuntimeError(
                        "Registry inconsistency while checking for existing outputs:"
                        f" collection={butler.run} existingRefs={existingRefs}"
                        f" missingRefs={missingRefs}"
                    )

        # Need to re-run.
        return False


    def makeTask(self, taskClass, name, config, butler):
        """Make a new task instance.

        Parameters
        ----------
        taskClass : `type`
            Sub-class of `~lsst.pipe.base.PipelineTask`.
        name : `str`
            Name for this task.
        config : `~lsst.pipe.base.PipelineTaskConfig`
            Configuration object for this task.
        butler : `~lsst.daf.butler.Butler`
            Data butler.

        Returns
        -------
        task : `~lsst.pipe.base.PipelineTask`
            Instance of ``taskClass`` type.
        """
        # Call the task factory for that.
        return self.taskFactory.makeTask(taskClass, name, config, None, butler)


    def updatedQuantumInputs(self, quantum, butler, taskDef):
        """Update quantum with extra information; returns a new updated
        Quantum.

        Some methods may require input DatasetRefs to have a non-None
        ``dataset_id``, but in the case of intermediate datasets it may not
        be filled in during QuantumGraph construction. This method retrieves
        the missing info from the registry.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.

        Returns
        -------
        update : `~lsst.daf.butler.Quantum`
            Updated Quantum instance.
        """
        anyChanges = False
        updatedInputs = defaultdict(list)
        for key, refsForDatasetType in quantum.inputs.items():
            newRefsForDatasetType = updatedInputs[key]
            for ref in refsForDatasetType:
                if ref.id is None:
                    resolvedRef = butler.registry.findDataset(
                        ref.datasetType, ref.dataId, collections=butler.collections
                    )
                    if resolvedRef is None:
                        _LOG.info("No dataset found for %s", ref)
                        continue
                    else:
                        _LOG.debug("Updated dataset ID for %s", ref)
                else:
                    resolvedRef = ref
                # We need to ask datastore if the dataset actually exists
                # because the Registry of a local "execution butler" cannot
                # know this (because we prepopulate it with all of the
                # datasets that might be created).
                if butler.datastore.exists(resolvedRef):
                    newRefsForDatasetType.append(resolvedRef)
            if len(newRefsForDatasetType) != len(refsForDatasetType):
                anyChanges = True
        # If we removed any input datasets, let the task check if it has
        # enough to proceed and/or prune related datasets that it also
        # doesn't need/produce anymore. It will raise NoWorkFound if it
        # can't run, which we'll let propagate up. This is exactly what we
        # run during QG generation, because a task shouldn't care whether an
        # input is missing because some previous task didn't produce it, or
        # because it just wasn't there during QG generation.
        updatedInputs = NamedKeyDict[DatasetType, List[DatasetRef]](updatedInputs.items())
        helper = AdjustQuantumHelper(updatedInputs, quantum.outputs)
        if anyChanges:
            helper.adjust_in_place(taskDef.connections, label=taskDef.label, data_id=quantum.dataId)
        return Quantum(
            taskName=quantum.taskName,
            taskClass=quantum.taskClass,
            dataId=quantum.dataId,
            initInputs=quantum.initInputs,
            inputs=helper.inputs,
            outputs=helper.outputs,
        )


    def runQuantum(self, task, quantum, taskDef, butler):
        """Execute task on a single quantum.

        Parameters
        ----------
        task : `~lsst.pipe.base.PipelineTask`
            Task object.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        """
        # Create a butler that operates in the context of a quantum.
        butlerQC = ButlerQuantumContext(butler, quantum)

        # Get the input and output references for the task.
        inputRefs, outputRefs = taskDef.connections.buildDatasetRefs(quantum)

        # Call the task runQuantum() method. Catch a few known failure modes
        # and translate them into specific exit codes.
        try:
            task.runQuantum(butlerQC, inputRefs, outputRefs)
        except NoWorkFound as err:
            # Not an error, just an early exit.
            _LOG.info("Task '%s' on quantum %s exited early: %s", taskDef.label, quantum.dataId, str(err))
        except RepeatableQuantumError as err:
            if self.exitOnKnownError:
                _LOG.warning("Caught repeatable quantum error for %s (%s):", taskDef, quantum.dataId)
                _LOG.warning(err, exc_info=True)
                sys.exit(err.EXIT_CODE)
            else:
                raise
        except InvalidQuantumError as err:
            _LOG.fatal("Invalid quantum error for %s (%s):", taskDef, quantum.dataId)
            _LOG.fatal(err, exc_info=True)
            sys.exit(err.EXIT_CODE)


    def writeMetadata(self, quantum, metadata, taskDef, butler):
        """Store the full task metadata in the butler, if the task declares
        a metadata output dataset."""
        if taskDef.metadataDatasetName is not None:
            # The DatasetRef has to be in the Quantum outputs; look it up
            # by name.
            try:
                ref = quantum.outputs[taskDef.metadataDatasetName]
            except LookupError as exc:
                raise InvalidQuantumError(
                    "Quantum outputs are missing metadata dataset type"
                    f" {taskDef.metadataDatasetName}; this could happen due to inconsistent"
                    " options between QuantumGraph generation and execution"
                ) from exc
            butler.put(metadata, ref[0])


    def writeLogRecords(self, quantum, taskDef, butler, store):
        """Remove the log handler installed by `captureLogging` and, if
        ``store`` is `True`, save the captured records in the butler."""
        # If we are logging to an external file we must always try to
        # close it.
        filename = None
        if isinstance(self.log_handler, FileHandler):
            filename = self.log_handler.stream.name
            self.log_handler.close()

        if self.log_handler is not None:
            # Remove the handler so we stop accumulating log messages.
            logging.getLogger().removeHandler(self.log_handler)

        try:
            if store and taskDef.logOutputDatasetName is not None and self.log_handler is not None:
                # The DatasetRef has to be in the Quantum outputs; look it
                # up by name.
                try:
                    ref = quantum.outputs[taskDef.logOutputDatasetName]
                except LookupError as exc:
                    raise InvalidQuantumError(
                        "Quantum outputs are missing log output dataset type"
                        f" {taskDef.logOutputDatasetName}; this could happen due to inconsistent"
                        " options between QuantumGraph generation and execution"
                    ) from exc

                if isinstance(self.log_handler, ButlerLogRecordHandler):
                    butler.put(self.log_handler.records, ref[0])

                    # Clear the records in case the handler is reused.
                    self.log_handler.records.clear()
                else:
                    assert filename is not None, "Somehow unable to extract filename from file handler"

                    # Need to ingest this file directly into butler.
                    dataset = FileDataset(path=filename, refs=ref[0])
                    try:
                        butler.ingest(dataset, transfer="move")
                        filename = None
                    except NotImplementedError:
                        # Some datastores can't receive files (e.g. the
                        # in-memory datastore used in testing), so we store
                        # an empty record list for those just to have a
                        # dataset. An alternative is to read the file back
                        # as a ButlerLogRecords object and put that.
                        _LOG.info(
                            "Log records could not be stored in this butler because the"
                            " datastore can not ingest files; an empty record list is stored instead."
                        )
                        records = ButlerLogRecords.from_records([])
                        butler.put(records, ref[0])
        finally:
            # Remove the file if it was not ingested.
            if filename is not None:
                try:
                    os.remove(filename)
                except OSError:
                    pass


    def initGlobals(self, quantum, butler):
        """Initialize global state needed for task execution.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Data butler.

        Notes
        -----
        There is an issue with initializing the filters singleton, which is
        done by instrument; to avoid requiring tasks to do it in
        ``runQuantum()`` we do it here whenever any dataId has an instrument
        dimension. Also, for now we only allow a single instrument: we verify
        that the instrument names in all dataIds are identical.

        This will need revision when the filter singleton disappears.
        """
        oneInstrument = None
        for datasetRefs in chain(quantum.inputs.values(), quantum.outputs.values()):
            for datasetRef in datasetRefs:
                dataId = datasetRef.dataId
                instrument = dataId.get("instrument")
                if instrument is not None:
                    if oneInstrument is not None:
                        assert (
                            instrument == oneInstrument
                        ), "Currently require that only one instrument is used per graph"
                    else:
                        oneInstrument = instrument
                        Instrument.fromName(instrument, butler.registry)