Coverage for python/lsst/ctrl/mpexec/singleQuantumExecutor.py: 11%

221 statements  

coverage.py v7.1.0, created at 2023-02-05 18:04 -0800

# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ["SingleQuantumExecutor"]

# -------------------------------
# Imports of standard modules --
# -------------------------------
import logging
import os
import shutil
import sys
import tempfile
import time
from contextlib import contextmanager
from collections import defaultdict
from itertools import chain
from logging import FileHandler
from typing import List

# -----------------------------
# Imports for other modules --
# -----------------------------
from .quantumGraphExecutor import QuantumExecutor
from lsst.daf.base import PropertyList, PropertySet
from lsst.obs.base import Instrument
from lsst.pipe.base import (
    AdjustQuantumHelper,
    ButlerQuantumContext,
    InvalidQuantumError,
    NoWorkFound,
    RepeatableQuantumError,
    logInfo,
)
from lsst.daf.butler import (
    DatasetRef,
    DatasetType,
    FileDataset,
    NamedKeyDict,
    Quantum,
)
from lsst.daf.butler.core.logging import (
    ButlerLogRecordHandler,
    ButlerLogRecords,
    ButlerMDC,
    JsonLogFormatter,
)

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


class _LogCaptureFlag:
    """Simple flag to enable/disable log-to-butler saving."""

    store: bool = True


class SingleQuantumExecutor(QuantumExecutor):
    """Executor class which runs one Quantum at a time.

    Parameters
    ----------
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Instance of a task factory.
    skipExistingIn : `list` [ `str` ], optional
        Accepts a list of collections; if all of a Quantum's outputs already
        exist in the specified collections, that Quantum will not be rerun.
    clobberOutputs : `bool`, optional
        If `True`, existing outputs in the output run collection will be
        overwritten. If ``skipExistingIn`` is defined, only outputs from
        failed quanta will be overwritten.
    enableLsstDebug : `bool`, optional
        Enable debugging with the ``lsstDebug`` facility for a task.
    exitOnKnownError : `bool`, optional
        If `True`, call `sys.exit` with the appropriate exit code for special
        known exceptions, after printing a traceback, instead of letting the
        exception propagate up to the calling code. This is always the
        behavior for `InvalidQuantumError`.
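
    Notes
    -----
    A minimal usage sketch (``taskFactory``, ``taskDef``, ``quantum`` and
    ``butler`` are assumed to be constructed by the caller, e.g. by a
    quantum-graph executor):

    .. code-block:: py

        executor = SingleQuantumExecutor(taskFactory, skipExistingIn=None)
        executor.execute(taskDef, quantum, butler)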
102 """ 

103 

104 stream_json_logs = True 

105 """If True each log record is written to a temporary file and ingested 

106 when quantum completes. If False the records are accumulated in memory 

107 and stored in butler on quantum completion.""" 

108 

109 def __init__(self, taskFactory, skipExistingIn=None, clobberOutputs=False, enableLsstDebug=False, 

110 exitOnKnownError=False): 

111 self.taskFactory = taskFactory 

112 self.skipExistingIn = skipExistingIn 

113 self.enableLsstDebug = enableLsstDebug 

114 self.clobberOutputs = clobberOutputs 

115 self.exitOnKnownError = exitOnKnownError 

116 self.log_handler = None 


    def execute(self, taskDef, quantum, butler):
        # Docstring inherited from QuantumExecutor.execute
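        # Overall flow: capture logging, decide whether to skip or clobber
        # existing outputs, resolve the quantum's inputs, then construct the
        # task, run it, and store its metadata and logs.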
        startTime = time.time()

        with self.captureLogging(taskDef, quantum, butler) as captureLog:

            # Save detailed resource usage before task start to metadata.
            quantumMetadata = PropertyList()
            logInfo(None, "prep", metadata=quantumMetadata)

            taskClass, label, config = taskDef.taskClass, taskDef.label, taskDef.config

            # Check whether to skip or delete old outputs. If this returns
            # True or raises an exception, do not try to store logs, as they
            # may already be in the butler.
            captureLog.store = False
            if self.checkExistingOutputs(quantum, butler, taskDef):
                _LOG.info("Skipping already-successful quantum for label=%s dataId=%s.",
                          label, quantum.dataId)
                return
            captureLog.store = True

            try:
                quantum = self.updatedQuantumInputs(quantum, butler, taskDef)
            except NoWorkFound as exc:
                _LOG.info("Nothing to do for task '%s' on quantum %s; saving metadata and skipping: %s",
                          taskDef.label, quantum.dataId, str(exc))
                # Make empty metadata that looks something like what a
                # do-nothing task would write (but we don't bother with empty
                # nested PropertySets for subtasks). This is slightly
                # duplicative of logic in pipe_base that we can't easily call
                # from here; we'll fix this on DM-29761.
                logInfo(None, "end", metadata=quantumMetadata)
                fullMetadata = PropertySet()
                fullMetadata[taskDef.label] = PropertyList()
                fullMetadata["quantum"] = quantumMetadata
                self.writeMetadata(quantum, fullMetadata, taskDef, butler)
                return

            # Enable lsstDebug debugging.
            if self.enableLsstDebug:
                try:
                    _LOG.debug("Will try to import debug.py")
                    import debug  # noqa:F401
                except ImportError:
                    _LOG.warning("No 'debug' module found.")

            # Initialize global state.
            self.initGlobals(quantum, butler)

            # Ensure that we are executing a frozen config.
            config.freeze()
            logInfo(None, "init", metadata=quantumMetadata)
            task = self.makeTask(taskClass, label, config, butler)
            logInfo(None, "start", metadata=quantumMetadata)
            try:
                self.runQuantum(task, quantum, taskDef, butler)
            except Exception as e:
                _LOG.error(
                    "Execution of task '%s' on quantum %s failed. Exception %s: %s",
                    taskDef.label,
                    quantum.dataId,
                    e.__class__.__name__,
                    str(e),
                )
                raise
            logInfo(None, "end", metadata=quantumMetadata)
            fullMetadata = task.getFullMetadata()
            fullMetadata["quantum"] = quantumMetadata
            self.writeMetadata(quantum, fullMetadata, taskDef, butler)
        stopTime = time.time()
        _LOG.info("Execution of task '%s' on quantum %s took %.3f seconds",
                  taskDef.label, quantum.dataId, stopTime - startTime)

    @contextmanager
    def captureLogging(self, taskDef, quantum, butler):
        """Configure logging system to capture logs for execution of this
        task.

        Parameters
        ----------
        taskDef : `lsst.pipe.base.TaskDef`
            The task definition.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Butler to write logs to.

        Notes
        -----
        Expected to be used as a context manager to ensure that logging
        records are inserted into the butler once the quantum has been
        executed:

        .. code-block:: py

            with self.captureLogging(taskDef, quantum, butler):
                # Run quantum and capture logs.

        This method can also set up logging to attach task- or
        quantum-specific information to log messages. Potentially this can
        take into account some info from task configuration as well.
        """
        # Add a handler to the root logger to capture execution log output;
        # it is removed again in ``writeLogRecords`` once the quantum has
        # finished.
        tmpdir = None
        if taskDef.logOutputDatasetName is not None:
            # Either accumulate into ButlerLogRecords or stream
            # JSON records to file and ingest that.
            if self.stream_json_logs:
                # Create the log file in a temporary directory rather than
                # creating a temporary file. This is necessary because
                # temporary files are created with restrictive permissions
                # and during file ingest these permissions persist in the
                # datastore. Using a temp directory allows us to create
                # a file with umask default permissions.
                tmpdir = tempfile.mkdtemp(prefix="butler-temp-logs-")

                # Construct a file to receive the log records and "touch" it.
                log_file = os.path.join(tmpdir, f"butler-log-{taskDef.label}.json")
                with open(log_file, "w"):
                    pass
                self.log_handler = FileHandler(log_file)
                self.log_handler.setFormatter(JsonLogFormatter())
            else:
                self.log_handler = ButlerLogRecordHandler()

            logging.getLogger().addHandler(self.log_handler)

        # Include the quantum dataId and task label in the MDC.
        label = taskDef.label
        if quantum.dataId:
            label += f":{quantum.dataId}"

        ctx = _LogCaptureFlag()
        try:
            with ButlerMDC.set_mdc({"LABEL": label, "RUN": butler.run}):
                yield ctx
        finally:
            # Ensure that the logs are stored in butler; ``tmpdir`` is only
            # set when logs were streamed to a temporary file.
            self.writeLogRecords(quantum, taskDef, butler, ctx.store)
            if tmpdir:
                shutil.rmtree(tmpdir, ignore_errors=True)

    def checkExistingOutputs(self, quantum, butler, taskDef):
        """Decide whether this quantum needs to be executed.

        If only partial outputs exist, they are removed if ``clobberOutputs``
        is `True`; otherwise an exception is raised.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum to check for existing outputs.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.

        Returns
        -------
        exist : `bool`
            `True` if ``self.skipExistingIn`` is defined and a previous
            execution of this quantum appears to have completed successfully
            (either because metadata was written or all datasets were
            written). `False` otherwise.

        Raises
        ------
        RuntimeError
            Raised if some outputs exist while others do not.
        """
        if self.skipExistingIn and taskDef.metadataDatasetName is not None:
            # If the metadata output exists, that is sufficient to assume
            # that the previous run was successful and should be skipped.
            ref = butler.registry.findDataset(taskDef.metadataDatasetName, quantum.dataId,
                                              collections=self.skipExistingIn)
            if ref is not None:
                if butler.datastore.exists(ref):
                    return True

        # Previously we always checked for existing outputs in `butler.run`;
        # now the logic is more complicated, as we only want to skip a
        # quantum whose outputs exist in `self.skipExistingIn`, while pruning
        # should only be done for outputs existing in `butler.run`.

        def findOutputs(collections):
            """Find quantum outputs in specified collections.
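
            Parameters
            ----------
            collections : `str` or `list` [ `str` ]
                Collections to search for the quantum's outputs.

            Returns
            -------
            existingRefs : `list` [ `~lsst.daf.butler.DatasetRef` ]
                Resolved refs for outputs found in both registry and
                datastore.
            missingRefs : `list` [ `~lsst.daf.butler.DatasetRef` ]
                Refs for outputs that could not be found.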
305 """ 

306 existingRefs = [] 

307 missingRefs = [] 

308 for datasetRefs in quantum.outputs.values(): 

309 for datasetRef in datasetRefs: 

310 ref = butler.registry.findDataset(datasetRef.datasetType, datasetRef.dataId, 

311 collections=collections) 

312 if ref is not None and butler.datastore.exists(ref): 

313 existingRefs.append(ref) 

314 else: 

315 missingRefs.append(datasetRef) 

316 return existingRefs, missingRefs 

317 

318 existingRefs, missingRefs = findOutputs(self.skipExistingIn) 

319 if self.skipExistingIn: 

320 if existingRefs and not missingRefs: 

321 # everything is already there 

322 return True 

323 

324 # If we are to re-run quantum then prune datasets that exists in 

325 # output run collection, only if `self.clobberOutputs` is set. 

326 if existingRefs: 

327 existingRefs, missingRefs = findOutputs(butler.run) 

328 if existingRefs and missingRefs: 

329 _LOG.debug("Partial outputs exist for task %s dataId=%s collection=%s " 

330 "existingRefs=%s missingRefs=%s", 

331 taskDef, quantum.dataId, butler.run, existingRefs, missingRefs) 

332 if self.clobberOutputs: 

333 # only prune 

334 _LOG.info("Removing partial outputs for task %s: %s", taskDef, existingRefs) 

335 # Do not purge registry records if this looks like 

336 # an execution butler. This ensures that the UUID 

337 # of the dataset doesn't change. 

338 if butler._allow_put_of_predefined_dataset: 

339 purge = False 

340 disassociate = False 

341 else: 

342 purge = True 

343 disassociate = True 

344 butler.pruneDatasets(existingRefs, disassociate=disassociate, unstore=True, purge=purge) 

345 return False 

346 else: 

347 raise RuntimeError(f"Registry inconsistency while checking for existing outputs:" 

348 f" collection={butler.run} existingRefs={existingRefs}" 

349 f" missingRefs={missingRefs}") 

350 

351 # need to re-run 

352 return False 


    def makeTask(self, taskClass, name, config, butler):
        """Make new task instance.

        Parameters
        ----------
        taskClass : `type`
            Sub-class of `~lsst.pipe.base.PipelineTask`.
        name : `str`
            Name for this task.
        config : `~lsst.pipe.base.PipelineTaskConfig`
            Configuration object for this task.
        butler : `~lsst.daf.butler.Butler`
            Data butler.

        Returns
        -------
        task : `~lsst.pipe.base.PipelineTask`
            Instance of ``taskClass`` type.
        """
        # Delegate the construction work to the task factory.
        return self.taskFactory.makeTask(taskClass, name, config, None, butler)

    def updatedQuantumInputs(self, quantum, butler, taskDef):
        """Update quantum with extra information and return a new, updated
        Quantum.

        Some methods may require input DatasetRefs to have a non-None
        ``dataset_id``, but in the case of intermediate datasets it may not
        be filled in during QuantumGraph construction. This method retrieves
        the missing info from the registry.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.

        Returns
        -------
        update : `~lsst.daf.butler.Quantum`
            Updated Quantum instance.
        """
        anyChanges = False
        updatedInputs = defaultdict(list)
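        # Rebuild the input mapping, keeping only refs that can be resolved
        # and that are actually present in the datastore.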
        for key, refsForDatasetType in quantum.inputs.items():
            newRefsForDatasetType = updatedInputs[key]
            for ref in refsForDatasetType:
                if ref.id is None:
                    resolvedRef = butler.registry.findDataset(ref.datasetType, ref.dataId,
                                                              collections=butler.collections)
                    if resolvedRef is None:
                        _LOG.info("No dataset found for %s", ref)
                        continue
                    else:
                        _LOG.debug("Updated dataset ID for %s", ref)
                else:
                    resolvedRef = ref
                # We need to ask datastore if the dataset actually exists
                # because the Registry of a local "execution butler" cannot
                # know this (because we prepopulate it with all of the
                # datasets that might be created).
                if butler.datastore.exists(resolvedRef):
                    newRefsForDatasetType.append(resolvedRef)
            if len(newRefsForDatasetType) != len(refsForDatasetType):
                anyChanges = True
        # If we removed any input datasets, let the task check if it has
        # enough to proceed and/or prune related datasets that it also
        # doesn't need/produce anymore. It will raise NoWorkFound if it
        # can't run, which we'll let propagate up. This is exactly what we
        # run during QG generation, because a task shouldn't care whether an
        # input is missing because some previous task didn't produce it, or
        # because it just wasn't there during QG generation.
        updatedInputs = NamedKeyDict[DatasetType, List[DatasetRef]](updatedInputs.items())
        helper = AdjustQuantumHelper(updatedInputs, quantum.outputs)
        if anyChanges:
            helper.adjust_in_place(taskDef.connections, label=taskDef.label, data_id=quantum.dataId)
        return Quantum(taskName=quantum.taskName,
                       taskClass=quantum.taskClass,
                       dataId=quantum.dataId,
                       initInputs=quantum.initInputs,
                       inputs=helper.inputs,
                       outputs=helper.outputs)

    def runQuantum(self, task, quantum, taskDef, butler):
        """Execute task on a single quantum.

        Parameters
        ----------
        task : `~lsst.pipe.base.PipelineTask`
            Task object.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        """
        # Create a butler that operates in the context of a quantum.
        butlerQC = ButlerQuantumContext(butler, quantum)

        # Get the input and output references for the task.
        inputRefs, outputRefs = taskDef.connections.buildDatasetRefs(quantum)

        # Call the task runQuantum() method. Catch a few known failure
        # modes and translate them into specific exit codes.
        try:
            task.runQuantum(butlerQC, inputRefs, outputRefs)
        except NoWorkFound as err:
            # Not an error, just an early exit.
            _LOG.info("Task '%s' on quantum %s exited early: %s",
                      taskDef.label, quantum.dataId, str(err))
        except RepeatableQuantumError as err:
            if self.exitOnKnownError:
                _LOG.warning("Caught repeatable quantum error for %s (%s):", taskDef, quantum.dataId)
                _LOG.warning(err, exc_info=True)
                sys.exit(err.EXIT_CODE)
            else:
                raise
        except InvalidQuantumError as err:
            _LOG.fatal("Invalid quantum error for %s (%s):", taskDef, quantum.dataId)
            _LOG.fatal(err, exc_info=True)
            sys.exit(err.EXIT_CODE)

    def writeMetadata(self, quantum, metadata, taskDef, butler):
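        """Store task metadata in the butler, if the task defines a metadata
        dataset.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        metadata : `~lsst.daf.base.PropertySet`
            Full task metadata to store.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        """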
        if taskDef.metadataDatasetName is not None:
            # The DatasetRef has to be in the Quantum outputs; look it up by
            # dataset type name.
            try:
                ref = quantum.outputs[taskDef.metadataDatasetName]
            except LookupError as exc:
                raise InvalidQuantumError(
                    f"Quantum outputs is missing metadata dataset type {taskDef.metadataDatasetName};"
                    f" this could happen due to inconsistent options between QuantumGraph generation"
                    f" and execution") from exc
            butler.put(metadata, ref[0])

    def writeLogRecords(self, quantum, taskDef, butler, store):
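        """Store captured log records in the butler and clean up the log
        handler installed by ``captureLogging``.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        store : `bool`
            If `False`, remove the handler and any temporary file without
            storing the log records.
        """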
        # If we are logging to an external file we must always try to
        # close it.
        filename = None
        if isinstance(self.log_handler, FileHandler):
            filename = self.log_handler.stream.name
            self.log_handler.close()

        if self.log_handler is not None:
            # Remove the handler so we stop accumulating log messages.
            logging.getLogger().removeHandler(self.log_handler)

        try:
            if store and taskDef.logOutputDatasetName is not None and self.log_handler is not None:
                # The DatasetRef has to be in the Quantum outputs; look it
                # up by dataset type name.
                try:
                    ref = quantum.outputs[taskDef.logOutputDatasetName]
                except LookupError as exc:
                    raise InvalidQuantumError(
                        f"Quantum outputs is missing log output dataset type"
                        f" {taskDef.logOutputDatasetName}; this could happen due to inconsistent"
                        f" options between QuantumGraph generation and execution") from exc

                if isinstance(self.log_handler, ButlerLogRecordHandler):
                    butler.put(self.log_handler.records, ref[0])

                    # Clear the records in case the handler is reused.
                    self.log_handler.records.clear()
                else:
                    assert filename is not None, "Somehow unable to extract filename from file handler"

                    # Need to ingest this file directly into butler.
                    dataset = FileDataset(path=filename, refs=ref[0])
                    try:
                        butler.ingest(dataset, transfer="move")
                        filename = None
                    except NotImplementedError:
                        # Some datastores can't receive files (e.g. the
                        # in-memory datastore used in testing); for those we
                        # store an empty record list just to have a dataset.
                        # An alternative would be to read the file back as a
                        # ButlerLogRecords object and put that.
                        _LOG.info("Log records could not be stored in this butler because the"
                                  " datastore can not ingest files; an empty record list is"
                                  " stored instead.")
                        records = ButlerLogRecords.from_records([])
                        butler.put(records, ref[0])
        finally:
            # Remove the file if it was not ingested.
            if filename is not None:
                try:
                    os.remove(filename)
                except OSError:
                    pass
548 def initGlobals(self, quantum, butler): 

549 """Initialize global state needed for task execution. 

550 

551 Parameters 

552 ---------- 

553 quantum : `~lsst.daf.butler.Quantum` 

554 Single Quantum instance. 

555 butler : `~lsst.daf.butler.Butler` 

556 Data butler. 

557 

558 Notes 

559 ----- 

560 There is an issue with initializing filters singleton which is done 

561 by instrument, to avoid requiring tasks to do it in runQuantum() 

562 we do it here when any dataId has an instrument dimension. Also for 

563 now we only allow single instrument, verify that all instrument 

564 names in all dataIds are identical. 

565 

566 This will need revision when filter singleton disappears. 

567 """ 

568 oneInstrument = None 

569 for datasetRefs in chain(quantum.inputs.values(), quantum.outputs.values()): 

570 for datasetRef in datasetRefs: 

571 dataId = datasetRef.dataId 

572 instrument = dataId.get("instrument") 

573 if instrument is not None: 

574 if oneInstrument is not None: 

575 assert instrument == oneInstrument, \ 

576 "Currently require that only one instrument is used per graph" 

577 else: 

578 oneInstrument = instrument 

579 Instrument.fromName(instrument, butler.registry)