# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ['SingleQuantumExecutor']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import logging
import os
import shutil
import sys
import tempfile
import time
from collections import defaultdict
from contextlib import contextmanager
from itertools import chain
from logging import FileHandler
from typing import List

# -----------------------------
# Imports for other modules --
# -----------------------------
from lsst.daf.base import PropertyList, PropertySet
from lsst.daf.butler import (
    DatasetRef,
    DatasetType,
    FileDataset,
    NamedKeyDict,
    Quantum,
)
from lsst.daf.butler.core.logging import (
    ButlerLogRecordHandler,
    ButlerLogRecords,
    ButlerMDC,
    JsonLogFormatter,
)
from lsst.obs.base import Instrument
from lsst.pipe.base import (
    AdjustQuantumHelper,
    ButlerQuantumContext,
    InvalidQuantumError,
    NoWorkFound,
    RepeatableQuantumError,
    logInfo,
)

from .quantumGraphExecutor import QuantumExecutor

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


class _LogCaptureFlag:
    """Simple flag to enable/disable log-to-butler saving."""

    store: bool = True


class SingleQuantumExecutor(QuantumExecutor):
    """Executor class which runs one Quantum at a time.

    Parameters
    ----------
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Instance of a task factory.
    skipExistingIn : `list` [ `str` ], optional
        A list of collections; if all quantum outputs already exist in the
        specified list of collections, that quantum will not be rerun.
    clobberOutputs : `bool`, optional
        If `True`, existing outputs in the output run collection will be
        overwritten. If ``skipExistingIn`` is defined, only outputs from
        failed quanta will be overwritten.
    enableLsstDebug : `bool`, optional
        Enable debugging with the ``lsstDebug`` facility for a task.
    exitOnKnownError : `bool`, optional
        If `True`, call `sys.exit` with the appropriate exit code for special
        known exceptions, after printing a traceback, instead of letting the
        exception propagate up to the calling code. This is always the
        behavior for `InvalidQuantumError`.
    """

    stream_json_logs = True
    """If `True` each log record is written to a temporary file and ingested
    when the quantum completes. If `False` the records are accumulated in
    memory and stored in butler on quantum completion."""
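
    # Note that this is a class attribute, so streaming can be disabled for
    # all executors before any are constructed (hypothetical usage):
    #
    #     SingleQuantumExecutor.stream_json_logs = False  # keep records in memory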

    def __init__(self, taskFactory, skipExistingIn=None, clobberOutputs=False, enableLsstDebug=False,
                 exitOnKnownError=False):
        self.taskFactory = taskFactory
        self.skipExistingIn = skipExistingIn
        self.enableLsstDebug = enableLsstDebug
        self.clobberOutputs = clobberOutputs
        self.exitOnKnownError = exitOnKnownError
        self.log_handler = None

    def execute(self, taskDef, quantum, butler):
        # Docstring inherited from QuantumExecutor.execute
        startTime = time.time()

        with self.captureLogging(taskDef, quantum, butler) as captureLog:

            # Save detailed resource usage before task start to metadata.
            quantumMetadata = PropertyList()
            logInfo(None, "prep", metadata=quantumMetadata)

            taskClass, label, config = taskDef.taskClass, taskDef.label, taskDef.config

            # Check whether to skip or delete old outputs. If this returns
            # True or raises an exception, do not try to store logs, as they
            # may already be in the butler.
            captureLog.store = False
            if self.checkExistingOutputs(quantum, butler, taskDef):
                _LOG.info("Skipping already-successful quantum for label=%s dataId=%s.",
                          label, quantum.dataId)
                return
            captureLog.store = True

            try:
                quantum = self.updatedQuantumInputs(quantum, butler, taskDef)
            except NoWorkFound as exc:
                _LOG.info("Nothing to do for task '%s' on quantum %s; saving metadata and skipping: %s",
                          taskDef.label, quantum.dataId, str(exc))
                # Make empty metadata that looks something like what a
                # do-nothing task would write (but we don't bother with empty
                # nested PropertySets for subtasks). This is slightly
                # duplicative with logic in pipe_base that we can't easily
                # call from here; we'll fix this on DM-29761.
                logInfo(None, "end", metadata=quantumMetadata)
                fullMetadata = PropertySet()
                fullMetadata[taskDef.label] = PropertyList()
                fullMetadata["quantum"] = quantumMetadata
                self.writeMetadata(quantum, fullMetadata, taskDef, butler)
                return

            # Enable lsstDebug debugging.
            if self.enableLsstDebug:
                try:
                    _LOG.debug("Will try to import debug.py")
                    import debug  # noqa:F401
                except ImportError:
                    _LOG.warning("No 'debug' module found.")

            # Initialize global state.
            self.initGlobals(quantum, butler)

            # Ensure that we are executing a frozen config.
            config.freeze()
            logInfo(None, "init", metadata=quantumMetadata)
            task = self.makeTask(taskClass, label, config, butler)
            logInfo(None, "start", metadata=quantumMetadata)
            try:
                self.runQuantum(task, quantum, taskDef, butler)
            except Exception:
                _LOG.exception("Execution of task '%s' on quantum %s failed",
                               taskDef.label, quantum.dataId)
                raise
            logInfo(None, "end", metadata=quantumMetadata)
            fullMetadata = task.getFullMetadata()
            fullMetadata["quantum"] = quantumMetadata
            self.writeMetadata(quantum, fullMetadata, taskDef, butler)
        stopTime = time.time()
        _LOG.info("Execution of task '%s' on quantum %s took %.3f seconds",
                  taskDef.label, quantum.dataId, stopTime - startTime)

    @contextmanager
    def captureLogging(self, taskDef, quantum, butler):
        """Configure logging system to capture logs for execution of this task.

        Parameters
        ----------
        taskDef : `lsst.pipe.base.TaskDef`
            The task definition.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Butler to write logs to.

        Notes
        -----
        Expected to be used as a context manager to ensure that logging
        records are inserted into the butler once the quantum has been
        executed:

        .. code-block:: py

           with self.captureLogging(taskDef, quantum, butler):
               # Run quantum and capture logs.

        This method can also set up logging to attach task- or
        quantum-specific information to log messages. Potentially this can
        take into account some info from task configuration as well.
        """
        # Add a handler to the root logger to capture execution log output.
        # The handler is removed again in ``writeLogRecords``.
        # Initialize ``tmpdir`` here so the cleanup in the ``finally`` block
        # below is safe even when no log handler is installed.
        tmpdir = None
        if taskDef.logOutputDatasetName is not None:
            # Either accumulate into ButlerLogRecords or stream
            # JSON records to file and ingest that.
            if self.stream_json_logs:
                # Create the log file in a temporary directory rather than
                # creating a temporary file. This is necessary because
                # temporary files are created with restrictive permissions
                # and during file ingest these permissions persist in the
                # datastore. Using a temp directory allows us to create
                # a file with umask default permissions.
                tmpdir = tempfile.mkdtemp(prefix="butler-temp-logs-")

                # Construct a file to receive the log records and "touch" it.
                log_file = os.path.join(tmpdir, f"butler-log-{taskDef.label}.json")
                with open(log_file, "w"):
                    pass
                self.log_handler = FileHandler(log_file)
                self.log_handler.setFormatter(JsonLogFormatter())
            else:
                self.log_handler = ButlerLogRecordHandler()

            logging.getLogger().addHandler(self.log_handler)

        # Include the quantum dataId and task label in the MDC.
        label = taskDef.label
        if quantum.dataId:
            label += f":{quantum.dataId}"

        ctx = _LogCaptureFlag()
        try:
            with ButlerMDC.set_mdc({"LABEL": label, "RUN": butler.run}):
                yield ctx
        finally:
            # Ensure that the logs are stored in butler.
            self.writeLogRecords(quantum, taskDef, butler, ctx.store)
            if tmpdir:
                shutil.rmtree(tmpdir, ignore_errors=True)

    def checkExistingOutputs(self, quantum, butler, taskDef):
        """Decide whether this quantum needs to be executed.

        If only partial outputs exist then they are removed if
        ``clobberOutputs`` is `True`, otherwise an exception is raised.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum to check for existing outputs.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.

        Returns
        -------
        exist : `bool`
            `True` if ``self.skipExistingIn`` is defined and a previous
            execution of this quantum appears to have completed successfully
            (either because metadata was written or all datasets were
            written). `False` otherwise.

        Raises
        ------
        RuntimeError
            Raised if some outputs exist while others do not.
        """

        if self.skipExistingIn and taskDef.metadataDatasetName is not None:
            # If the metadata output exists, that is sufficient to assume the
            # previous run was successful and should be skipped.
            ref = butler.registry.findDataset(taskDef.metadataDatasetName, quantum.dataId,
                                              collections=self.skipExistingIn)
            if ref is not None:
                if butler.datastore.exists(ref):
                    return True

        # Previously we always checked for existing outputs in `butler.run`;
        # the logic is now more complicated because we only want to skip a
        # quantum whose outputs exist in `self.skipExistingIn`, while pruning
        # should only be done for outputs existing in `butler.run`.

        def findOutputs(collections):
            """Find quantum outputs in specified collections."""
            existingRefs = []
            missingRefs = []
            for datasetRefs in quantum.outputs.values():
                for datasetRef in datasetRefs:
                    ref = butler.registry.findDataset(datasetRef.datasetType, datasetRef.dataId,
                                                      collections=collections)
                    if ref is not None and butler.datastore.exists(ref):
                        existingRefs.append(ref)
                    else:
                        missingRefs.append(datasetRef)
            return existingRefs, missingRefs

        existingRefs, missingRefs = findOutputs(self.skipExistingIn)
        if self.skipExistingIn:
            if existingRefs and not missingRefs:
                # Everything is already there.
                return True

        # If we are to re-run the quantum then prune datasets that exist in
        # the output run collection, but only if `self.clobberOutputs` is set.
        if existingRefs:
            existingRefs, missingRefs = findOutputs(butler.run)
            if existingRefs and missingRefs:
                _LOG.debug("Partial outputs exist for task %s dataId=%s collection=%s "
                           "existingRefs=%s missingRefs=%s",
                           taskDef, quantum.dataId, butler.run, existingRefs, missingRefs)
                if self.clobberOutputs:
                    _LOG.info("Removing partial outputs for task %s: %s", taskDef, existingRefs)
                    # Do not purge registry records if this looks like
                    # an execution butler. This ensures that the UUID
                    # of the dataset doesn't change.
                    if butler._allow_put_of_predefined_dataset:
                        purge = False
                        disassociate = False
                    else:
                        purge = True
                        disassociate = True
                    butler.pruneDatasets(existingRefs, disassociate=disassociate, unstore=True,
                                         purge=purge)
                    return False
                else:
                    raise RuntimeError(f"Registry inconsistency while checking for existing outputs:"
                                       f" collection={butler.run} existingRefs={existingRefs}"
                                       f" missingRefs={missingRefs}")

        # Need to re-run.
        return False

    def makeTask(self, taskClass, name, config, butler):
        """Make a new task instance.

        Parameters
        ----------
        taskClass : `type`
            Sub-class of `~lsst.pipe.base.PipelineTask`.
        name : `str`
            Name for this task.
        config : `~lsst.pipe.base.PipelineTaskConfig`
            Configuration object for this task.
        butler : `~lsst.daf.butler.Butler`
            Data butler.

        Returns
        -------
        task : `~lsst.pipe.base.PipelineTask`
            Instance of ``taskClass`` type.
        """
        # Delegate to the task factory.
        return self.taskFactory.makeTask(taskClass, name, config, None, butler)

    def updatedQuantumInputs(self, quantum, butler, taskDef):
        """Update a quantum with extra information, returning a new updated
        Quantum.

        Some methods may require input DatasetRefs to have a non-None
        ``dataset_id``, but for intermediate datasets it may not be filled
        in during QuantumGraph construction. This method retrieves the
        missing info from the registry.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.

        Returns
        -------
        update : `~lsst.daf.butler.Quantum`
            Updated Quantum instance.
        """

        anyChanges = False
        updatedInputs = defaultdict(list)
        for key, refsForDatasetType in quantum.inputs.items():
            newRefsForDatasetType = updatedInputs[key]
            for ref in refsForDatasetType:
                if ref.id is None:
                    resolvedRef = butler.registry.findDataset(ref.datasetType, ref.dataId,
                                                              collections=butler.collections)
                    if resolvedRef is None:
                        _LOG.info("No dataset found for %s", ref)
                        continue
                    else:
                        _LOG.debug("Updated dataset ID for %s", ref)
                else:
                    resolvedRef = ref
                # We need to ask datastore if the dataset actually exists
                # because the Registry of a local "execution butler" cannot
                # know this (because we prepopulate it with all of the
                # datasets that might be created).
                if butler.datastore.exists(resolvedRef):
                    newRefsForDatasetType.append(resolvedRef)
            if len(newRefsForDatasetType) != len(refsForDatasetType):
                anyChanges = True
        # If we removed any input datasets, let the task check if it has
        # enough to proceed and/or prune related datasets that it also
        # doesn't need/produce anymore. It will raise NoWorkFound if it
        # can't run, which we'll let propagate up. This is exactly what we
        # run during QG generation, because a task shouldn't care whether an
        # input is missing because some previous task didn't produce it, or
        # because it just wasn't there during QG generation.
        updatedInputs = NamedKeyDict[DatasetType, List[DatasetRef]](updatedInputs.items())
        helper = AdjustQuantumHelper(updatedInputs, quantum.outputs)
        if anyChanges:
            helper.adjust_in_place(taskDef.connections, label=taskDef.label, data_id=quantum.dataId)
        return Quantum(taskName=quantum.taskName,
                       taskClass=quantum.taskClass,
                       dataId=quantum.dataId,
                       initInputs=quantum.initInputs,
                       inputs=helper.inputs,
                       outputs=helper.outputs,
                       )

    def runQuantum(self, task, quantum, taskDef, butler):
        """Execute a task on a single quantum.

        Parameters
        ----------
        task : `~lsst.pipe.base.PipelineTask`
            Task object.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        """

        # Create a butler that operates in the context of a quantum.
        butlerQC = ButlerQuantumContext(butler, quantum)

        # Get the input and output references for the task.
        inputRefs, outputRefs = taskDef.connections.buildDatasetRefs(quantum)

        # Call the task runQuantum() method. Catch a few known failure
        # modes and translate them into specific exit behavior.
        try:
            task.runQuantum(butlerQC, inputRefs, outputRefs)
        except NoWorkFound as err:
            # Not an error, just an early exit.
            _LOG.info("Task '%s' on quantum %s exited early: %s",
                      taskDef.label, quantum.dataId, str(err))
        except RepeatableQuantumError as err:
            if self.exitOnKnownError:
                _LOG.warning("Caught repeatable quantum error for %s (%s):", taskDef, quantum.dataId)
                _LOG.warning(err, exc_info=True)
                sys.exit(err.EXIT_CODE)
            else:
                raise
        except InvalidQuantumError as err:
            _LOG.fatal("Invalid quantum error for %s (%s):", taskDef, quantum.dataId)
            _LOG.fatal(err, exc_info=True)
            sys.exit(err.EXIT_CODE)

    def writeMetadata(self, quantum, metadata, taskDef, butler):
        """Store task metadata in the butler.

        Does nothing if ``taskDef.metadataDatasetName`` is not set.
        """
        if taskDef.metadataDatasetName is not None:
            # The DatasetRef has to be in the Quantum outputs; look it up
            # by name.
            try:
                ref = quantum.outputs[taskDef.metadataDatasetName]
            except LookupError as exc:
                raise InvalidQuantumError(
                    f"Quantum outputs is missing metadata dataset type {taskDef.metadataDatasetName};"
                    f" this could happen due to inconsistent options between QuantumGraph generation"
                    f" and execution") from exc
            butler.put(metadata, ref[0])

    def writeLogRecords(self, quantum, taskDef, butler, store):
        """Store captured log records in the butler, if requested.

        The log handler installed by `captureLogging` is always closed and
        removed; records are stored only when ``store`` is `True` and the
        task defines a log output dataset.
        """
        # If we are logging to an external file we must always try to
        # close it.
        filename = None
        if isinstance(self.log_handler, FileHandler):
            filename = self.log_handler.stream.name
            self.log_handler.close()

        if self.log_handler is not None:
            # Remove the handler so we stop accumulating log messages.
            logging.getLogger().removeHandler(self.log_handler)

        try:
            if store and taskDef.logOutputDatasetName is not None and self.log_handler is not None:
                # The DatasetRef has to be in the Quantum outputs; look it
                # up by name.
                try:
                    ref = quantum.outputs[taskDef.logOutputDatasetName]
                except LookupError as exc:
                    raise InvalidQuantumError(
                        f"Quantum outputs is missing log output dataset type"
                        f" {taskDef.logOutputDatasetName}; this could happen due to inconsistent"
                        f" options between QuantumGraph generation and execution") from exc

                if isinstance(self.log_handler, ButlerLogRecordHandler):
                    butler.put(self.log_handler.records, ref[0])

                    # Clear the records in case the handler is reused.
                    self.log_handler.records.clear()
                else:
                    assert filename is not None, "Somehow unable to extract filename from file handler"

                    # Need to ingest this file directly into butler.
                    dataset = FileDataset(path=filename, refs=ref[0])
                    try:
                        butler.ingest(dataset, transfer="move")
                        filename = None
                    except NotImplementedError:
                        # Some datastores can't receive files (e.g. the
                        # in-memory datastore used in testing), so for those
                        # we store an empty record list just so that the
                        # dataset exists. An alternative would be to read
                        # the file back as a ButlerLogRecords object and put
                        # that.
                        _LOG.info("Log records could not be stored in this butler because the"
                                  " datastore cannot ingest files; an empty record list is stored"
                                  " instead.")
                        records = ButlerLogRecords.from_records([])
                        butler.put(records, ref[0])
        finally:
            # Remove the file if it was not ingested.
            if filename is not None:
                try:
                    os.remove(filename)
                except OSError:
                    pass

    def initGlobals(self, quantum, butler):
        """Initialize global state needed for task execution.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Data butler.

        Notes
        -----
        There is an issue with initializing the filters singleton, which is
        done by the instrument; to avoid requiring tasks to do it in
        ``runQuantum()`` we do it here when any dataId has an instrument
        dimension. Also, for now we only allow a single instrument: we
        verify that the instrument names in all dataIds are identical.

        This will need revision when the filter singleton disappears.
        """

        oneInstrument = None
        for datasetRefs in chain(quantum.inputs.values(), quantum.outputs.values()):
            for datasetRef in datasetRefs:
                dataId = datasetRef.dataId
                instrument = dataId.get("instrument")
                if instrument is not None:
                    if oneInstrument is not None:
                        assert instrument == oneInstrument, \
                            "Currently require that only one instrument is used per graph"
                    else:
                        oneInstrument = instrument
                        Instrument.fromName(instrument, butler.registry)