# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ['SingleQuantumExecutor']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import logging
import os
import shutil
import sys
import tempfile
import time
from contextlib import contextmanager
from collections import defaultdict
from itertools import chain
from logging import FileHandler
from typing import List

# -----------------------------
# Imports for other modules --
# -----------------------------
from .quantumGraphExecutor import QuantumExecutor
from lsst.utils.timer import logInfo
from lsst.daf.base import PropertyList, PropertySet
from lsst.obs.base import Instrument
from lsst.pipe.base import (
    AdjustQuantumHelper,
    ButlerQuantumContext,
    InvalidQuantumError,
    NoWorkFound,
    RepeatableQuantumError,
)
from lsst.daf.butler import (
    DatasetRef,
    DatasetType,
    FileDataset,
    NamedKeyDict,
    Quantum,
)
from lsst.daf.butler.core.logging import (
    ButlerLogRecordHandler,
    ButlerLogRecords,
    ButlerMDC,
    JsonLogFormatter,
)

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


class _LogCaptureFlag:
    """Simple flag to enable/disable log-to-butler saving.
    """
    store: bool = True


class SingleQuantumExecutor(QuantumExecutor):
80 """Executor class which runs one Quantum at a time. 

81 

82 Parameters 

83 ---------- 

84 butler : `~lsst.daf.butler.Butler` 

85 Data butler. 

86 taskFactory : `~lsst.pipe.base.TaskFactory` 

87 Instance of a task factory. 

88 skipExistingIn : `list` [ `str` ], optional 

89 Accepts list of collections, if all Quantum outputs already exist in 

90 the specified list of collections then that Quantum will not be rerun. 

91 clobberOutputs : `bool`, optional 

92 If `True`, then existing outputs in output run collection will be 

93 overwritten. If ``skipExistingIn`` is defined, only outputs from 

94 failed quanta will be overwritten. 

95 enableLsstDebug : `bool`, optional 

96 Enable debugging with ``lsstDebug`` facility for a task. 

97 exitOnKnownError : `bool`, optional 

98 If `True`, call `sys.exit` with the appropriate exit code for special 

99 known exceptions, after printing a traceback, instead of letting the 

100 exception propagate up to calling. This is always the behavior for 

101 InvalidQuantumError. 
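
    Notes
    -----
    A minimal usage sketch, assuming a task factory, task definition,
    quantum, and data butler already exist (the variable names here are
    illustrative only):

    .. code-block:: py

        executor = SingleQuantumExecutor(taskFactory, skipExistingIn=None)
        executor.execute(taskDef, quantum, butler)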

102 """ 

103 

104 stream_json_logs = True 

105 """If True each log record is written to a temporary file and ingested 

106 when quantum completes. If False the records are accumulated in memory 

107 and stored in butler on quantum completion.""" 

108 

109 def __init__(self, taskFactory, skipExistingIn=None, clobberOutputs=False, enableLsstDebug=False, 

110 exitOnKnownError=False): 

111 self.taskFactory = taskFactory 

112 self.skipExistingIn = skipExistingIn 

113 self.enableLsstDebug = enableLsstDebug 

114 self.clobberOutputs = clobberOutputs 

115 self.exitOnKnownError = exitOnKnownError 

116 self.log_handler = None 

117 

    def execute(self, taskDef, quantum, butler):
        # Docstring inherited from QuantumExecutor.execute
        startTime = time.time()

        with self.captureLogging(taskDef, quantum, butler) as captureLog:

            # Save detailed resource usage before task start to metadata.
            quantumMetadata = PropertyList()
            logInfo(None, "prep", metadata=quantumMetadata)

            taskClass, label, config = taskDef.taskClass, taskDef.label, taskDef.config

            # Check whether to skip or delete old outputs. If this returns
            # True or raises an exception, do not try to store logs, as
            # they may already be in the butler.
            captureLog.store = False
            if self.checkExistingOutputs(quantum, butler, taskDef):
                _LOG.info("Skipping already-successful quantum for label=%s dataId=%s.", label,
                          quantum.dataId)
                return
            captureLog.store = True

            try:
                quantum = self.updatedQuantumInputs(quantum, butler, taskDef)
            except NoWorkFound as exc:
                _LOG.info("Nothing to do for task '%s' on quantum %s; saving metadata and skipping: %s",
                          taskDef.label, quantum.dataId, str(exc))
                # Make empty metadata that looks something like what a
                # do-nothing task would write (but we don't bother with
                # empty nested PropertySets for subtasks). This is slightly
                # duplicative with logic in pipe_base that we can't easily
                # call from here; we'll fix this on DM-29761.
                logInfo(None, "end", metadata=quantumMetadata)
                fullMetadata = PropertySet()
                fullMetadata[taskDef.label] = PropertyList()
                fullMetadata["quantum"] = quantumMetadata
                self.writeMetadata(quantum, fullMetadata, taskDef, butler)
                return

            # enable lsstDebug debugging
            if self.enableLsstDebug:
                try:
                    _LOG.debug("Will try to import debug.py")
                    import debug  # noqa:F401
                except ImportError:
                    _LOG.warning("No 'debug' module found.")

            # initialize global state
            self.initGlobals(quantum, butler)

            # Ensure that we are executing a frozen config
            config.freeze()
            logInfo(None, "init", metadata=quantumMetadata)
            task = self.makeTask(taskClass, label, config, butler)
            logInfo(None, "start", metadata=quantumMetadata)
            try:
                self.runQuantum(task, quantum, taskDef, butler)
            except Exception:
                _LOG.exception("Execution of task '%s' on quantum %s failed",
                               taskDef.label, quantum.dataId)
                raise
            logInfo(None, "end", metadata=quantumMetadata)
            fullMetadata = task.getFullMetadata()
            fullMetadata["quantum"] = quantumMetadata
            self.writeMetadata(quantum, fullMetadata, taskDef, butler)
            stopTime = time.time()
            _LOG.info("Execution of task '%s' on quantum %s took %.3f seconds",
                      taskDef.label, quantum.dataId, stopTime - startTime)
        return quantum

    @contextmanager
    def captureLogging(self, taskDef, quantum, butler):
        """Configure logging system to capture logs for execution of this
        task.

        Parameters
        ----------
        taskDef : `lsst.pipe.base.TaskDef`
            The task definition.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Butler to write logs to.

        Notes
        -----
        Expected to be used as a context manager to ensure that logging
        records are inserted into the butler once the quantum has been
        executed:

        .. code-block:: py

            with self.captureLogging(taskDef, quantum, butler):
                # Run quantum and capture logs.
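
        The yielded object is a `_LogCaptureFlag`; its ``store`` attribute
        controls whether the captured records are saved, and can be toggled
        by the caller, as ``execute`` does while deciding whether a quantum
        can be skipped:

        .. code-block:: py

            with self.captureLogging(taskDef, quantum, butler) as captureLog:
                captureLog.store = False  # do not save logs while checking
                # ... decide whether the quantum must run ...
                captureLog.store = True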

        This method can also set up logging to attach task- or
        quantum-specific information to log messages. Potentially this can
        take into account some info from task configuration as well.
        """

        # Add a handler to the root logger to capture execution log output.
        # How does it get removed reliably?
        # The temporary directory is referenced in the finally block below,
        # so it must be defined even when no log output dataset is used.
        tmpdir = None
        if taskDef.logOutputDatasetName is not None:
            # Either accumulate into ButlerLogRecords or stream
            # JSON records to file and ingest that.
            if self.stream_json_logs:
                # Create the log file in a temporary directory rather than
                # creating a temporary file. This is necessary because
                # temporary files are created with restrictive permissions
                # and during file ingest these permissions persist in the
                # datastore. Using a temp directory allows us to create
                # a file with umask default permissions.
                tmpdir = tempfile.mkdtemp(prefix="butler-temp-logs-")

                # Construct a file to receive the log records and "touch" it.
                log_file = os.path.join(tmpdir, f"butler-log-{taskDef.label}.json")
                with open(log_file, "w"):
                    pass
                self.log_handler = FileHandler(log_file)
                self.log_handler.setFormatter(JsonLogFormatter())
            else:
                self.log_handler = ButlerLogRecordHandler()

            logging.getLogger().addHandler(self.log_handler)

        # include quantum dataId and task label into MDC
        label = taskDef.label
        if quantum.dataId:
            label += f":{quantum.dataId}"

        ctx = _LogCaptureFlag()
        try:
            with ButlerMDC.set_mdc({"LABEL": label, "RUN": butler.run}):
                yield ctx
        finally:
            # Ensure that the logs are stored in butler.
            self.writeLogRecords(quantum, taskDef, butler, ctx.store)
            if tmpdir:
                shutil.rmtree(tmpdir, ignore_errors=True)

    def checkExistingOutputs(self, quantum, butler, taskDef):
        """Decide whether this quantum needs to be executed.

        If only partial outputs exist then they are removed if
        ``clobberOutputs`` is `True`; otherwise an exception is raised.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum to check for existing outputs.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.

        Returns
        -------
        exist : `bool`
            `True` if ``self.skipExistingIn`` is defined, and a previous
            execution of this quantum appears to have completed successfully
            (either because metadata was written or all datasets were
            written). `False` otherwise.

        Raises
        ------
        RuntimeError
            Raised if some outputs exist and some do not.
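
        Notes
        -----
        A sketch of the intended calling pattern, as used by ``execute``
        (illustrative only):

        .. code-block:: py

            if self.checkExistingOutputs(quantum, butler, taskDef):
                # Outputs already exist; skip this quantum.
                return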

284 """ 

285 if self.skipExistingIn and taskDef.metadataDatasetName is not None: 

286 # Metadata output exists; this is sufficient to assume the previous 

287 # run was successful and should be skipped. 

288 ref = butler.registry.findDataset(taskDef.metadataDatasetName, quantum.dataId, 

289 collections=self.skipExistingIn) 

290 if ref is not None: 

291 if butler.datastore.exists(ref): 

292 return True 

293 

294 # Previously we always checked for existing outputs in `butler.run`, 

295 # now logic gets more complicated as we only want to skip quantum 

296 # whose outputs exist in `self.skipExistingIn` but pruning should only 

297 # be done for outputs existing in `butler.run`. 

298 

299 def findOutputs(collections): 

300 """Find quantum outputs in specified collections. 

301 """ 

302 existingRefs = [] 

303 missingRefs = [] 

304 for datasetRefs in quantum.outputs.values(): 

305 for datasetRef in datasetRefs: 

306 ref = butler.registry.findDataset(datasetRef.datasetType, datasetRef.dataId, 

307 collections=collections) 

308 if ref is not None and butler.datastore.exists(ref): 

309 existingRefs.append(ref) 

310 else: 

311 missingRefs.append(datasetRef) 

312 return existingRefs, missingRefs 

313 

314 existingRefs, missingRefs = findOutputs(self.skipExistingIn) 

315 if self.skipExistingIn: 

316 if existingRefs and not missingRefs: 

317 # everything is already there 

318 return True 

319 

320 # If we are to re-run quantum then prune datasets that exists in 

321 # output run collection, only if `self.clobberOutputs` is set. 

322 if existingRefs: 

323 existingRefs, missingRefs = findOutputs(butler.run) 

324 if existingRefs and missingRefs: 

325 _LOG.debug("Partial outputs exist for task %s dataId=%s collection=%s " 

326 "existingRefs=%s missingRefs=%s", 

327 taskDef, quantum.dataId, butler.run, existingRefs, missingRefs) 

328 if self.clobberOutputs: 

329 # only prune 

330 _LOG.info("Removing partial outputs for task %s: %s", taskDef, existingRefs) 

331 # Do not purge registry records if this looks like 

332 # an execution butler. This ensures that the UUID 

333 # of the dataset doesn't change. 

334 if butler._allow_put_of_predefined_dataset: 

335 purge = False 

336 disassociate = False 

337 else: 

338 purge = True 

339 disassociate = True 

340 butler.pruneDatasets(existingRefs, disassociate=disassociate, unstore=True, purge=purge) 

341 return False 

342 else: 

343 raise RuntimeError(f"Registry inconsistency while checking for existing outputs:" 

344 f" collection={butler.run} existingRefs={existingRefs}" 

345 f" missingRefs={missingRefs}") 

346 

347 # need to re-run 

348 return False 

    def makeTask(self, taskClass, name, config, butler):
        """Make new task instance.

        Parameters
        ----------
        taskClass : `type`
            Sub-class of `~lsst.pipe.base.PipelineTask`.
        name : `str`
            Name for this task.
        config : `~lsst.pipe.base.PipelineTaskConfig`
            Configuration object for this task.
        butler : `~lsst.daf.butler.Butler`
            Data butler.

        Returns
        -------
        task : `~lsst.pipe.base.PipelineTask`
            Instance of ``taskClass`` type.
        """
        # call task factory for that
        return self.taskFactory.makeTask(taskClass, name, config, None, butler)

    def updatedQuantumInputs(self, quantum, butler, taskDef):
        """Update quantum with extra information and return a new updated
        Quantum.

        Some methods may require input DatasetRefs to have a non-None
        ``dataset_id``, but in the case of intermediate datasets it may not
        be filled in during QuantumGraph construction. This method retrieves
        the missing info from the registry.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.

        Returns
        -------
        update : `~lsst.daf.butler.Quantum`
            Updated Quantum instance.
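
        Notes
        -----
        The core resolution step for an unresolved input reference is a
        registry lookup, as in this excerpt from the implementation:

        .. code-block:: py

            resolvedRef = butler.registry.findDataset(
                ref.datasetType, ref.dataId, collections=butler.collections)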

394 """ 

395 anyChanges = False 

396 updatedInputs = defaultdict(list) 

397 for key, refsForDatasetType in quantum.inputs.items(): 

398 newRefsForDatasetType = updatedInputs[key] 

399 for ref in refsForDatasetType: 

400 if ref.id is None: 

401 resolvedRef = butler.registry.findDataset(ref.datasetType, ref.dataId, 

402 collections=butler.collections) 

403 if resolvedRef is None: 

404 _LOG.info("No dataset found for %s", ref) 

405 continue 

406 else: 

407 _LOG.debug("Updated dataset ID for %s", ref) 

408 else: 

409 resolvedRef = ref 

410 # We need to ask datastore if the dataset actually exists 

411 # because the Registry of a local "execution butler" cannot 

412 # know this (because we prepopulate it with all of the datasets 

413 # that might be created). 

414 if butler.datastore.exists(resolvedRef): 

415 newRefsForDatasetType.append(resolvedRef) 

416 if len(newRefsForDatasetType) != len(refsForDatasetType): 

417 anyChanges = True 

418 # If we removed any input datasets, let the task check if it has enough 

419 # to proceed and/or prune related datasets that it also doesn't 

420 # need/produce anymore. It will raise NoWorkFound if it can't run, 

421 # which we'll let propagate up. This is exactly what we run during QG 

422 # generation, because a task shouldn't care whether an input is missing 

423 # because some previous task didn't produce it, or because it just 

424 # wasn't there during QG generation. 

425 updatedInputs = NamedKeyDict[DatasetType, List[DatasetRef]](updatedInputs.items()) 

426 helper = AdjustQuantumHelper(updatedInputs, quantum.outputs) 

427 if anyChanges: 

428 helper.adjust_in_place(taskDef.connections, label=taskDef.label, data_id=quantum.dataId) 

429 return Quantum(taskName=quantum.taskName, 

430 taskClass=quantum.taskClass, 

431 dataId=quantum.dataId, 

432 initInputs=quantum.initInputs, 

433 inputs=helper.inputs, 

434 outputs=helper.outputs 

435 ) 

    def runQuantum(self, task, quantum, taskDef, butler):
        """Execute task on a single quantum.

        Parameters
        ----------
        task : `~lsst.pipe.base.PipelineTask`
            Task object.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
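
        Notes
        -----
        Known failure modes raised by the task are caught and translated
        rather than propagated blindly; in outline (see the implementation
        below for the full handling):

        .. code-block:: py

            try:
                task.runQuantum(butlerQC, inputRefs, outputRefs)
            except NoWorkFound:
                pass  # early exit, not an error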

450 """ 

451 # Create a butler that operates in the context of a quantum 

452 butlerQC = ButlerQuantumContext(butler, quantum) 

453 

454 # Get the input and output references for the task 

455 inputRefs, outputRefs = taskDef.connections.buildDatasetRefs(quantum) 

456 

457 # Call task runQuantum() method. Catch a few known failure modes and 

458 # translate them into specific 

459 try: 

460 task.runQuantum(butlerQC, inputRefs, outputRefs) 

461 except NoWorkFound as err: 

462 # Not an error, just an early exit. 

463 _LOG.info("Task '%s' on quantum %s exited early: %s", 

464 taskDef.label, quantum.dataId, str(err)) 

465 pass 

466 except RepeatableQuantumError as err: 

467 if self.exitOnKnownError: 

468 _LOG.warning("Caught repeatable quantum error for %s (%s):", taskDef, quantum.dataId) 

469 _LOG.warning(err, exc_info=True) 

470 sys.exit(err.EXIT_CODE) 

471 else: 

472 raise 

473 except InvalidQuantumError as err: 

474 _LOG.fatal("Invalid quantum error for %s (%s): %s", taskDef, quantum.dataId) 

475 _LOG.fatal(err, exc_info=True) 

476 sys.exit(err.EXIT_CODE) 

477 

    def writeMetadata(self, quantum, metadata, taskDef, butler):
        """Store task metadata in the butler, if the task declares a
        metadata dataset.
        """
        if taskDef.metadataDatasetName is not None:
            # DatasetRef has to be in the Quantum outputs; can look it up
            # by name.
            try:
                ref = quantum.outputs[taskDef.metadataDatasetName]
            except LookupError as exc:
                raise InvalidQuantumError(
                    f"Quantum outputs is missing metadata dataset type {taskDef.metadataDatasetName};"
                    f" this could happen due to inconsistent options between QuantumGraph generation"
                    f" and execution") from exc
            butler.put(metadata, ref[0])

    def writeLogRecords(self, quantum, taskDef, butler, store):
        """Store captured log records in the butler, if the task declares
        a log output dataset. If ``store`` is `False` the records are
        discarded, but the handler and any temporary file are still
        cleaned up.
        """
        # If we are logging to an external file we must always try to
        # close it.
        filename = None
        if isinstance(self.log_handler, FileHandler):
            filename = self.log_handler.stream.name
            self.log_handler.close()

        if self.log_handler is not None:
            # Remove the handler so we stop accumulating log messages.
            logging.getLogger().removeHandler(self.log_handler)

        try:
            if store and taskDef.logOutputDatasetName is not None and self.log_handler is not None:
                # DatasetRef has to be in the Quantum outputs; can look it
                # up by name.
                try:
                    ref = quantum.outputs[taskDef.logOutputDatasetName]
                except LookupError as exc:
                    raise InvalidQuantumError(
                        f"Quantum outputs is missing log output dataset type"
                        f" {taskDef.logOutputDatasetName}; this could happen due to inconsistent"
                        f" options between QuantumGraph generation and execution") from exc

                if isinstance(self.log_handler, ButlerLogRecordHandler):
                    butler.put(self.log_handler.records, ref[0])

                    # Clear the records in case the handler is reused.
                    self.log_handler.records.clear()
                else:
                    assert filename is not None, "Somehow unable to extract filename from file handler"

                    # Need to ingest this file directly into butler.
                    dataset = FileDataset(path=filename, refs=ref[0])
                    try:
                        butler.ingest(dataset, transfer="move")
                        filename = None
                    except NotImplementedError:
                        # Some datastores can't receive files (e.g. the
                        # in-memory datastore used in testing); for those we
                        # store an empty record list just to have a dataset.
                        # The alternative is to read the file back as a
                        # ButlerLogRecords object and put it.
                        _LOG.info("Log records could not be stored in this butler because the"
                                  " datastore can not ingest files; an empty record list is"
                                  " stored instead.")
                        records = ButlerLogRecords.from_records([])
                        butler.put(records, ref[0])
        finally:
            # Remove the file if it was not ingested.
            if filename is not None:
                try:
                    os.remove(filename)
                except OSError:
                    pass

    def initGlobals(self, quantum, butler):
        """Initialize global state needed for task execution.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Data butler.

        Notes
        -----
        There is an issue with initializing the filters singleton, which is
        done by the instrument; to avoid requiring tasks to do it in
        ``runQuantum()`` we do it here when any dataId has an instrument
        dimension. Also, for now we only allow a single instrument: we
        verify that the instrument names in all dataIds are identical.

        This will need revision when the filter singleton disappears.
        """

        oneInstrument = None
        for datasetRefs in chain(quantum.inputs.values(), quantum.outputs.values()):
            for datasetRef in datasetRefs:
                dataId = datasetRef.dataId
                instrument = dataId.get("instrument")
                if instrument is not None:
                    if oneInstrument is not None:
                        assert instrument == oneInstrument, \
                            "Currently require that only one instrument is used per graph"
                    else:
                        oneInstrument = instrument
                        Instrument.fromName(instrument, butler.registry)