# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

__all__ = ['SingleQuantumExecutor']

# -------------------------------
#  Imports of standard modules --
# -------------------------------
import logging
import os
import sys
import tempfile
import time
from contextlib import contextmanager
from collections import defaultdict
from itertools import chain
from logging import FileHandler
from typing import List

# -----------------------------
#  Imports for other modules --
# -----------------------------
from .quantumGraphExecutor import QuantumExecutor
from lsst.daf.base import PropertyList, PropertySet
from lsst.obs.base import Instrument
from lsst.pipe.base import (
    AdjustQuantumHelper,
    ButlerQuantumContext,
    InvalidQuantumError,
    NoWorkFound,
    RepeatableQuantumError,
    logInfo,
)
from lsst.daf.butler import (
    DatasetRef,
    DatasetType,
    FileDataset,
    NamedKeyDict,
    Quantum,
)
from lsst.daf.butler.core.logging import (
    ButlerLogRecordHandler,
    ButlerLogRecords,
    ButlerMDC,
    JsonLogFormatter,
)

# ----------------------------------
#  Local non-exported definitions --
# ----------------------------------
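# Note: ``__name__.partition(".")[2]`` drops the leading package component
# from the logger name (e.g. ``lsst.ctrl.mpexec.x`` becomes ``ctrl.mpexec.x``).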

_LOG = logging.getLogger(__name__.partition(".")[2])


class _LogCaptureFlag:
    """Simple flag to enable/disable log-to-butler saving."""

    store: bool = True


class SingleQuantumExecutor(QuantumExecutor):
    """Executor class which runs one Quantum at a time.

    Parameters
    ----------
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Instance of a task factory.
    skipExistingIn : `list` [ `str` ], optional
        Accepts a list of collections; if all of a Quantum's outputs already
        exist in the specified list of collections then that Quantum will not
        be rerun.
    clobberOutputs : `bool`, optional
        If `True`, existing outputs in the output run collection will be
        overwritten.  If ``skipExistingIn`` is defined, only outputs from
        failed quanta will be overwritten.
    enableLsstDebug : `bool`, optional
        Enable debugging with the ``lsstDebug`` facility for a task.
    exitOnKnownError : `bool`, optional
        If `True`, call `sys.exit` with the appropriate exit code for special
        known exceptions, after printing a traceback, instead of letting the
        exception propagate up to the caller.  This is always the behavior
        for `InvalidQuantumError`.
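
    Notes
    -----
    A minimal usage sketch (illustrative names; assumes a configured task
    factory and butler, and a quantum with its task definition, e.g. taken
    from a QuantumGraph):

    .. code-block:: py

        executor = SingleQuantumExecutor(taskFactory, skipExistingIn=["u/user/old-run"])
        executor.execute(taskDef, quantum, butler)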

    """

    stream_json_logs = True
    """If `True`, each log record is written to a temporary file which is
    ingested when the quantum completes.  If `False`, the records are
    accumulated in memory and stored in the butler on quantum completion."""
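
    # In-memory accumulation can be selected by flipping this class attribute
    # before execution, e.g. ``SingleQuantumExecutor.stream_json_logs = False``.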

    def __init__(self, taskFactory, skipExistingIn=None, clobberOutputs=False, enableLsstDebug=False,
                 exitOnKnownError=False):
        self.taskFactory = taskFactory
        self.skipExistingIn = skipExistingIn
        self.enableLsstDebug = enableLsstDebug
        self.clobberOutputs = clobberOutputs
        self.exitOnKnownError = exitOnKnownError
        self.log_handler = None

    def execute(self, taskDef, quantum, butler):
        # Docstring inherited from QuantumExecutor.execute
        startTime = time.time()

        with self.captureLogging(taskDef, quantum, butler) as captureLog:

            # Save detailed resource usage before task start to metadata.
            quantumMetadata = PropertyList()
            logInfo(None, "prep", metadata=quantumMetadata)

            taskClass, label, config = taskDef.taskClass, taskDef.label, taskDef.config

            # Check whether to skip or delete old outputs.  If this returns
            # True or raises an exception, do not try to store logs, as they
            # may already be in the butler.
            captureLog.store = False
            if self.checkExistingOutputs(quantum, butler, taskDef):
                _LOG.info("Skipping already-successful quantum for label=%s dataId=%s.", label,
                          quantum.dataId)
                return
            captureLog.store = True

            try:
                quantum = self.updatedQuantumInputs(quantum, butler, taskDef)
            except NoWorkFound as exc:
                _LOG.info("Nothing to do for task '%s' on quantum %s; saving metadata and skipping: %s",
                          taskDef.label, quantum.dataId, str(exc))
                # Make empty metadata that looks something like what a
                # do-nothing task would write (but we don't bother with empty
                # nested PropertySets for subtasks).  This is slightly
                # duplicative with logic in pipe_base that we can't easily
                # call from here; we'll fix this on DM-29761.
                logInfo(None, "end", metadata=quantumMetadata)
                fullMetadata = PropertySet()
                fullMetadata[taskDef.label] = PropertyList()
                fullMetadata["quantum"] = quantumMetadata
                self.writeMetadata(quantum, fullMetadata, taskDef, butler)
                return

            # Enable lsstDebug debugging.
            if self.enableLsstDebug:
                try:
                    _LOG.debug("Will try to import debug.py")
                    import debug  # noqa:F401
                except ImportError:
                    _LOG.warning("No 'debug' module found.")

            # Initialize global state.
            self.initGlobals(quantum, butler)

            # Ensure that we are executing a frozen config.
            config.freeze()
            logInfo(None, "init", metadata=quantumMetadata)
            task = self.makeTask(taskClass, label, config, butler)
            logInfo(None, "start", metadata=quantumMetadata)
            try:
                self.runQuantum(task, quantum, taskDef, butler)
            except Exception:
                _LOG.exception("Execution of task '%s' on quantum %s failed",
                               taskDef.label, quantum.dataId)
                raise
            logInfo(None, "end", metadata=quantumMetadata)
            fullMetadata = task.getFullMetadata()
            fullMetadata["quantum"] = quantumMetadata
            self.writeMetadata(quantum, fullMetadata, taskDef, butler)
            stopTime = time.time()
            _LOG.info("Execution of task '%s' on quantum %s took %.3f seconds",
                      taskDef.label, quantum.dataId, stopTime - startTime)

    @contextmanager
    def captureLogging(self, taskDef, quantum, butler):
        """Configure logging to capture logs for execution of this task.

        Parameters
        ----------
        taskDef : `lsst.pipe.base.TaskDef`
            The task definition.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Butler to write logs to.

        Notes
        -----
        Expected to be used as a context manager to ensure that logging
        records are inserted into the butler once the quantum has been
        executed:

        .. code-block:: py

            with self.captureLogging(taskDef, quantum, butler):
                # Run quantum and capture logs.

        This method can also set up logging to attach task- or
        quantum-specific information to log messages.  Potentially this can
        take into account some info from the task configuration as well.
        """
        # Add a handler to the root logger to capture execution log output.
        # How does it get removed reliably?
        if taskDef.logOutputDatasetName is not None:
            # Either accumulate into ButlerLogRecords or stream
            # JSON records to file and ingest that.
            if self.stream_json_logs:
                tmp = tempfile.NamedTemporaryFile(mode="w",
                                                  suffix=".json",
                                                  prefix=f"butler-log-{taskDef.label}-",
                                                  delete=False)
                self.log_handler = FileHandler(tmp.name)
                tmp.close()
                self.log_handler.setFormatter(JsonLogFormatter())
            else:
                self.log_handler = ButlerLogRecordHandler()

            logging.getLogger().addHandler(self.log_handler)

        # Include quantum dataId and task label in the MDC.
        label = taskDef.label
        if quantum.dataId:
            label += f":{quantum.dataId}"

        ctx = _LogCaptureFlag()
        try:
            with ButlerMDC.set_mdc({"LABEL": label}):
                yield ctx
        finally:
            # Ensure that the logs are stored in butler.
            self.writeLogRecords(quantum, taskDef, butler, ctx.store)

    def checkExistingOutputs(self, quantum, butler, taskDef):
        """Decide whether this quantum needs to be executed.

        If only partial outputs exist then they are removed if
        ``clobberOutputs`` is `True`, otherwise an exception is raised.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum to check for existing outputs.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.

        Returns
        -------
        exist : `bool`
            `True` if ``self.skipExistingIn`` is defined, and a previous
            execution of this quantum appears to have completed successfully
            (either because metadata was written or all datasets were
            written).  `False` otherwise.

        Raises
        ------
        RuntimeError
            Raised if some outputs exist while others do not.
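
        Notes
        -----
        Dataset existence is always verified with the datastore, not just the
        registry; a sketch (using names from the method body) of the check
        applied to each output:

        .. code-block:: py

            ref = butler.registry.findDataset(datasetRef.datasetType, datasetRef.dataId,
                                              collections=collections)
            exists = ref is not None and butler.datastore.exists(ref)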

        """
        if self.skipExistingIn and taskDef.metadataDatasetName is not None:
            # If the metadata dataset exists, that is sufficient to assume
            # the previous run was successful and should be skipped.
            ref = butler.registry.findDataset(taskDef.metadataDatasetName, quantum.dataId,
                                              collections=self.skipExistingIn)
            if ref is not None:
                if butler.datastore.exists(ref):
                    return True

        # Previously we always checked for existing outputs in `butler.run`;
        # now the logic is more complicated, as we only want to skip a
        # quantum whose outputs exist in `self.skipExistingIn`, while pruning
        # should only be done for outputs existing in `butler.run`.


        def findOutputs(collections):
            """Find quantum outputs in specified collections."""
            existingRefs = []
            missingRefs = []
            for datasetRefs in quantum.outputs.values():
                for datasetRef in datasetRefs:
                    ref = butler.registry.findDataset(datasetRef.datasetType, datasetRef.dataId,
                                                      collections=collections)
                    if ref is not None and butler.datastore.exists(ref):
                        existingRefs.append(ref)
                    else:
                        missingRefs.append(datasetRef)
            return existingRefs, missingRefs


        existingRefs, missingRefs = findOutputs(self.skipExistingIn)
        if self.skipExistingIn:
            if existingRefs and not missingRefs:
                # everything is already there
                return True

        # If we are to re-run the quantum, prune datasets that exist in the
        # output run collection, but only if `self.clobberOutputs` is set.
        if existingRefs:
            existingRefs, missingRefs = findOutputs(butler.run)
            if existingRefs and missingRefs:
                _LOG.debug("Partial outputs exist for task %s dataId=%s collection=%s "
                           "existingRefs=%s missingRefs=%s",
                           taskDef, quantum.dataId, butler.run, existingRefs, missingRefs)
                if self.clobberOutputs:
                    # only prune
                    _LOG.info("Removing partial outputs for task %s: %s", taskDef, existingRefs)
                    butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True)
                    return False
                else:
                    raise RuntimeError("Registry inconsistency while checking for existing outputs:"
                                       f" collection={butler.run} existingRefs={existingRefs}"
                                       f" missingRefs={missingRefs}")

        # need to re-run
        return False


    def makeTask(self, taskClass, name, config, butler):
        """Make a new task instance.

        Parameters
        ----------
        taskClass : `type`
            Sub-class of `~lsst.pipe.base.PipelineTask`.
        name : `str`
            Name for this task.
        config : `~lsst.pipe.base.PipelineTaskConfig`
            Configuration object for this task.
        butler : `~lsst.daf.butler.Butler`
            Data butler.

        Returns
        -------
        task : `~lsst.pipe.base.PipelineTask`
            Instance of ``taskClass`` type.
        """
        # call task factory for that
        return self.taskFactory.makeTask(taskClass, name, config, None, butler)


    def updatedQuantumInputs(self, quantum, butler, taskDef):
        """Update the quantum with extra information, returning a new,
        updated Quantum.

        Some methods may require input DatasetRefs to have a non-None
        ``dataset_id``, but in the case of intermediate datasets it may not
        be filled in during QuantumGraph construction.  This method retrieves
        the missing info from the registry.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.

        Returns
        -------
        update : `~lsst.daf.butler.Quantum`
            Updated Quantum instance.
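
        Notes
        -----
        A sketch (using names from the method body) of how an unresolved
        input ref, one with ``ref.id is None``, is resolved against the
        butler's default collections:

        .. code-block:: py

            resolvedRef = butler.registry.findDataset(ref.datasetType, ref.dataId,
                                                      collections=butler.collections)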

        """
        anyChanges = False
        updatedInputs = defaultdict(list)
        for key, refsForDatasetType in quantum.inputs.items():
            newRefsForDatasetType = updatedInputs[key]
            for ref in refsForDatasetType:
                if ref.id is None:
                    resolvedRef = butler.registry.findDataset(ref.datasetType, ref.dataId,
                                                              collections=butler.collections)
                    if resolvedRef is None:
                        _LOG.info("No dataset found for %s", ref)
                        continue
                    else:
                        _LOG.debug("Updated dataset ID for %s", ref)
                else:
                    resolvedRef = ref
                # We need to ask datastore if the dataset actually exists
                # because the Registry of a local "execution butler" cannot
                # know this (because we prepopulate it with all of the
                # datasets that might be created).
                if butler.datastore.exists(resolvedRef):
                    newRefsForDatasetType.append(resolvedRef)
            if len(newRefsForDatasetType) != len(refsForDatasetType):
                anyChanges = True
        # If we removed any input datasets, let the task check if it has
        # enough to proceed and/or prune related datasets that it also
        # doesn't need/produce anymore.  It will raise NoWorkFound if it
        # can't run, which we'll let propagate up.  This is exactly what we
        # run during QG generation, because a task shouldn't care whether an
        # input is missing because some previous task didn't produce it, or
        # because it just wasn't there during QG generation.
        updatedInputs = NamedKeyDict[DatasetType, List[DatasetRef]](updatedInputs.items())
        helper = AdjustQuantumHelper(updatedInputs, quantum.outputs)
        if anyChanges:
            helper.adjust_in_place(taskDef.connections, label=taskDef.label, data_id=quantum.dataId)
        return Quantum(taskName=quantum.taskName,
                       taskClass=quantum.taskClass,
                       dataId=quantum.dataId,
                       initInputs=quantum.initInputs,
                       inputs=helper.inputs,
                       outputs=helper.outputs)

    def runQuantum(self, task, quantum, taskDef, butler):
        """Execute task on a single quantum.

        Parameters
        ----------
        task : `~lsst.pipe.base.PipelineTask`
            Task object.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        """
        # Create a butler that operates in the context of a quantum.
        butlerQC = ButlerQuantumContext(butler, quantum)

        # Get the input and output references for the task.
        inputRefs, outputRefs = taskDef.connections.buildDatasetRefs(quantum)

        # Call the task runQuantum() method.  Catch a few known failure
        # modes and translate them into specific exit codes.
        try:
            task.runQuantum(butlerQC, inputRefs, outputRefs)
        except NoWorkFound as err:
            # Not an error, just an early exit.
            _LOG.info("Task '%s' on quantum %s exited early: %s",
                      taskDef.label, quantum.dataId, str(err))
        except RepeatableQuantumError as err:
            if self.exitOnKnownError:
                _LOG.warning("Caught repeatable quantum error for %s (%s):", taskDef, quantum.dataId)
                _LOG.warning(err, exc_info=True)
                sys.exit(err.EXIT_CODE)
            else:
                raise
        except InvalidQuantumError as err:
            _LOG.fatal("Invalid quantum error for %s (%s):", taskDef, quantum.dataId)
            _LOG.fatal(err, exc_info=True)
            sys.exit(err.EXIT_CODE)


    def writeMetadata(self, quantum, metadata, taskDef, butler):
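        """Store task metadata in the butler.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        metadata : `~lsst.daf.base.PropertySet`
            Full metadata produced by executing the task on this quantum.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        """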

        if taskDef.metadataDatasetName is not None:
            # DatasetRef has to be in the Quantum outputs; we can look it up
            # by dataset type name.
            try:
                ref = quantum.outputs[taskDef.metadataDatasetName]
            except LookupError as exc:
                raise InvalidQuantumError(
                    f"Quantum outputs are missing metadata dataset type {taskDef.metadataDatasetName};"
                    " this could happen due to inconsistent options between QuantumGraph generation"
                    " and execution") from exc
            butler.put(metadata, ref[0])


    def writeLogRecords(self, quantum, taskDef, butler, store):
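        """Store captured log records in the butler, if configured to do so.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        store : `bool`
            If `True`, store the records; if `False`, only clean up the
            logging handler and any temporary file.
        """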

        # If we are logging to an external file we must always try to
        # close it.
        filename = None
        if isinstance(self.log_handler, FileHandler):
            filename = self.log_handler.stream.name
            self.log_handler.close()

        if self.log_handler is not None:
            # Remove the handler so we stop accumulating log messages.
            logging.getLogger().removeHandler(self.log_handler)

        try:
            if store and taskDef.logOutputDatasetName is not None and self.log_handler is not None:
                # DatasetRef has to be in the Quantum outputs; we can look it
                # up by dataset type name.
                try:
                    ref = quantum.outputs[taskDef.logOutputDatasetName]
                except LookupError as exc:
                    raise InvalidQuantumError(
                        "Quantum outputs are missing log output dataset type"
                        f" {taskDef.logOutputDatasetName}; this could happen due to inconsistent"
                        " options between QuantumGraph generation and execution") from exc

                if isinstance(self.log_handler, ButlerLogRecordHandler):
                    butler.put(self.log_handler.records, ref[0])

                    # Clear the records in case the handler is reused.
                    self.log_handler.records.clear()
                else:
                    assert filename is not None, "Somehow unable to extract filename from file handler"

                    # Need to ingest this file directly into butler.
                    dataset = FileDataset(path=filename, refs=ref[0])
                    try:
                        butler.ingest(dataset, transfer="move")
                        filename = None
                    except NotImplementedError:
                        # Some datastores can't receive files (e.g. an
                        # in-memory datastore when testing); we store an
                        # empty record list for those just to have a dataset.
                        # The alternative is to read the file as a
                        # ButlerLogRecords object and put it.
                        _LOG.info("Log records could not be stored in this butler because the"
                                  " datastore cannot ingest files; an empty record list is stored"
                                  " instead.")
                        records = ButlerLogRecords.from_records([])
                        butler.put(records, ref[0])
        finally:
            # Remove the file if it was not ingested.
            if filename is not None:
                try:
                    os.remove(filename)
                except OSError:
                    pass

    def initGlobals(self, quantum, butler):
        """Initialize global state needed for task execution.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Data butler.

        Notes
        -----
        There is an issue with initializing the filters singleton, which is
        done by the instrument; to avoid requiring tasks to do it in
        runQuantum() we do it here when any dataId has an instrument
        dimension.  Also, for now we only allow a single instrument: we
        verify that the instrument names in all dataIds are identical.

        This will need revision when the filter singleton disappears.
        """
        oneInstrument = None
        for datasetRefs in chain(quantum.inputs.values(), quantum.outputs.values()):
            for datasetRef in datasetRefs:
                dataId = datasetRef.dataId
                instrument = dataId.get("instrument")
                if instrument is not None:
                    if oneInstrument is not None:
                        assert instrument == oneInstrument, \
                            "Currently require that only one instrument is used per graph"
                    else:
                        oneInstrument = instrument
                        Instrument.fromName(instrument, butler.registry)