# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ['SingleQuantumExecutor']

# -------------------------------
#  Imports of standard modules --
# -------------------------------
import logging
import os
import sys
import tempfile
import time
from contextlib import contextmanager
from collections import defaultdict
from itertools import chain
from logging import FileHandler
from typing import List

# -----------------------------
#  Imports for other modules --
# -----------------------------
from .quantumGraphExecutor import QuantumExecutor
from lsst.daf.base import PropertyList, PropertySet
from lsst.obs.base import Instrument
from lsst.pipe.base import (
    AdjustQuantumHelper,
    ButlerQuantumContext,
    InvalidQuantumError,
    NoWorkFound,
    RepeatableQuantumError,
    logInfo,
)
from lsst.daf.butler import (
    DatasetRef,
    DatasetType,
    FileDataset,
    NamedKeyDict,
    Quantum,
)
from lsst.daf.butler.core.logging import (
    ButlerLogRecordHandler,
    ButlerLogRecords,
    ButlerMDC,
    JsonLogFormatter,
)

# ----------------------------------
#  Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


class _LogCaptureFlag:
    """Simple flag to enable/disable log-to-butler saving."""
    store: bool = True


class SingleQuantumExecutor(QuantumExecutor):
    """Executor class which runs one Quantum at a time.

    Parameters
    ----------
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Instance of a task factory.
    skipExisting : `bool`, optional
        If `True`, then quanta that succeeded will not be rerun.
    clobberOutputs : `bool`, optional
        If `True`, then existing outputs will be overwritten. If
        `skipExisting` is also `True`, only outputs from failed quanta will
        be overwritten.
    enableLsstDebug : `bool`, optional
        Enable debugging with the ``lsstDebug`` facility for a task.
    exitOnKnownError : `bool`, optional
        If `True`, call `sys.exit` with the appropriate exit code for
        special known exceptions, after printing a traceback, instead of
        letting the exception propagate up to the calling code. This is
        always the behavior for `InvalidQuantumError`.
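
    Notes
    -----
    A minimal usage sketch (``butler``, ``taskFactory``, and the sequence of
    quanta are assumed to exist already; this is illustrative, not a
    prescribed workflow):

    .. code-block:: py

        executor = SingleQuantumExecutor(taskFactory, skipExisting=True)
        for taskDef, quantum in quanta:
            executor.execute(taskDef, quantum, butler)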

100 """ 

101 

102 stream_json_logs = True 

103 """If True each log record is written to a temporary file and ingested 

104 when quantum completes. If False the records are accumulated in memory 

105 and stored in butler on quantum completion.""" 

106 

    def __init__(self, taskFactory, skipExisting=False, clobberOutputs=False,
                 enableLsstDebug=False, exitOnKnownError=False):
        self.taskFactory = taskFactory
        self.skipExisting = skipExisting
        self.enableLsstDebug = enableLsstDebug
        self.clobberOutputs = clobberOutputs
        self.exitOnKnownError = exitOnKnownError
        self.log_handler = None

    def execute(self, taskDef, quantum, butler):
        # Docstring inherited from QuantumExecutor.execute
        startTime = time.time()

        with self.captureLogging(taskDef, quantum, butler) as captureLog:

            # Save detailed resource usage before task start to metadata.
            quantumMetadata = PropertyList()
            logInfo(None, "prep", metadata=quantumMetadata)

            taskClass, label, config = taskDef.taskClass, taskDef.label, taskDef.config

            # Check whether to skip or delete old outputs. If the check
            # returns True or raises an exception, do not try to store
            # logs, as they may already be in butler.
            captureLog.store = False
            if self.checkExistingOutputs(quantum, butler, taskDef):
                _LOG.info("Skipping already-successful quantum for label=%s dataId=%s.",
                          label, quantum.dataId)
                return
            captureLog.store = True

            try:
                quantum = self.updatedQuantumInputs(quantum, butler, taskDef)
            except NoWorkFound as exc:
                _LOG.info("Nothing to do for task '%s' on quantum %s; saving metadata and skipping: %s",
                          taskDef.label, quantum.dataId, str(exc))
                # Make empty metadata that looks something like what a
                # do-nothing task would write (but we don't bother with
                # empty nested PropertySets for subtasks). This is slightly
                # duplicative with logic in pipe_base that we can't easily
                # call from here; we'll fix this on DM-29761.
                logInfo(None, "end", metadata=quantumMetadata)
                fullMetadata = PropertySet()
                fullMetadata[taskDef.label] = PropertyList()
                fullMetadata["quantum"] = quantumMetadata
                self.writeMetadata(quantum, fullMetadata, taskDef, butler)
                return

            # Enable lsstDebug debugging.
            if self.enableLsstDebug:
                try:
                    _LOG.debug("Will try to import debug.py")
                    import debug  # noqa:F401
                except ImportError:
                    _LOG.warning("No 'debug' module found.")

            # Initialize global state.
            self.initGlobals(quantum, butler)

            # Ensure that we are executing a frozen config.
            config.freeze()
            logInfo(None, "init", metadata=quantumMetadata)
            task = self.makeTask(taskClass, label, config, butler)
            logInfo(None, "start", metadata=quantumMetadata)
            try:
                self.runQuantum(task, quantum, taskDef, butler)
            except Exception:
                _LOG.exception("Execution of task '%s' on quantum %s failed",
                               taskDef.label, quantum.dataId)
                raise
            logInfo(None, "end", metadata=quantumMetadata)
            fullMetadata = task.getFullMetadata()
            fullMetadata["quantum"] = quantumMetadata
            self.writeMetadata(quantum, fullMetadata, taskDef, butler)
        stopTime = time.time()
        _LOG.info("Execution of task '%s' on quantum %s took %.3f seconds",
                  taskDef.label, quantum.dataId, stopTime - startTime)

    @contextmanager
    def captureLogging(self, taskDef, quantum, butler):
        """Configure the logging system to capture logs for execution of
        this task.

        Parameters
        ----------
        taskDef : `lsst.pipe.base.TaskDef`
            The task definition.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Butler to write logs to.

        Notes
        -----
        Expected to be used as a context manager to ensure that logging
        records are inserted into the butler once the quantum has been
        executed:

        .. code-block:: py

            with self.captureLogging(taskDef, quantum, butler):
                # Run quantum and capture logs.

        This method can also set up logging to attach task- or
        quantum-specific information to log messages. Potentially this can
        take into account some info from the task configuration as well.
        """
        # Add a handler to the root logger to capture execution log output;
        # it is removed again in writeLogRecords().
        if taskDef.logOutputDatasetName is not None:
            # Either accumulate into ButlerLogRecords or stream
            # JSON records to file and ingest that.
            if self.stream_json_logs:
                tmp = tempfile.NamedTemporaryFile(mode="w",
                                                  suffix=".json",
                                                  prefix=f"butler-log-{taskDef.label}-",
                                                  delete=False)
                self.log_handler = FileHandler(tmp.name)
                tmp.close()
                self.log_handler.setFormatter(JsonLogFormatter())
            else:
                self.log_handler = ButlerLogRecordHandler()

            logging.getLogger().addHandler(self.log_handler)

        # Include the quantum dataId and task label in the MDC.
        label = taskDef.label
        if quantum.dataId:
            label += f":{quantum.dataId}"

        ctx = _LogCaptureFlag()
        try:
            with ButlerMDC.set_mdc({"LABEL": label}):
                yield ctx
        finally:
            # Ensure that the logs are stored in butler.
            self.writeLogRecords(quantum, taskDef, butler, ctx.store)

    def checkExistingOutputs(self, quantum, butler, taskDef):
        """Decide whether this quantum needs to be executed.

        If only partial outputs exist then they are removed if
        ``clobberOutputs`` is `True`, otherwise an exception is raised.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum to check for existing outputs.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.

        Returns
        -------
        exist : `bool`
            `True` if ``self.skipExisting`` is `True` and a previous
            execution of this quantum appears to have completed successfully
            (either because metadata was written or all datasets were
            written). `False` otherwise.

        Raises
        ------
        RuntimeError
            Raised if some outputs exist while others do not.
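
        Notes
        -----
        The cheapest check is the task metadata dataset: if
        ``skipExisting`` is set and the metadata output already exists in
        the datastore, the quantum is assumed to have succeeded. A sketch
        of that check, mirroring the implementation below:

        .. code-block:: py

            ref = butler.registry.findDataset(taskDef.metadataDatasetName,
                                              quantum.dataId)
            if ref is not None and butler.datastore.exists(ref):
                return True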

271 """ 

        collection = butler.run
        registry = butler.registry

        if self.skipExisting and taskDef.metadataDatasetName is not None:
            # Metadata output exists; this is sufficient to assume the
            # previous run was successful and should be skipped.
            if (ref := registry.findDataset(taskDef.metadataDatasetName, quantum.dataId)) is not None:
                if butler.datastore.exists(ref):
                    return True

        existingRefs = []
        missingRefs = []
        for datasetRefs in quantum.outputs.values():
            for datasetRef in datasetRefs:
                ref = registry.findDataset(datasetRef.datasetType, datasetRef.dataId,
                                           collections=butler.run)
                if ref is None:
                    missingRefs.append(datasetRef)
                else:
                    if butler.datastore.exists(ref):
                        existingRefs.append(ref)
                    else:
                        missingRefs.append(datasetRef)
        if existingRefs and missingRefs:
            # Some outputs exist and some don't; either delete the existing
            # ones or complain.
            _LOG.debug("Partial outputs exist for task %s dataId=%s collection=%s "
                       "existingRefs=%s missingRefs=%s",
                       taskDef, quantum.dataId, collection, existingRefs, missingRefs)
            if self.clobberOutputs:
                _LOG.info("Removing partial outputs for task %s: %s", taskDef, existingRefs)
                butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True)
                return False
            else:
                raise RuntimeError("Registry inconsistency while checking for existing outputs:"
                                   f" collection={collection} existingRefs={existingRefs}"
                                   f" missingRefs={missingRefs}")
        elif existingRefs:
            # Complete outputs exist; this is fine only if skipExisting is set.
            return self.skipExisting
        else:
            # No outputs exist.
            return False

    def makeTask(self, taskClass, name, config, butler):
        """Make new task instance.

        Parameters
        ----------
        taskClass : `type`
            Sub-class of `~lsst.pipe.base.PipelineTask`.
        name : `str`
            Name for this task.
        config : `~lsst.pipe.base.PipelineTaskConfig`
            Configuration object for this task.
        butler : `~lsst.daf.butler.Butler`
            Data butler.

        Returns
        -------
        task : `~lsst.pipe.base.PipelineTask`
            Instance of ``taskClass`` type.
        """
        # Call the task factory for that.
        return self.taskFactory.makeTask(taskClass, name, config, None, butler)

    def updatedQuantumInputs(self, quantum, butler, taskDef):
        """Update a quantum with extra information, returning a new updated
        Quantum.

        Some methods may require input DatasetRefs to have a non-None
        ``dataset_id``, but for intermediate datasets it may not be filled
        in during QuantumGraph construction. This method retrieves the
        missing info from the registry.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.

        Returns
        -------
        update : `~lsst.daf.butler.Quantum`
            Updated Quantum instance.
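
        Notes
        -----
        A sketch of the per-input resolution this method performs (names
        follow the implementation below; ``ref`` stands for a single input
        `~lsst.daf.butler.DatasetRef`):

        .. code-block:: py

            if ref.id is None:
                # Intermediate dataset: resolve it against the registry.
                ref = butler.registry.findDataset(ref.datasetType, ref.dataId,
                                                  collections=butler.collections)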

360 """ 

        anyChanges = False
        updatedInputs = defaultdict(list)
        for key, refsForDatasetType in quantum.inputs.items():
            newRefsForDatasetType = updatedInputs[key]
            for ref in refsForDatasetType:
                if ref.id is None:
                    resolvedRef = butler.registry.findDataset(ref.datasetType, ref.dataId,
                                                              collections=butler.collections)
                    if resolvedRef is None:
                        _LOG.info("No dataset found for %s", ref)
                        continue
                    else:
                        _LOG.debug("Updated dataset ID for %s", ref)
                else:
                    resolvedRef = ref
                # We need to ask datastore if the dataset actually exists
                # because the Registry of a local "execution butler" cannot
                # know this (because we prepopulate it with all of the
                # datasets that might be created).
                if butler.datastore.exists(resolvedRef):
                    newRefsForDatasetType.append(resolvedRef)
            if len(newRefsForDatasetType) != len(refsForDatasetType):
                anyChanges = True
        # If we removed any input datasets, let the task check if it has
        # enough to proceed and/or prune related datasets that it also
        # doesn't need/produce anymore. It will raise NoWorkFound if it
        # can't run, which we'll let propagate up. This is exactly what we
        # run during QG generation, because a task shouldn't care whether an
        # input is missing because some previous task didn't produce it, or
        # because it just wasn't there during QG generation.
        updatedInputs = NamedKeyDict[DatasetType, List[DatasetRef]](updatedInputs.items())
        helper = AdjustQuantumHelper(updatedInputs, quantum.outputs)
        if anyChanges:
            helper.adjust_in_place(taskDef.connections, label=taskDef.label, data_id=quantum.dataId)
        return Quantum(taskName=quantum.taskName,
                       taskClass=quantum.taskClass,
                       dataId=quantum.dataId,
                       initInputs=quantum.initInputs,
                       inputs=helper.inputs,
                       outputs=helper.outputs)

    def runQuantum(self, task, quantum, taskDef, butler):
        """Execute task on a single quantum.

        Parameters
        ----------
        task : `~lsst.pipe.base.PipelineTask`
            Task object.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
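
        Notes
        -----
        Known failure modes raised by the task receive special handling
        here: `NoWorkFound` is treated as an early exit rather than a
        failure, while `RepeatableQuantumError` and `InvalidQuantumError`
        may terminate the process with a dedicated exit code. A hypothetical
        task illustrating the early-exit path (the ``"exposures"``
        connection name is made up for this example):

        .. code-block:: py

            class MyTask(PipelineTask):
                def runQuantum(self, butlerQC, inputRefs, outputRefs):
                    inputs = butlerQC.get(inputRefs)
                    if not inputs["exposures"]:
                        raise NoWorkFound("No exposures to process.")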

416 """ 

        # Create a butler that operates in the context of a quantum.
        butlerQC = ButlerQuantumContext(butler, quantum)

        # Get the input and output references for the task.
        inputRefs, outputRefs = taskDef.connections.buildDatasetRefs(quantum)

        # Call task runQuantum() method. Catch a few known failure modes
        # and translate them into specific exit codes.
        try:
            task.runQuantum(butlerQC, inputRefs, outputRefs)
        except NoWorkFound as err:
            # Not an error, just an early exit.
            _LOG.info("Task '%s' on quantum %s exited early: %s",
                      taskDef.label, quantum.dataId, str(err))
        except RepeatableQuantumError as err:
            if self.exitOnKnownError:
                _LOG.warning("Caught repeatable quantum error for %s (%s):", taskDef, quantum.dataId)
                _LOG.warning(err, exc_info=True)
                sys.exit(err.EXIT_CODE)
            else:
                raise
        except InvalidQuantumError as err:
            _LOG.fatal("Invalid quantum error for %s (%s):", taskDef, quantum.dataId)
            _LOG.fatal(err, exc_info=True)
            sys.exit(err.EXIT_CODE)

    def writeMetadata(self, quantum, metadata, taskDef, butler):
        """Store the full task metadata in butler, if this task defines a
        metadata dataset.
        """
        if taskDef.metadataDatasetName is not None:
            # DatasetRef has to be in the Quantum outputs; can look it up
            # by name.
            try:
                ref = quantum.outputs[taskDef.metadataDatasetName]
            except LookupError as exc:
                raise InvalidQuantumError(
                    f"Quantum outputs is missing metadata dataset type {taskDef.metadataDatasetName};"
                    " this could happen due to inconsistent options between QuantumGraph generation"
                    " and execution") from exc
            butler.put(metadata, ref[0])

    def writeLogRecords(self, quantum, taskDef, butler, store):
        """Store captured log records in butler and clean up the log
        handler, removing the temporary file if it was not ingested.
        """
        # If we are logging to an external file we must always try to
        # close it.
        filename = None
        if isinstance(self.log_handler, FileHandler):
            filename = self.log_handler.stream.name
            self.log_handler.close()

        if self.log_handler is not None:
            # Remove the handler so we stop accumulating log messages.
            logging.getLogger().removeHandler(self.log_handler)

        try:
            if store and taskDef.logOutputDatasetName is not None and self.log_handler is not None:
                # DatasetRef has to be in the Quantum outputs; can look it
                # up by name.
                try:
                    ref = quantum.outputs[taskDef.logOutputDatasetName]
                except LookupError as exc:
                    raise InvalidQuantumError(
                        f"Quantum outputs is missing log output dataset type"
                        f" {taskDef.logOutputDatasetName}; this could happen due to inconsistent"
                        " options between QuantumGraph generation and execution") from exc

                if isinstance(self.log_handler, ButlerLogRecordHandler):
                    butler.put(self.log_handler.records, ref[0])

                    # Clear the records in case the handler is reused.
                    self.log_handler.records.clear()
                else:
                    assert filename is not None, "Somehow unable to extract filename from file handler"

                    # Need to ingest this file directly into butler.
                    dataset = FileDataset(path=filename, refs=ref[0])
                    try:
                        butler.ingest(dataset, transfer="move")
                        filename = None
                    except NotImplementedError:
                        # Some datastores can't receive files (e.g. the
                        # in-memory datastore when testing); we store an
                        # empty record list for those just to have a
                        # dataset. An alternative is to read the file as a
                        # ButlerLogRecords object and put it.
                        _LOG.info("Log records could not be stored in this butler because the"
                                  " datastore can not ingest files; an empty record list is stored"
                                  " instead.")
                        records = ButlerLogRecords.from_records([])
                        butler.put(records, ref[0])
        finally:
            # Remove the file if it was not ingested.
            if filename is not None:
                try:
                    os.remove(filename)
                except OSError:
                    pass

    def initGlobals(self, quantum, butler):
        """Initialize global state needed for task execution.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Data butler.

        Notes
        -----
        There is an issue with initializing the filters singleton, which is
        done by the instrument. To avoid requiring tasks to do it in
        ``runQuantum()`` we do it here when any dataId has an instrument
        dimension. Also, for now we only allow a single instrument: we
        verify that all instrument names in all dataIds are identical.

        This will need revision when the filter singleton disappears.
        """
        oneInstrument = None
        for datasetRefs in chain(quantum.inputs.values(), quantum.outputs.values()):
            for datasetRef in datasetRefs:
                dataId = datasetRef.dataId
                instrument = dataId.get("instrument")
                if instrument is not None:
                    if oneInstrument is not None:
                        assert instrument == oneInstrument, \
                            "Currently require that only one instrument is used per graph"
                    else:
                        oneInstrument = instrument
                        Instrument.fromName(instrument, butler.registry)