Coverage for python/lsst/ctrl/mpexec/mpGraphExecutor.py: 13%

212 statements  

coverage.py v7.2.1, created at 2023-03-12 01:56 -0800

# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ["MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError"]

# -------------------------------
# Imports of standard modules --
# -------------------------------
from enum import Enum
import gc
import logging
import multiprocessing
import pickle
import sys
import time

from lsst.pipe.base.graph.graph import QuantumGraph
from lsst.pipe.base import InvalidQuantumError

# -----------------------------
# Imports for other modules --
# -----------------------------
from .quantumGraphExecutor import QuantumGraphExecutor
from lsst.base import disableImplicitThreading
from lsst.daf.butler.cli.cliLog import CliLog

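# Note: the logger name below drops the leading component of __name__
# (e.g. "lsst.").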

_LOG = logging.getLogger(__name__.partition(".")[2])


# Possible states for the executing task:
# - PENDING: job has not started yet
# - RUNNING: job is currently executing
# - FINISHED: job finished successfully
# - FAILED: job execution failed (process returned non-zero status)
# - TIMED_OUT: job was killed because it exceeded the execution timeout
# - FAILED_DEP: one of the dependencies of this job has failed/timed out
JobState = Enum("JobState", "PENDING RUNNING FINISHED FAILED TIMED_OUT FAILED_DEP")
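# Note: a TIMED_OUT job is also recorded as failed (see _JobList.setJobState).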


class _Job:
    """Class representing a job running a single task.

    Parameters
    ----------
    qnode : `~lsst.pipe.base.QuantumNode`
        Quantum and some associated information.
    """
    def __init__(self, qnode):
        self.qnode = qnode
        self.process = None
        self._state = JobState.PENDING
        self.started = None

    @property
    def state(self):
        """Job processing state (`JobState`)."""
        return self._state

    def start(self, butler, quantumExecutor, startMethod=None):
        """Start a process which runs the task.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        startMethod : `str`, optional
            Start method from `multiprocessing` module.
        """
        # Unpickling of the quantum has to happen after the butler; this is
        # why it is pickled manually here.
        quantum_pickle = pickle.dumps(self.qnode.quantum)
        taskDef = self.qnode.taskDef
        logConfigState = CliLog.configState
        mp_ctx = multiprocessing.get_context(startMethod)
        self.process = mp_ctx.Process(
            target=_Job._executeJob,
            args=(quantumExecutor, taskDef, quantum_pickle, butler, logConfigState),
            name=f"task-{self.qnode.nodeId.number}"
        )
        self.process.start()
        self.started = time.time()
        self._state = JobState.RUNNING

    @staticmethod
    def _executeJob(quantumExecutor, taskDef, quantum_pickle, butler, logConfigState):
        """Execute a job with arguments.

        Parameters
        ----------
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        quantum_pickle : `bytes`
            Quantum for this task execution in pickled form.
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        logConfigState : `list`
            Logging configuration state from the parent process, replayed in
            the new process if needed.
        """
        if logConfigState and not CliLog.configState:
            # means that we are in a new spawned Python process and we have to
            # re-initialize logging
            CliLog.replayConfigState(logConfigState)

        # Have to reset the connection pool to avoid sharing database
        # connections with the parent process.
        if butler is not None:
            butler.registry.resetConnectionPool()

        quantum = pickle.loads(quantum_pickle)
        quantumExecutor.execute(taskDef, quantum, butler)

    def stop(self):
        """Stop the process."""
        self.process.terminate()
        # give it 1 second to finish or KILL
        for i in range(10):
            time.sleep(0.1)
            if not self.process.is_alive():
                break
        else:
            _LOG.debug("Killing process %s", self.process.name)
            self.process.kill()

    def cleanup(self):
        """Release the process's resources; has to be called for each
        finished process.
        """
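        # Process.close() releases the resources held by the Process object;
        # it can only be called after the child process has exited.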

        if self.process and not self.process.is_alive():
            self.process.close()
            self.process = None

    def __str__(self):
        return f"<{self.qnode.taskDef} dataId={self.qnode.quantum.dataId}>"


class _JobList:
    """Simple list of _Job instances with a few convenience methods.

    Parameters
    ----------
    iterable : iterable of `~lsst.pipe.base.QuantumNode`
        Sequence of Quanta to execute. This has to be ordered according to
        task dependencies.
    """
    def __init__(self, iterable):
        self.jobs = [_Job(qnode) for qnode in iterable]
        self.pending = self.jobs[:]
        self.running = []
        self.finishedNodes = set()
        self.failedNodes = set()
        self.timedOutNodes = set()

    def submit(self, job, butler, quantumExecutor, startMethod=None):
        """Submit one more job for execution.

        Parameters
        ----------
        job : `_Job`
            Job to submit.
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        startMethod : `str`, optional
            Start method from `multiprocessing` module.
        """
        # this will raise if job is not in pending list
        self.pending.remove(job)
        job.start(butler, quantumExecutor, startMethod)
        self.running.append(job)

    def setJobState(self, job, state):
        """Update job state.

        Parameters
        ----------
        job : `_Job`
            Job to update.
        state : `JobState`
            New job state; note that only FINISHED, FAILED, TIMED_OUT, or
            FAILED_DEP states are acceptable.
        """
        allowedStates = (
            JobState.FINISHED,
            JobState.FAILED,
            JobState.TIMED_OUT,
            JobState.FAILED_DEP
        )
        assert state in allowedStates, f"State {state} not allowed here"

        # remove job from pending/running lists
        if job.state == JobState.PENDING:
            self.pending.remove(job)
        elif job.state == JobState.RUNNING:
            self.running.remove(job)

        qnode = job.qnode
        # it should not be in any of these, but just in case
        self.finishedNodes.discard(qnode)
        self.failedNodes.discard(qnode)
        self.timedOutNodes.discard(qnode)

        job._state = state
        if state == JobState.FINISHED:
            self.finishedNodes.add(qnode)
        elif state == JobState.FAILED:
            self.failedNodes.add(qnode)
        elif state == JobState.FAILED_DEP:
            self.failedNodes.add(qnode)
        elif state == JobState.TIMED_OUT:
            self.failedNodes.add(qnode)
            self.timedOutNodes.add(qnode)
        else:
            raise ValueError(f"Unexpected state value: {state}")

    def cleanup(self):
        """Do periodic cleanup for jobs that did not finish correctly.

        If timed-out jobs are killed but take too long to stop, then regular
        cleanup will not work for them. Here we check all timed-out jobs
        periodically and do cleanup if they have managed to die by this time.
        """
        for job in self.jobs:
            if job.state == JobState.TIMED_OUT and job.process is not None:
                job.cleanup()


class MPGraphExecutorError(Exception):
    """Exception class for errors raised by MPGraphExecutor.
    """
    pass


class MPTimeoutError(MPGraphExecutorError):
    """Exception raised when task execution times out.
    """
    pass


class MPGraphExecutor(QuantumGraphExecutor):
    """Implementation of QuantumGraphExecutor using same-host multiprocess
    execution of Quanta.

    Parameters
    ----------
    numProc : `int`
        Number of processes to use for executing tasks.
    timeout : `float`
        Time in seconds to wait for tasks to finish.
    quantumExecutor : `QuantumExecutor`
        Executor for single quantum. For multiprocess-style execution when
        ``numProc`` is greater than one this instance must support pickle.
    startMethod : `str`, optional
        Start method from `multiprocessing` module; `None` selects the best
        one for the current platform.
    failFast : `bool`, optional
        If set to ``True`` then stop processing on first error from any task.
    executionGraphFixup : `ExecutionGraphFixup`, optional
        Instance used for modification of execution graph.
    """
    def __init__(self, numProc, timeout, quantumExecutor, *,
                 startMethod=None, failFast=False, executionGraphFixup=None):
        self.numProc = numProc
        self.timeout = timeout
        self.quantumExecutor = quantumExecutor
        self.failFast = failFast
        self.executionGraphFixup = executionGraphFixup

        # We set the default start method to spawn for macOS and fork for
        # Linux; None for all other platforms, which uses the multiprocessing
        # default.
        if startMethod is None:
            methods = dict(linux="fork", darwin="spawn")
            startMethod = methods.get(sys.platform)
        self.startMethod = startMethod

    def execute(self, graph, butler):
        # Docstring inherited from QuantumGraphExecutor.execute
        graph = self._fixupQuanta(graph)
        if self.numProc > 1:
            self._executeQuantaMP(graph, butler)
        else:
            self._executeQuantaInProcess(graph, butler)

    def _fixupQuanta(self, graph: QuantumGraph):
        """Call fixup code to modify execution graph.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` to modify.

        Returns
        -------
        graph : `QuantumGraph`
            Modified `QuantumGraph`.

        Raises
        ------
        MPGraphExecutorError
            Raised if the execution graph cannot be ordered after
            modification, i.e. it has dependency cycles.
        """
        if not self.executionGraphFixup:
            return graph

        _LOG.debug("Call execution graph fixup method")
        graph = self.executionGraphFixup.fixupQuanta(graph)

        # Detect if there is now a cycle created within the graph
        if graph.findCycle():
            raise MPGraphExecutorError(
                "Updated execution graph has dependency cycle.")

        return graph

    def _executeQuantaInProcess(self, graph, butler):
        """Execute all Quanta in current process.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed.
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        """
        successCount, totalCount = 0, len(graph)
        failedNodes = set()
        for qnode in graph:

            # Any failed inputs mean that the quantum has to be skipped.
            inputNodes = graph.determineInputsToQuantumNode(qnode)
            if inputNodes & failedNodes:
                _LOG.error(
                    "Upstream job failed for task <%s dataId=%s>, skipping this task.",
                    qnode.taskDef,
                    qnode.quantum.dataId,
                )
                failedNodes.add(qnode)
                continue

            _LOG.debug("Executing %s", qnode)
            try:
                self.quantumExecutor.execute(qnode.taskDef, qnode.quantum, butler)
                successCount += 1
            except Exception as exc:
                failedNodes.add(qnode)
                if self.failFast:
                    raise MPGraphExecutorError(
                        f"Task <{qnode.taskDef} dataId={qnode.quantum.dataId}> failed."
                    ) from exc
                else:
                    # Note that there could be exception safety issues, which
                    # we presently ignore.
                    _LOG.error(
                        "Task <%s dataId=%s> failed; processing will continue for remaining tasks.",
                        qnode.taskDef,
                        qnode.quantum.dataId,
                        exc_info=exc,
                    )
            finally:
                # SQLAlchemy has some objects that can last until a garbage
                # collection cycle is run, which can happen at unpredictable
                # times; run a collection loop here explicitly.
                gc.collect()

        _LOG.info(
            "Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
            successCount,
            len(failedNodes),
            totalCount - successCount - len(failedNodes),
            totalCount,
        )

        # Raise an exception if there were any failures.
        if failedNodes:
            raise MPGraphExecutorError("One or more tasks failed during execution.")

    def _executeQuantaMP(self, graph, butler):
        """Execute all Quanta in separate processes.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed.
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        """

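        # Prevent implicit threading in low-level numerical libraries from
        # contending with the worker processes for CPU cores.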

        disableImplicitThreading()  # To prevent thread contention

        _LOG.debug("Using %r for multiprocessing start method", self.startMethod)

        # re-pack input quantum data into jobs list
        jobs = _JobList(graph)

        # check that all tasks can run in sub-process
        for job in jobs.jobs:
            taskDef = job.qnode.taskDef
            if not taskDef.taskClass.canMultiprocess:
                raise MPGraphExecutorError(f"Task {taskDef.taskName} does not support multiprocessing;"
                                           " use single process")

        finishedCount, failedCount = 0, 0
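
        # Main loop: poll running processes for completion, propagate failures
        # to dependent jobs, and start new jobs as worker slots become free.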

        while jobs.pending or jobs.running:

            _LOG.debug("#pendingJobs: %s", len(jobs.pending))
            _LOG.debug("#runningJobs: %s", len(jobs.running))

            # See if any jobs have finished
            for job in jobs.running:
                if not job.process.is_alive():
                    _LOG.debug("finished: %s", job)
                    # finished
                    exitcode = job.process.exitcode
                    if exitcode == 0:
                        jobs.setJobState(job, JobState.FINISHED)
                        job.cleanup()
                        _LOG.debug("success: %s took %.3f seconds", job, time.time() - job.started)
                    else:
                        jobs.setJobState(job, JobState.FAILED)
                        job.cleanup()
                        _LOG.debug("failed: %s", job)
                        if self.failFast or exitcode == InvalidQuantumError.EXIT_CODE:
                            for stopJob in jobs.running:
                                if stopJob is not job:
                                    stopJob.stop()
                            raise MPGraphExecutorError(
                                f"Task {job} failed, exit code={exitcode}."
                            )
                        else:
                            _LOG.error(
                                "Task %s failed; processing will continue for remaining tasks.", job
                            )
                else:
                    # check for timeout
                    now = time.time()
                    if now - job.started > self.timeout:
                        jobs.setJobState(job, JobState.TIMED_OUT)
                        _LOG.debug("Terminating job %s due to timeout", job)
                        job.stop()
                        job.cleanup()
                        if self.failFast:
                            raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
                        else:
                            _LOG.error(
                                "Timeout (%s sec) for task %s; task is killed, processing continues "
                                "for remaining tasks.", self.timeout, job
                            )

            # Fail jobs whose inputs failed; this may need several iterations
            # if the order is not right, remaining ones will be handled in the
            # next loop iteration.
            if jobs.failedNodes:
                for job in jobs.pending:
                    jobInputNodes = graph.determineInputsToQuantumNode(job.qnode)
                    if jobInputNodes & jobs.failedNodes:
                        jobs.setJobState(job, JobState.FAILED_DEP)
                        _LOG.error("Upstream job failed for task %s, skipping this task.", job)

            # see if we can start more jobs
            if len(jobs.running) < self.numProc:
                for job in jobs.pending:
                    jobInputNodes = graph.determineInputsToQuantumNode(job.qnode)
                    if jobInputNodes <= jobs.finishedNodes:
                        # all dependencies have completed, can start new job
                        if len(jobs.running) < self.numProc:
                            _LOG.debug("Submitting %s", job)
                            jobs.submit(job, butler, self.quantumExecutor, self.startMethod)
                        if len(jobs.running) >= self.numProc:
                            # Cannot start any more jobs, wait until something
                            # finishes.
                            break

            # Do cleanup for timed out jobs if necessary.
            jobs.cleanup()

            # Print progress message if something changed.
            newFinished, newFailed = len(jobs.finishedNodes), len(jobs.failedNodes)
            if (finishedCount, failedCount) != (newFinished, newFailed):
                finishedCount, failedCount = newFinished, newFailed
                totalCount = len(jobs.jobs)
                _LOG.info("Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
                          finishedCount, failedCount, totalCount - finishedCount - failedCount, totalCount)

            # Here we want to wait until one of the running jobs completes
            # but multiprocessing does not provide an API for that, for now
            # just sleep a little bit and go back to the loop.
            if jobs.running:
                time.sleep(0.1)

        if jobs.failedNodes:
            # print list of failed jobs
            _LOG.error("Failed jobs:")
            for job in jobs.jobs:
                if job.state != JobState.FINISHED:
                    _LOG.error(" - %s: %s", job.state.name, job)

            # if any job failed raise an exception
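            # (MPTimeoutError is raised only when every failure was a timeout)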

            if jobs.failedNodes == jobs.timedOutNodes:
                raise MPTimeoutError("One or more tasks timed out during execution.")
            else:
                raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")