Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22__all__ = ["MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError"] 

23 

24# ------------------------------- 

25# Imports of standard modules -- 

26# ------------------------------- 

27from enum import Enum 

28import logging 

29import multiprocessing 

30import pickle 

31import sys 

32import time 

33 

34from lsst.pipe.base.graph.graph import QuantumGraph 

35 

36# ----------------------------- 

37# Imports for other modules -- 

38# ----------------------------- 

39from .quantumGraphExecutor import QuantumGraphExecutor 

40from lsst.base import disableImplicitThreading 

41from lsst.daf.butler.cli.cliLog import CliLog 

42 

43_LOG = logging.getLogger(__name__.partition(".")[2]) 

44 

45 

class JobState(Enum):
    """Possible states for the executing task."""

    PENDING = 1      # job has not started yet
    RUNNING = 2      # job is currently executing
    FINISHED = 3     # job finished successfully
    FAILED = 4       # job execution failed (process returned non-zero status)
    TIMED_OUT = 5    # job is killed due to too long execution time
    FAILED_DEP = 6   # one of the dependencies of this job has failed/timed out

54 

55 

class _Job:
    """Class representing a job running single task.

    Parameters
    ----------
    qnode : `~lsst.pipe.base.QuantumNode`
        Quantum and some associated information.
    """
    def __init__(self, qnode):
        self.qnode = qnode
        # Child process executing the quantum; created by `start()`,
        # released (set back to None) by `cleanup()`.
        self.process = None
        # Current `JobState` of this job.
        self.state = JobState.PENDING
        # Wall-clock time (`time.time()`) at which the process was started;
        # `None` until `start()` is called. Used for timeout checks.
        self.started = None

    def start(self, butler, quantumExecutor, startMethod=None):
        """Start process which runs the task.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        startMethod : `str`, optional
            Start method from `multiprocessing` module; `None` uses the
            platform default context.
        """
        # Butler can have live database connections which is a problem with
        # fork-type activation. Make a pickle of butler to pass that across
        # fork. Unpickling of quantum has to happen after butler, this is why
        # it is pickled manually here.
        butler_pickle = pickle.dumps(butler)
        quantum_pickle = pickle.dumps(self.qnode.quantum)
        taskDef = self.qnode.taskDef
        # Snapshot the parent's logging configuration so a spawn-started
        # child process can replay it (see _executeJob).
        logConfigState = CliLog.configState
        mp_ctx = multiprocessing.get_context(startMethod)
        self.process = mp_ctx.Process(
            target=_Job._executeJob,
            args=(quantumExecutor, taskDef, quantum_pickle, butler_pickle, logConfigState),
            name=f"task-{self.qnode.nodeId.number}"
        )
        self.process.start()
        self.started = time.time()
        self.state = JobState.RUNNING

    @staticmethod
    def _executeJob(quantumExecutor, taskDef, quantum_pickle, butler_pickle, logConfigState):
        """Execute a job with arguments; runs in the child process.

        Parameters
        ----------
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure (passed unpickled, unlike the quantum
            and butler).
        quantum_pickle : `bytes`
            Quantum for this task execution in pickled form.
        butler_pickle : `bytes`
            Data butler instance in pickled form.
        logConfigState : `list`
            Logging configuration state captured in the parent process via
            `CliLog.configState`; replayed here when running in a freshly
            spawned process.
        """
        if logConfigState and not CliLog.configState:
            # means that we are in a new spawned Python process and we have to
            # re-initialize logging
            CliLog.replayConfigState(logConfigState)

        # Unpickle butler before quantum -- see the ordering note in start().
        butler = pickle.loads(butler_pickle)
        quantum = pickle.loads(quantum_pickle)
        quantumExecutor.execute(taskDef, quantum, butler)

    def stop(self):
        """Stop the process, escalating from terminate to kill.
        """
        self.process.terminate()
        # give it 1 second to finish or KILL
        for i in range(10):
            time.sleep(0.1)
            if not self.process.is_alive():
                break
        else:
            # Process ignored SIGTERM for a full second; force SIGKILL.
            _LOG.debug("Killing process %s", self.process.name)
            self.process.kill()

    def cleanup(self):
        """Release processes resources, has to be called for each finished
        process.
        """
        # close() raises if the process is still alive, so guard on both.
        if self.process and not self.process.is_alive():
            self.process.close()
            self.process = None

    def __str__(self):
        return f"<{self.qnode.taskDef} dataId={self.qnode.quantum.dataId}>"

147 

148 

class _JobList:
    """Simple list of _Job instances with few convenience methods.

    Parameters
    ----------
    iterable : iterable of `~lsst.pipe.base.QuantumIterData`
        Sequence of Quanta to execute. This has to be ordered according to
        task dependencies.
    """
    def __init__(self, iterable):
        self.jobs = [_Job(qnode) for qnode in iterable]

    def pending(self):
        """Return list of jobs that wait for execution.

        Returns
        -------
        jobs : `list` [`_Job`]
            Jobs still in the PENDING state.
        """
        return [j for j in self.jobs if j.state is JobState.PENDING]

    def running(self):
        """Return list of jobs that are executing.

        Returns
        -------
        jobs : `list` [`_Job`]
            Jobs currently in the RUNNING state.
        """
        return [j for j in self.jobs if j.state is JobState.RUNNING]

    def finishedNodes(self):
        """Return set of QuantumNodes that finished successfully (not failed).

        Returns
        -------
        nodes : `set` [`~lsst.pipe.base.QuantumNode`]
            QuantumNodes whose jobs reached the FINISHED state.
        """
        return {j.qnode for j in self.jobs if j.state is JobState.FINISHED}

    def failedNodes(self):
        """Return set of QuantumNodes that failed for any reason.

        Returns
        -------
        nodes : `set` [`~lsst.pipe.base.QuantumNode`]
            QuantumNodes whose jobs failed, timed out, or were skipped
            because an upstream dependency failed.
        """
        badStates = (JobState.FAILED, JobState.FAILED_DEP, JobState.TIMED_OUT)
        return {j.qnode for j in self.jobs if j.state in badStates}

    def timedOutIds(self):
        """Return set of QuantumNodes whose jobs timed out.

        Returns
        -------
        nodes : `set` [`~lsst.pipe.base.QuantumNode`]
            QuantumNodes whose jobs reached the TIMED_OUT state.
        """
        return {j.qnode for j in self.jobs if j.state is JobState.TIMED_OUT}

    def cleanup(self):
        """Do periodic cleanup for jobs that did not finish correctly.

        If timed out jobs are killed but take too long to stop then regular
        cleanup will not work for them. Here we check all timed out jobs
        periodically and do cleanup if they managed to die by this time.
        """
        for job in self.jobs:
            if job.process is not None and job.state is JobState.TIMED_OUT:
                job.cleanup()

222 

223 

class MPGraphExecutorError(Exception):
    """Exception class for errors raised by MPGraphExecutor."""

228 

229 

class MPTimeoutError(MPGraphExecutorError):
    """Exception raised when task execution times out."""

234 

235 

class MPGraphExecutor(QuantumGraphExecutor):
    """Implementation of QuantumGraphExecutor using same-host multiprocess
    execution of Quanta.

    Parameters
    ----------
    numProc : `int`
        Number of processes to use for executing tasks.
    timeout : `float`
        Time in seconds to wait for tasks to finish.
    quantumExecutor : `QuantumExecutor`
        Executor for single quantum. For multiprocess-style execution when
        ``numProc`` is greater than one this instance must support pickle.
    startMethod : `str`, optional
        Start method from `multiprocessing` module, `None` selects the best
        one for current platform.
    failFast : `bool`, optional
        If set to ``True`` then stop processing on first error from any task.
    executionGraphFixup : `ExecutionGraphFixup`, optional
        Instance used for modification of execution graph.
    """
    def __init__(self, numProc, timeout, quantumExecutor, *,
                 startMethod=None, failFast=False, executionGraphFixup=None):
        self.numProc = numProc
        self.timeout = timeout
        self.quantumExecutor = quantumExecutor
        self.failFast = failFast
        self.executionGraphFixup = executionGraphFixup

        # We set default start method as spawn for MacOS and fork for Linux;
        # None for all other platforms to use multiprocessing default.
        if startMethod is None:
            methods = dict(linux="fork", darwin="spawn")
            startMethod = methods.get(sys.platform)
        self.startMethod = startMethod

    def execute(self, graph, butler):
        # Docstring inherited from QuantumGraphExecutor.execute
        graph = self._fixupQuanta(graph)
        if self.numProc > 1:
            self._executeQuantaMP(graph, butler)
        else:
            self._executeQuantaInProcess(graph, butler)

    def _fixupQuanta(self, graph: QuantumGraph):
        """Call fixup code to modify execution graph.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` to modify

        Returns
        -------
        graph : `QuantumGraph`
            Modified `QuantumGraph`.

        Raises
        ------
        MPGraphExecutorError
            Raised if execution graph cannot be ordered after modification,
            i.e. it has dependency cycles.
        """
        if not self.executionGraphFixup:
            return graph

        _LOG.debug("Call execution graph fixup method")
        graph = self.executionGraphFixup.fixupQuanta(graph)

        # Detect if there is now a cycle created within the graph
        if graph.findCycle():
            raise MPGraphExecutorError(
                "Updated execution graph has dependency cycle.")

        return graph

    def _executeQuantaInProcess(self, graph, butler):
        """Execute all Quanta in current process.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed
        butler : `lsst.daf.butler.Butler`
            Data butler instance
        """
        # Note that in non-MP case any failed task will generate an exception
        # and kill the whole thing. In general we cannot guarantee exception
        # safety so easiest and safest thing is to let it die.
        count, totalCount = 0, len(graph)
        for qnode in graph:
            _LOG.debug("Executing %s", qnode)
            self.quantumExecutor.execute(qnode.taskDef, qnode.quantum, butler)
            count += 1
            _LOG.info("Executed %d quanta, %d remain out of total %d quanta.",
                      count, totalCount - count, totalCount)

    def _executeQuantaMP(self, graph, butler):
        """Execute all Quanta in separate processes.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed.
        butler : `lsst.daf.butler.Butler`
            Data butler instance

        Raises
        ------
        MPGraphExecutorError
            Raised if any task does not support multiprocessing, or after
            all jobs are processed if at least one job failed for a reason
            other than a timeout (raised immediately if ``failFast`` is set).
        MPTimeoutError
            Raised if the only failures were timeouts (raised immediately on
            the first timeout if ``failFast`` is set).
        """

        disableImplicitThreading()  # To prevent thread contention

        _LOG.debug("Using %r for multiprocessing start method", self.startMethod)

        # re-pack input quantum data into jobs list
        jobs = _JobList(graph)

        # check that all tasks can run in sub-process
        for job in jobs.jobs:
            taskDef = job.qnode.taskDef
            if not taskDef.taskClass.canMultiprocess:
                raise MPGraphExecutorError(f"Task {taskDef.taskName} does not support multiprocessing;"
                                           " use single process")

        finished, failed = 0, 0
        while jobs.pending() or jobs.running():

            _LOG.debug("#pendingJobs: %s", len(jobs.pending()))
            _LOG.debug("#runningJobs: %s", len(jobs.running()))

            # See if any jobs have finished
            for job in jobs.running():
                if not job.process.is_alive():
                    _LOG.debug("finished: %s", job)
                    # finished
                    exitcode = job.process.exitcode
                    if exitcode == 0:
                        job.state = JobState.FINISHED
                        job.cleanup()
                        _LOG.debug("success: %s took %.3f seconds", job, time.time() - job.started)
                    else:
                        job.state = JobState.FAILED
                        job.cleanup()
                        _LOG.debug("failed: %s", job)
                        if self.failFast:
                            # Stop remaining running jobs so no child
                            # processes are left behind when the exception
                            # propagates.
                            for stopJob in jobs.running():
                                if stopJob is not job:
                                    stopJob.stop()
                            raise MPGraphExecutorError(
                                f"Task {job} failed, exit code={exitcode}."
                            )
                        else:
                            _LOG.error(
                                "Task %s failed; processing will continue for remaining tasks.", job
                            )
                else:
                    # check for timeout
                    now = time.time()
                    if now - job.started > self.timeout:
                        job.state = JobState.TIMED_OUT
                        _LOG.debug("Terminating job %s due to timeout", job)
                        job.stop()
                        job.cleanup()
                        if self.failFast:
                            # Mirror the failed-job branch above: stop the
                            # other running jobs before raising so their
                            # child processes are not orphaned.
                            for stopJob in jobs.running():
                                if stopJob is not job:
                                    stopJob.stop()
                            raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
                        else:
                            _LOG.error(
                                "Timeout (%s sec) for task %s; task is killed, processing continues "
                                "for remaining tasks.", self.timeout, job
                            )

            # see if we can start more jobs
            for job in jobs.pending():

                # Check all dependencies; compute the input node set once per
                # job instead of once per comparison.
                inputNodes = graph.determineInputsToQuantumNode(job.qnode)
                if inputNodes & jobs.failedNodes():
                    # upstream job has failed, skipping this
                    job.state = JobState.FAILED_DEP
                    _LOG.error("Upstream job failed for task %s, skipping this task.", job)
                elif inputNodes <= jobs.finishedNodes():
                    # all dependencies have completed, can start new job
                    if len(jobs.running()) < self.numProc:
                        _LOG.debug("Submitting %s", job)
                        job.start(butler, self.quantumExecutor, self.startMethod)

            # Do cleanup for timed out jobs if necessary.
            jobs.cleanup()

            # Print progress message if something changed.
            newFinished, newFailed = len(jobs.finishedNodes()), len(jobs.failedNodes())
            if (finished, failed) != (newFinished, newFailed):
                finished, failed = newFinished, newFailed
                totalCount = len(jobs.jobs)
                _LOG.info("Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
                          finished, failed, totalCount - finished - failed, totalCount)

            # Here we want to wait until one of the running jobs completes
            # but multiprocessing does not provide an API for that, for now
            # just sleep a little bit and go back to the loop.
            if jobs.running():
                time.sleep(0.1)

        if jobs.failedNodes():
            # print list of failed jobs
            _LOG.error("Failed jobs:")
            for job in jobs.jobs:
                if job.state != JobState.FINISHED:
                    _LOG.error("  - %s: %s", job.state, job)

            # if any job failed raise an exception
            if jobs.failedNodes() == jobs.timedOutIds():
                raise MPTimeoutError("One or more tasks timed out during execution.")
            else:
                raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")