Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

# Public API of this module.
__all__ = ["MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError"]

23 

24# ------------------------------- 

25# Imports of standard modules -- 

26# ------------------------------- 

27from enum import Enum 

28import logging 

29import multiprocessing 

30import pickle 

31import sys 

32import time 

33 

34from lsst.pipe.base.graph.graph import QuantumGraph 

35 

36# ----------------------------- 

37# Imports for other modules -- 

38# ----------------------------- 

39from .quantumGraphExecutor import QuantumGraphExecutor 

40from lsst.base import disableImplicitThreading 

41from lsst.daf.butler.cli.cliLog import CliLog 

42 

# Logger named after this module with the leading component of the package
# path stripped (empty string, i.e. root logger, if there is no dot).
_LOG = logging.getLogger(__name__.partition(".")[2])


class JobState(Enum):
    """Possible states for the executing task.

    - PENDING: job has not started yet
    - RUNNING: job is currently executing
    - FINISHED: job finished successfully
    - FAILED: job execution failed (process returned non-zero status)
    - TIMED_OUT: job is killed due to too long execution time
    - FAILED_DEP: one of the dependencies of this job has failed/timed out
    """
    PENDING = 1
    RUNNING = 2
    FINISHED = 3
    FAILED = 4
    TIMED_OUT = 5
    FAILED_DEP = 6

54 

55 

56class _Job: 

57 """Class representing a job running single task. 

58 

59 Parameters 

60 ---------- 

61 qnode: `~lsst.pipe.base.QuantumNode` 

62 Quantum and some associated information. 

63 """ 

64 def __init__(self, qnode): 

65 self.qnode = qnode 

66 self.process = None 

67 self.state = JobState.PENDING 

68 self.started = None 

69 

70 def start(self, butler, quantumExecutor, startMethod=None): 

71 """Start process which runs the task. 

72 

73 Parameters 

74 ---------- 

75 butler : `lsst.daf.butler.Butler` 

76 Data butler instance. 

77 quantumExecutor : `QuantumExecutor` 

78 Executor for single quantum. 

79 startMethod : `str`, optional 

80 Start method from `multiprocessing` module. 

81 """ 

82 # Butler can have live database connections which is a problem with 

83 # fork-type activation. Make a pickle of butler to pass that across 

84 # fork. Unpickling of quantum has to happen after butler, this is why 

85 # it is pickled manually here. 

86 butler_pickle = pickle.dumps(butler) 

87 quantum_pickle = pickle.dumps(self.qnode.quantum) 

88 taskDef = self.qnode.taskDef 

89 logConfigState = CliLog.configState 

90 mp_ctx = multiprocessing.get_context(startMethod) 

91 self.process = mp_ctx.Process( 

92 target=_Job._executeJob, 

93 args=(quantumExecutor, taskDef, quantum_pickle, butler_pickle, logConfigState), 

94 name=f"task-{self.qnode.nodeId.number}" 

95 ) 

96 self.process.start() 

97 self.started = time.time() 

98 self.state = JobState.RUNNING 

99 

100 @staticmethod 

101 def _executeJob(quantumExecutor, taskDef, quantum_pickle, butler_pickle, logConfigState): 

102 """Execute a job with arguments. 

103 

104 Parameters 

105 ---------- 

106 quantumExecutor : `QuantumExecutor` 

107 Executor for single quantum. 

108 taskDef : `bytes` 

109 Task definition structure. 

110 quantum_pickle : `bytes` 

111 Quantum for this task execution in pickled form. 

112 butler_pickle : `bytes` 

113 Data butler instance in pickled form. 

114 """ 

115 if logConfigState and not CliLog.configState: 

116 # means that we are in a new spawned Python process and we have to 

117 # re-initialize logging 

118 CliLog.replayConfigState(logConfigState) 

119 

120 butler = pickle.loads(butler_pickle) 

121 quantum = pickle.loads(quantum_pickle) 

122 quantumExecutor.execute(taskDef, quantum, butler) 

123 

124 def stop(self): 

125 """Stop the process. 

126 """ 

127 self.process.terminate() 

128 # give it 1 second to finish or KILL 

129 for i in range(10): 

130 time.sleep(0.1) 

131 if not self.process.is_alive(): 

132 break 

133 else: 

134 _LOG.debug("Killing process %s", self.process.name) 

135 self.process.kill() 

136 

137 def cleanup(self): 

138 """Release processes resources, has to be called for each finished 

139 process. 

140 """ 

141 if self.process and not self.process.is_alive(): 

142 self.process.close() 

143 self.process = None 

144 

145 def __str__(self): 

146 return f"<{self.qnode.taskDef} dataId={self.qnode.quantum.dataId}>" 

147 

148 

149class _JobList: 

150 """Simple list of _Job instances with few convenience methods. 

151 

152 Parameters 

153 ---------- 

154 iterable : iterable of `~lsst.pipe.base.QuantumIterData` 

155 Sequence if Quanta to execute. This has to be ordered according to 

156 task dependencies. 

157 """ 

158 def __init__(self, iterable): 

159 self.jobs = [_Job(qnode) for qnode in iterable] 

160 

161 def pending(self): 

162 """Return list of jobs that wait for execution. 

163 

164 Returns 

165 ------- 

166 jobs : `list` [`_Job`] 

167 List of jobs. 

168 """ 

169 return [job for job in self.jobs if job.state == JobState.PENDING] 

170 

171 def running(self): 

172 """Return list of jobs that are executing. 

173 

174 Returns 

175 ------- 

176 jobs : `list` [`_Job`] 

177 List of jobs. 

178 """ 

179 return [job for job in self.jobs if job.state == JobState.RUNNING] 

180 

181 def finishedNodes(self): 

182 """Return set of QuantumNodes that finished successfully (not failed). 

183 

184 Returns 

185 ------- 

186 QuantumNodes : `set` [`~lsst.pipe.base.QuantumNode`] 

187 Set of QuantumNodes that have successfully finished 

188 """ 

189 return set(job.qnode for job in self.jobs if job.state == JobState.FINISHED) 

190 

191 def failedNodes(self): 

192 """Return set of jobs IDs that failed for any reason. 

193 

194 Returns 

195 ------- 

196 QuantumNodes : `set` [`~lsst.pipe.base.QuantumNode`] 

197 Set of QUantumNodes that failed during processing 

198 """ 

199 return set(job.qnode for job in self.jobs 

200 if job.state in (JobState.FAILED, JobState.FAILED_DEP, JobState.TIMED_OUT)) 

201 

202 def timedOutIds(self): 

203 """Return set of jobs IDs that timed out. 

204 

205 Returns 

206 ------- 

207 jobsIds : `set` [`int`] 

208 Set of integer job IDs. 

209 """ 

210 return set(job.qnode for job in self.jobs if job.state == JobState.TIMED_OUT) 

211 

212 def cleanup(self): 

213 """Do periodic cleanup for jobs that did not finish correctly. 

214 

215 If timed out jobs are killed but take too long to stop then regular 

216 cleanup will not work for them. Here we check all timed out jobs 

217 periodically and do cleanup if they managed to die by this time. 

218 """ 

219 for job in self.jobs: 

220 if job.state == JobState.TIMED_OUT and job.process is not None: 

221 job.cleanup() 

222 

223 

class MPGraphExecutorError(Exception):
    """Exception class for errors raised by MPGraphExecutor."""


class MPTimeoutError(MPGraphExecutorError):
    """Exception raised when task execution times out."""

234 

235 

class MPGraphExecutor(QuantumGraphExecutor):
    """Implementation of QuantumGraphExecutor using same-host multiprocess
    execution of Quanta.

    Parameters
    ----------
    numProc : `int`
        Number of processes to use for executing tasks.
    timeout : `float`
        Time in seconds to wait for tasks to finish.
    quantumExecutor : `QuantumExecutor`
        Executor for single quantum. For multiprocess-style execution when
        ``numProc`` is greater than one this instance must support pickle.
    startMethod : `str`, optional
        Start method from `multiprocessing` module, `None` selects the best
        one for current platform.
    failFast : `bool`, optional
        If set to ``True`` then stop processing on first error from any task.
    executionGraphFixup : `ExecutionGraphFixup`, optional
        Instance used for modification of execution graph.
    """
    def __init__(self, numProc, timeout, quantumExecutor, *,
                 startMethod=None, failFast=False, executionGraphFixup=None):
        self.numProc = numProc
        self.timeout = timeout
        self.quantumExecutor = quantumExecutor
        self.failFast = failFast
        self.executionGraphFixup = executionGraphFixup

        # We set default start method as spawn for MacOS and fork for Linux;
        # None for all other platforms to use multiprocessing default.
        if startMethod is None:
            methods = dict(linux="fork", darwin="spawn")
            startMethod = methods.get(sys.platform)
        self.startMethod = startMethod
        _LOG.info("Using %r for multiprocessing start method", self.startMethod)

    def execute(self, graph, butler):
        # Docstring inherited from QuantumGraphExecutor.execute
        graph = self._fixupQuanta(graph)
        if self.numProc > 1:
            self._executeQuantaMP(graph, butler)
        else:
            self._executeQuantaInProcess(graph, butler)

    def _fixupQuanta(self, graph: QuantumGraph):
        """Call fixup code to modify execution graph.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` to modify

        Returns
        -------
        graph : `QuantumGraph`
            Modified `QuantumGraph`.

        Raises
        ------
        MPGraphExecutorError
            Raised if execution graph cannot be ordered after modification,
            i.e. it has dependency cycles.
        """
        if not self.executionGraphFixup:
            return graph

        _LOG.debug("Call execution graph fixup method")
        graph = self.executionGraphFixup.fixupQuanta(graph)

        # Detect if there is now a cycle created within the graph
        if graph.findCycle():
            raise MPGraphExecutorError(
                "Updated execution graph has dependency cycle.")

        return graph

    def _executeQuantaInProcess(self, graph, butler):
        """Execute all Quanta in current process.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed
        butler : `lsst.daf.butler.Butler`
            Data butler instance
        """
        # Note that in non-MP case any failed task will generate an exception
        # and kill the whole thing. In general we cannot guarantee exception
        # safety so easiest and safest thing is to let it die.
        count, totalCount = 0, len(graph)
        for qnode in graph:
            _LOG.debug("Executing %s", qnode)
            self.quantumExecutor.execute(qnode.taskDef, qnode.quantum, butler)
            count += 1
            _LOG.info("Executed %d quanta, %d remain out of total %d quanta.",
                      count, totalCount - count, totalCount)

    def _executeQuantaMP(self, graph, butler):
        """Execute all Quanta in separate processes.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed.
        butler : `lsst.daf.butler.Butler`
            Data butler instance

        Raises
        ------
        MPGraphExecutorError
            Raised if any task class does not support multiprocessing, if
            ``failFast`` is set and a task fails, or at the end of execution
            if any task failed.
        MPTimeoutError
            Raised if ``failFast`` is set and a task times out, or at the
            end of execution if every failure was a timeout.
        """

        disableImplicitThreading()  # To prevent thread contention

        # re-pack input quantum data into jobs list
        jobs = _JobList(graph)

        # check that all tasks can run in sub-process
        for job in jobs.jobs:
            taskDef = job.qnode.taskDef
            if not taskDef.taskClass.canMultiprocess:
                raise MPGraphExecutorError(f"Task {taskDef.taskName} does not support multiprocessing;"
                                           " use single process")

        finished, failed = 0, 0
        while jobs.pending() or jobs.running():

            _LOG.debug("#pendingJobs: %s", len(jobs.pending()))
            _LOG.debug("#runningJobs: %s", len(jobs.running()))

            # See if any jobs have finished
            for job in jobs.running():
                if not job.process.is_alive():
                    _LOG.debug("finished: %s", job)
                    # finished
                    exitcode = job.process.exitcode
                    if exitcode == 0:
                        job.state = JobState.FINISHED
                        job.cleanup()
                        _LOG.debug("success: %s took %.3f seconds", job, time.time() - job.started)
                    else:
                        job.state = JobState.FAILED
                        job.cleanup()
                        _LOG.debug("failed: %s", job)
                        if self.failFast:
                            # Stop all other running jobs before raising.
                            for stopJob in jobs.running():
                                if stopJob is not job:
                                    stopJob.stop()
                            raise MPGraphExecutorError(
                                f"Task {job} failed, exit code={exitcode}."
                            )
                        else:
                            _LOG.error(
                                "Task %s failed; processing will continue for remaining tasks.", job
                            )
                else:
                    # check for timeout
                    now = time.time()
                    if now - job.started > self.timeout:
                        job.state = JobState.TIMED_OUT
                        _LOG.debug("Terminating job %s due to timeout", job)
                        job.stop()
                        job.cleanup()
                        if self.failFast:
                            raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
                        else:
                            _LOG.error(
                                "Timeout (%s sec) for task %s; task is killed, processing continues "
                                "for remaining tasks.", self.timeout, job
                            )

            # see if we can start more jobs
            for job in jobs.pending():

                # Check all dependencies; compute the input node set once
                # instead of once per comparison.
                inputNodes = graph.determineInputsToQuantumNode(job.qnode)
                if inputNodes & jobs.failedNodes():
                    # upstream job has failed, skipping this
                    job.state = JobState.FAILED_DEP
                    _LOG.error("Upstream job failed for task %s, skipping this task.", job)
                elif inputNodes <= jobs.finishedNodes():
                    # all dependencies have completed, can start new job
                    if len(jobs.running()) < self.numProc:
                        _LOG.debug("Submitting %s", job)
                        job.start(butler, self.quantumExecutor, self.startMethod)

            # Do cleanup for timed out jobs if necessary.
            jobs.cleanup()

            # Print progress message if something changed.
            newFinished, newFailed = len(jobs.finishedNodes()), len(jobs.failedNodes())
            if (finished, failed) != (newFinished, newFailed):
                finished, failed = newFinished, newFailed
                totalCount = len(jobs.jobs)
                _LOG.info("Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
                          finished, failed, totalCount - finished - failed, totalCount)

            # Here we want to wait until one of the running jobs completes
            # but multiprocessing does not provide an API for that, for now
            # just sleep a little bit and go back to the loop.
            if jobs.running():
                time.sleep(0.1)

        if jobs.failedNodes():
            # print list of failed jobs
            _LOG.error("Failed jobs:")
            for job in jobs.jobs:
                if job.state != JobState.FINISHED:
                    _LOG.error(" - %s: %s", job.state, job)

            # if any job failed raise an exception
            if jobs.failedNodes() == jobs.timedOutIds():
                raise MPTimeoutError("One or more tasks timed out during execution.")
            else:
                raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")