Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

# Names exported by this module.
__all__ = ["MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError"]

23 

24# ------------------------------- 

25# Imports of standard modules -- 

26# ------------------------------- 

27from enum import Enum 

28import logging 

29import multiprocessing 

30import pickle 

31import time 

32 

33from lsst.pipe.base.graph.graph import QuantumGraph 

34 

35# ----------------------------- 

36# Imports for other modules -- 

37# ----------------------------- 

38from .quantumGraphExecutor import QuantumGraphExecutor 

39from lsst.base import disableImplicitThreading 

40 

_LOG = logging.getLogger(__name__.partition(".")[2])


class JobState(Enum):
    """Possible states for the executing task."""

    PENDING = 1      # job has not started yet
    RUNNING = 2      # job is currently executing
    FINISHED = 3     # job finished successfully
    FAILED = 4       # job execution failed (process returned non-zero status)
    TIMED_OUT = 5    # job is killed due to too long execution time
    FAILED_DEP = 6   # one of the dependencies of this job has failed/timed out

52 

53 

class _Job:
    """Class representing a job running single task.

    Parameters
    ----------
    qnode : `~lsst.pipe.base.QuantumNode`
        Quantum and some associated information.
    """
    def __init__(self, qnode):
        self.qnode = qnode
        self.process = None
        self.state = JobState.PENDING
        self.started = None
        self.index = qnode.nodeId.number
        self.taskDef = qnode.taskDef

    def start(self, butler, quantumExecutor):
        """Start process which runs the task.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        """
        # Butler can hold live database connections, which is a problem with
        # fork-type activation; serialize it here and deserialize on the
        # child side of the fork.
        pickledButler = pickle.dumps(butler)
        # Use fork for multiprocessing start method on all platforms.
        context = multiprocessing.get_context("fork")
        self.process = context.Process(
            target=self._executeJob,
            args=(quantumExecutor, self.taskDef, self.qnode.quantum, pickledButler),
            name=f"task-{self.index}",
        )
        self.process.start()
        self.started = time.time()
        self.state = JobState.RUNNING

    def _executeJob(self, quantumExecutor, taskDef, quantum, butler_pickle):
        """Execute a job with arguments; runs in the child process.

        Parameters
        ----------
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        quantum : `~lsst.daf.butler.Quantum`
            Quantum for this task execution.
        butler_pickle : `bytes`
            Data butler instance in pickled form.
        """
        quantumExecutor.execute(taskDef, quantum, pickle.loads(butler_pickle))

    def stop(self):
        """Stop the process, escalating to a hard kill if needed."""
        self.process.terminate()
        # Poll for up to ~1 second for a graceful exit, then KILL.
        remaining = 10
        while remaining > 0:
            time.sleep(0.1)
            if not self.process.is_alive():
                return
            remaining -= 1
        _LOG.debug("Killing process %s", self.process.name)
        self.process.kill()

    def cleanup(self):
        """Release processes resources, has to be called for each finished
        process.
        """
        process = self.process
        if process is not None and not process.is_alive():
            process.close()
            self.process = None

    def __str__(self):
        return "<{} dataId={}>".format(self.qnode.taskDef, self.qnode.quantum.dataId)

137 

138 

class _JobList:
    """Simple list of _Job instances with few convenience methods.

    Parameters
    ----------
    iterable : iterable of `~lsst.pipe.base.QuantumNode`
        Sequence of QuantumNodes to execute. This has to be ordered according
        to task dependencies.
    """
    def __init__(self, iterable):
        self.jobs = [_Job(qnode) for qnode in iterable]

    def pending(self):
        """Return list of jobs that wait for execution.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return [job for job in self.jobs if job.state == JobState.PENDING]

    def running(self):
        """Return list of jobs that are executing.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return [job for job in self.jobs if job.state == JobState.RUNNING]

    def finishedNodes(self):
        """Return set of QuantumNodes that finished successfully (not failed).

        Returns
        -------
        nodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes that have successfully finished.
        """
        return {job.qnode for job in self.jobs if job.state == JobState.FINISHED}

    def failedNodes(self):
        """Return set of QuantumNodes that failed for any reason.

        This includes jobs that failed, timed out, or were skipped because an
        upstream dependency failed.

        Returns
        -------
        nodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes that failed during processing.
        """
        failedStates = (JobState.FAILED, JobState.FAILED_DEP, JobState.TIMED_OUT)
        return {job.qnode for job in self.jobs if job.state in failedStates}

    def timedOutIds(self):
        """Return set of QuantumNodes that timed out.

        Despite the method name this returns QuantumNodes (not integer job
        IDs), so that the result is directly comparable with `failedNodes`.

        Returns
        -------
        nodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes whose jobs timed out.
        """
        return {job.qnode for job in self.jobs if job.state == JobState.TIMED_OUT}

    def cleanup(self):
        """Do periodic cleanup for jobs that did not finish correctly.

        If timed out jobs are killed but take too long to stop then regular
        cleanup will not work for them. Here we check all timed out jobs
        periodically and do cleanup if they managed to die by this time.
        """
        for job in self.jobs:
            if job.state == JobState.TIMED_OUT and job.process is not None:
                job.cleanup()

212 

213 

class MPGraphExecutorError(Exception):
    """Exception class for errors raised by MPGraphExecutor."""

218 

219 

class MPTimeoutError(MPGraphExecutorError):
    """Exception raised when task execution times out."""

224 

225 

class MPGraphExecutor(QuantumGraphExecutor):
    """Implementation of QuantumGraphExecutor using same-host multiprocess
    execution of Quanta.

    Parameters
    ----------
    numProc : `int`
        Number of processes to use for executing tasks.
    timeout : `float`
        Time in seconds to wait for tasks to finish.
    quantumExecutor : `QuantumExecutor`
        Executor for single quantum. For multiprocess-style execution when
        ``numProc`` is greater than one this instance must support pickle.
    failFast : `bool`, optional
        If set to ``True`` then stop processing on first error from any task.
    executionGraphFixup : `ExecutionGraphFixup`, optional
        Instance used for modification of execution graph.
    """
    def __init__(self, numProc, timeout, quantumExecutor, *, failFast=False, executionGraphFixup=None):
        self.numProc = numProc
        self.timeout = timeout
        self.quantumExecutor = quantumExecutor
        self.failFast = failFast
        self.executionGraphFixup = executionGraphFixup

    def execute(self, graph, butler):
        # Docstring inherited from QuantumGraphExecutor.execute
        graph = self._fixupQuanta(graph)
        if self.numProc > 1:
            self._executeQuantaMP(graph, butler)
        else:
            self._executeQuantaInProcess(graph, butler)

    def _fixupQuanta(self, graph: QuantumGraph):
        """Call fixup code to modify execution graph.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` to modify

        Returns
        -------
        graph : `QuantumGraph`
            Modified `QuantumGraph`.

        Raises
        ------
        MPGraphExecutorError
            Raised if execution graph cannot be ordered after modification,
            i.e. it has dependency cycles.
        """
        if not self.executionGraphFixup:
            return graph

        _LOG.debug("Call execution graph fixup method")
        graph = self.executionGraphFixup.fixupQuanta(graph)

        # Detect if there is now a cycle created within the graph
        if graph.findCycle():
            raise MPGraphExecutorError(
                "Updated execution graph has dependency cycle.")

        return graph

    def _executeQuantaInProcess(self, graph, butler):
        """Execute all Quanta in current process.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed
        butler : `lsst.daf.butler.Butler`
            Data butler instance
        """
        # Note that in non-MP case any failed task will generate an exception
        # and kill the whole thing. In general we cannot guarantee exception
        # safety so easiest and safest thing is to let it die.
        count, totalCount = 0, len(graph)
        for qnode in graph:
            _LOG.debug("Executing %s", qnode)
            self.quantumExecutor.execute(qnode.taskDef, qnode.quantum, butler)
            count += 1
            _LOG.info("Executed %d quanta, %d remain out of total %d quanta.",
                      count, totalCount - count, totalCount)

    def _executeQuantaMP(self, graph, butler):
        """Execute all Quanta in separate processes.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed.
        butler : `lsst.daf.butler.Butler`
            Data butler instance

        Raises
        ------
        MPGraphExecutorError
            Raised if any task does not support multiprocessing, or after the
            whole graph is processed if any job failed (and not every failure
            was a timeout).
        MPTimeoutError
            Raised if every failed job failed due to a timeout.
        """

        disableImplicitThreading()  # To prevent thread contention

        # re-pack input quantum data into jobs list
        jobs = _JobList(graph)

        # check that all tasks can run in sub-process
        for job in jobs.jobs:
            taskDef = job.taskDef
            if not taskDef.taskClass.canMultiprocess:
                raise MPGraphExecutorError(f"Task {taskDef.taskName} does not support multiprocessing;"
                                           " use single process")

        finished, failed = 0, 0
        while jobs.pending() or jobs.running():

            _LOG.debug("#pendingJobs: %s", len(jobs.pending()))
            _LOG.debug("#runningJobs: %s", len(jobs.running()))

            # See if any jobs have finished
            for job in jobs.running():
                if not job.process.is_alive():
                    _LOG.debug("finished: %s", job)
                    # finished
                    exitcode = job.process.exitcode
                    if exitcode == 0:
                        job.state = JobState.FINISHED
                        job.cleanup()
                        _LOG.debug("success: %s", job)
                    else:
                        job.state = JobState.FAILED
                        job.cleanup()
                        _LOG.debug("failed: %s", job)
                        if self.failFast:
                            # Stop all other running jobs before raising.
                            for stopJob in jobs.running():
                                if stopJob is not job:
                                    stopJob.stop()
                            raise MPGraphExecutorError(
                                f"Task {job} failed, exit code={exitcode}."
                            )
                        else:
                            _LOG.error(
                                "Task %s failed; processing will continue for remaining tasks.", job
                            )
                else:
                    # check for timeout
                    now = time.time()
                    if now - job.started > self.timeout:
                        job.state = JobState.TIMED_OUT
                        _LOG.debug("Terminating job %s due to timeout", job)
                        job.stop()
                        job.cleanup()
                        if self.failFast:
                            # NOTE(review): unlike the failure branch above,
                            # sibling running jobs are not stopped before
                            # raising here -- confirm whether intentional.
                            raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
                        else:
                            _LOG.error(
                                "Timeout (%s sec) for task %s; task is killed, processing continues "
                                "for remaining tasks.", self.timeout, job
                            )

            # see if we can start more jobs
            for job in jobs.pending():

                # Compute this job's input nodes once; nothing mutates the
                # graph between the two dependency checks below.
                inputNodes = graph.determineInputsToQuantumNode(job.qnode)
                if inputNodes & jobs.failedNodes():
                    # upstream job has failed, skipping this
                    job.state = JobState.FAILED_DEP
                    _LOG.error("Upstream job failed for task %s, skipping this task.", job)
                elif inputNodes <= jobs.finishedNodes():
                    # all dependencies have completed, can start new job
                    if len(jobs.running()) < self.numProc:
                        _LOG.debug("Submitting %s", job)
                        job.start(butler, self.quantumExecutor)

            # Do cleanup for timed out jobs if necessary.
            jobs.cleanup()

            # Print progress message if something changed.
            newFinished, newFailed = len(jobs.finishedNodes()), len(jobs.failedNodes())
            if (finished, failed) != (newFinished, newFailed):
                finished, failed = newFinished, newFailed
                totalCount = len(jobs.jobs)
                _LOG.info("Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
                          finished, failed, totalCount - finished - failed, totalCount)

            # Here we want to wait until one of the running jobs completes
            # but multiprocessing does not provide an API for that, for now
            # just sleep a little bit and go back to the loop.
            if jobs.running():
                time.sleep(0.1)

        if jobs.failedNodes():
            # print list of failed jobs
            _LOG.error("Failed jobs:")
            for job in jobs.jobs:
                if job.state != JobState.FINISHED:
                    _LOG.error(" - %s: %s", job.state, job)

            # if any job failed raise an exception
            if jobs.failedNodes() == jobs.timedOutIds():
                raise MPTimeoutError("One or more tasks timed out during execution.")
            else:
                raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")