Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

# Public API of this module when imported via ``from ... import *``.
__all__ = ["MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError"]

23 

24# ------------------------------- 

25# Imports of standard modules -- 

26# ------------------------------- 

27from enum import Enum 

28import logging 

29import multiprocessing 

30import pickle 

31import time 

32 

33from lsst.pipe.base.graph.graph import QuantumGraph 

34 

35# ----------------------------- 

36# Imports for other modules -- 

37# ----------------------------- 

38from .quantumGraphExecutor import QuantumGraphExecutor 

39from lsst.base import disableImplicitThreading 

40 

# Module logger with the top-level package prefix stripped:
# ``partition(".")[2]`` keeps everything after the first dot.
# NOTE(review): if ``__name__`` contains no dot this yields "" and hence the
# root logger — presumably the module is always imported package-qualified;
# confirm.
_LOG = logging.getLogger(__name__.partition(".")[2])

42 

43 

class JobState(Enum):
    """Possible states for an executing task.

    - PENDING: job has not started yet
    - RUNNING: job is currently executing
    - FINISHED: job finished successfully
    - FAILED: job execution failed (process returned non-zero status)
    - TIMED_OUT: job is killed due to too long execution time
    - FAILED_DEP: one of the dependencies of this job has failed/timed out
    """

    PENDING = 1
    RUNNING = 2
    FINISHED = 3
    FAILED = 4
    TIMED_OUT = 5
    FAILED_DEP = 6

52 

53 

54class _Job: 

55 """Class representing a job running single task. 

56 

57 Parameters 

58 ---------- 

59 qnode: `~lsst.pipe.base.QuantumNode` 

60 Quantum and some associated information. 

61 """ 

62 def __init__(self, qnode): 

63 self.qnode = qnode 

64 self.process = None 

65 self.state = JobState.PENDING 

66 self.started = None 

67 self.index = qnode.nodeId.number 

68 self.taskDef = qnode.taskDef 

69 

70 def start(self, butler, quantumExecutor): 

71 """Start process which runs the task. 

72 

73 Parameters 

74 ---------- 

75 butler : `lsst.daf.butler.Butler` 

76 Data butler instance. 

77 quantumExecutor : `QuantumExecutor` 

78 Executor for single quantum. 

79 """ 

80 # Butler can have live database connections which is a problem with 

81 # fork-type activation. Make a pickle of butler to pass that across 

82 # fork. 

83 butler_pickle = pickle.dumps(butler) 

84 taskDef = self.taskDef 

85 quantum = self.qnode.quantum 

86 self.process = multiprocessing.Process( 

87 target=self._executeJob, 

88 args=(quantumExecutor, taskDef, quantum, butler_pickle), 

89 name=f"task-{self.index}" 

90 ) 

91 self.process.start() 

92 self.started = time.time() 

93 self.state = JobState.RUNNING 

94 

95 def _executeJob(self, quantumExecutor, taskDef, quantum, butler_pickle): 

96 """Execute a job with arguments. 

97 

98 Parameters 

99 ---------- 

100 quantumExecutor : `QuantumExecutor` 

101 Executor for single quantum. 

102 taskDef : `~lsst.pipe.base.TaskDef` 

103 Task definition structure. 

104 quantum : `~lsst.daf.butler.Quantum` 

105 Quantum for this task execution. 

106 butler_pickle : `bytes` 

107 Data butler instance in pickled form. 

108 """ 

109 butler = pickle.loads(butler_pickle) 

110 quantumExecutor.execute(taskDef, quantum, butler) 

111 

112 def stop(self): 

113 """Stop the process. 

114 """ 

115 self.process.terminate() 

116 # give it 1 second to finish or KILL 

117 for i in range(10): 

118 time.sleep(0.1) 

119 if not self.process.is_alive(): 

120 break 

121 else: 

122 _LOG.debug("Killing process %s", self.process.name) 

123 self.process.kill() 

124 

125 def cleanup(self): 

126 """Release processes resources, has to be called for each finished 

127 process. 

128 """ 

129 if self.process and not self.process.is_alive(): 

130 self.process.close() 

131 self.process = None 

132 

133 def __str__(self): 

134 return f"<{self.qnode.taskDef} dataId={self.qnode.quantum.dataId}>" 

135 

136 

class _JobList:
    """Simple list of _Job instances with few convenience methods.

    Parameters
    ----------
    iterable : iterable of `~lsst.pipe.base.QuantumNode`
        Sequence of QuantumNodes to execute. This has to be ordered
        according to task dependencies.
    """
    def __init__(self, iterable):
        self.jobs = [_Job(qnode) for qnode in iterable]

    def pending(self):
        """Return list of jobs that wait for execution.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return [job for job in self.jobs if job.state == JobState.PENDING]

    def running(self):
        """Return list of jobs that are executing.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return [job for job in self.jobs if job.state == JobState.RUNNING]

    def finishedNodes(self):
        """Return set of QuantumNodes that finished successfully (not failed).

        Returns
        -------
        nodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes that have successfully finished.
        """
        return {job.qnode for job in self.jobs if job.state == JobState.FINISHED}

    def failedNodes(self):
        """Return set of QuantumNodes that failed for any reason.

        Returns
        -------
        nodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes that failed during processing; includes
            jobs that timed out or were skipped due to a failed dependency.
        """
        failedStates = (JobState.FAILED, JobState.FAILED_DEP, JobState.TIMED_OUT)
        return {job.qnode for job in self.jobs if job.state in failedStates}

    def timedOutIds(self):
        """Return set of QuantumNodes for jobs that timed out.

        Despite the historical name this returns QuantumNodes, not integer
        job IDs; the name is kept for compatibility with existing callers.

        Returns
        -------
        nodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes whose jobs timed out.
        """
        return {job.qnode for job in self.jobs if job.state == JobState.TIMED_OUT}

    def cleanup(self):
        """Do periodic cleanup for jobs that did not finish correctly.

        If timed out jobs are killed but take too long to stop then regular
        cleanup will not work for them. Here we check all timed out jobs
        periodically and do cleanup if they managed to die by this time.
        """
        for job in self.jobs:
            if job.state == JobState.TIMED_OUT and job.process is not None:
                job.cleanup()

210 

211 

class MPGraphExecutorError(Exception):
    """Exception class for errors raised by MPGraphExecutor."""

216 

217 

class MPTimeoutError(MPGraphExecutorError):
    """Exception raised when task execution times out."""

222 

223 

class MPGraphExecutor(QuantumGraphExecutor):
    """Implementation of QuantumGraphExecutor using same-host multiprocess
    execution of Quanta.

    Parameters
    ----------
    numProc : `int`
        Number of processes to use for executing tasks.
    timeout : `float`
        Time in seconds to wait for tasks to finish.
    quantumExecutor : `QuantumExecutor`
        Executor for single quantum. For multiprocess-style execution when
        ``numProc`` is greater than one this instance must support pickle.
    failFast : `bool`, optional
        If set to ``True`` then stop processing on first error from any task.
    executionGraphFixup : `ExecutionGraphFixup`, optional
        Instance used for modification of execution graph.
    """
    def __init__(self, numProc, timeout, quantumExecutor, *, failFast=False, executionGraphFixup=None):
        self.numProc = numProc
        self.timeout = timeout
        self.quantumExecutor = quantumExecutor
        self.failFast = failFast
        self.executionGraphFixup = executionGraphFixup

    def execute(self, graph, butler):
        # Docstring inherited from QuantumGraphExecutor.execute
        graph = self._fixupQuanta(graph)
        if self.numProc > 1:
            self._executeQuantaMP(graph, butler)
        else:
            self._executeQuantaInProcess(graph, butler)

    def _fixupQuanta(self, graph: QuantumGraph):
        """Call fixup code to modify execution graph.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` to modify

        Returns
        -------
        graph : `QuantumGraph`
            Modified `QuantumGraph`.

        Raises
        ------
        MPGraphExecutorError
            Raised if execution graph cannot be ordered after modification,
            i.e. it has dependency cycles.
        """
        if not self.executionGraphFixup:
            return graph

        _LOG.debug("Call execution graph fixup method")
        graph = self.executionGraphFixup.fixupQuanta(graph)

        # Detect if there is now a cycle created within the graph
        if graph.findCycle():
            raise MPGraphExecutorError(
                "Updated execution graph has dependency cycle.")

        return graph

    def _executeQuantaInProcess(self, graph, butler):
        """Execute all Quanta in current process.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed
        butler : `lsst.daf.butler.Butler`
            Data butler instance
        """
        # Note that in non-MP case any failed task will generate an exception
        # and kill the whole thing. In general we cannot guarantee exception
        # safety so easiest and safest thing is to let it die.
        count, totalCount = 0, len(graph)
        for qnode in graph:
            _LOG.debug("Executing %s", qnode)
            self.quantumExecutor.execute(qnode.taskDef, qnode.quantum, butler)
            count += 1
            _LOG.info("Executed %d quanta, %d remain out of total %d quanta.",
                      count, totalCount - count, totalCount)

    def _executeQuantaMP(self, graph, butler):
        """Execute all Quanta in separate processes.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed.
        butler : `lsst.daf.butler.Butler`
            Data butler instance

        Raises
        ------
        MPGraphExecutorError
            Raised if any task does not support multiprocessing, or if any
            task fails (immediately with ``failFast``, otherwise after the
            whole graph has been processed).
        MPTimeoutError
            Raised if any task times out (immediately with ``failFast``,
            otherwise after processing when all failures were timeouts).
        """

        disableImplicitThreading()  # To prevent thread contention

        # re-pack input quantum data into jobs list
        jobs = _JobList(graph)

        # check that all tasks can run in sub-process
        for job in jobs.jobs:
            taskDef = job.taskDef
            if not taskDef.taskClass.canMultiprocess:
                raise MPGraphExecutorError(f"Task {taskDef.taskName} does not support multiprocessing;"
                                           " use single process")

        finished, failed = 0, 0
        while jobs.pending() or jobs.running():

            _LOG.debug("#pendingJobs: %s", len(jobs.pending()))
            _LOG.debug("#runningJobs: %s", len(jobs.running()))

            # See if any jobs have finished
            for job in jobs.running():
                if not job.process.is_alive():
                    _LOG.debug("finished: %s", job)
                    # finished
                    exitcode = job.process.exitcode
                    if exitcode == 0:
                        job.state = JobState.FINISHED
                        job.cleanup()
                        _LOG.debug("success: %s", job)
                    else:
                        job.state = JobState.FAILED
                        job.cleanup()
                        _LOG.debug("failed: %s", job)
                        if self.failFast:
                            # Stop all other running jobs before raising.
                            for stopJob in jobs.running():
                                if stopJob is not job:
                                    stopJob.stop()
                            raise MPGraphExecutorError(
                                f"Task {job} failed, exit code={exitcode}."
                            )
                        else:
                            _LOG.error(
                                "Task %s failed; processing will continue for remaining tasks.", job
                            )
                else:
                    # check for timeout
                    now = time.time()
                    if now - job.started > self.timeout:
                        job.state = JobState.TIMED_OUT
                        _LOG.debug("Terminating job %s due to timeout", job)
                        job.stop()
                        job.cleanup()
                        if self.failFast:
                            raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
                        else:
                            _LOG.error(
                                "Timeout (%s sec) for task %s; task is killed, processing continues "
                                "for remaining tasks.", self.timeout, job
                            )

            # see if we can start more jobs
            for job in jobs.pending():

                # Dependency lookup is a pure graph query; compute it once
                # per job instead of twice as before.
                inputNodes = graph.determineInputsToQuantumNode(job.qnode)

                # Note: jobs.failedNodes() must be re-evaluated inside this
                # loop because marking a job FAILED_DEP here affects the
                # checks for jobs later in the same pass.
                if inputNodes & jobs.failedNodes():
                    # upstream job has failed, skipping this
                    job.state = JobState.FAILED_DEP
                    _LOG.error("Upstream job failed for task %s, skipping this task.", job)
                elif inputNodes <= jobs.finishedNodes():
                    # all dependencies have completed, can start new job
                    if len(jobs.running()) < self.numProc:
                        _LOG.debug("Submitting %s", job)
                        job.start(butler, self.quantumExecutor)

            # Do cleanup for timed out jobs if necessary.
            jobs.cleanup()

            # Print progress message if something changed.
            newFinished, newFailed = len(jobs.finishedNodes()), len(jobs.failedNodes())
            if (finished, failed) != (newFinished, newFailed):
                finished, failed = newFinished, newFailed
                totalCount = len(jobs.jobs)
                _LOG.info("Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
                          finished, failed, totalCount - finished - failed, totalCount)

            # Here we want to wait until one of the running jobs completes
            # but multiprocessing does not provide an API for that, for now
            # just sleep a little bit and go back to the loop.
            if jobs.running():
                time.sleep(0.1)

        if jobs.failedNodes():
            # print list of failed jobs
            _LOG.error("Failed jobs:")
            for job in jobs.jobs:
                if job.state != JobState.FINISHED:
                    _LOG.error("  - %s: %s", job.state, job)

            # if any job failed raise an exception
            if jobs.failedNodes() == jobs.timedOutIds():
                raise MPTimeoutError("One or more tasks timed out during execution.")
            else:
                raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")