Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22__all__ = ["MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError"] 

23 

24# ------------------------------- 

25# Imports of standard modules -- 

26# ------------------------------- 

27from enum import Enum 

28import logging 

29import multiprocessing 

30import pickle 

31import time 

32 

33# ----------------------------- 

34# Imports for other modules -- 

35# ----------------------------- 

36from .quantumGraphExecutor import QuantumGraphExecutor 

37from lsst.base import disableImplicitThreading 

38 

# Module-level logger. The logger name is this module's dotted name with
# everything up to and including the first "." removed.
_LOG = logging.getLogger(__name__.partition(".")[2])


class JobState(Enum):
    """Possible states for the executing task."""

    # Job has not started yet.
    PENDING = 1
    # Job is currently executing.
    RUNNING = 2
    # Job finished successfully.
    FINISHED = 3
    # Job execution failed (process returned non-zero status).
    FAILED = 4
    # Job is killed due to too long execution time.
    TIMED_OUT = 5
    # One of the dependencies of this job has failed/timed out.
    FAILED_DEP = 6

50 

51 

class _Job:
    """One quantum execution wrapped in its own sub-process.

    Parameters
    ----------
    qdata : `~lsst.pipe.base.QuantumIterData`
        Quantum and some associated information.
    """
    def __init__(self, qdata):
        # Quantum data this job executes.
        self.qdata = qdata
        # `multiprocessing.Process` running the task; `None` before the job
        # is started and again after `cleanup` has released it.
        self.process = None
        # Current `JobState` of this job.
        self.state = JobState.PENDING
        # Value of `time.time()` recorded right after the process started.
        self.started = None

    def start(self, butler, quantumExecutor):
        """Launch the sub-process that runs the task and mark the job as
        running.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        """
        # Butler can have live database connections which is a problem with
        # fork-type activation, so only a pickled copy travels across the
        # process boundary.
        pickledButler = pickle.dumps(butler)
        qdata = self.qdata
        child = multiprocessing.Process(
            target=self._executeJob,
            args=(quantumExecutor, qdata.taskDef, qdata.quantum, pickledButler),
            name=f"task-{qdata.index}",
        )
        self.process = child
        child.start()
        self.started = time.time()
        self.state = JobState.RUNNING

    def _executeJob(self, quantumExecutor, taskDef, quantum, butler_pickle):
        """Entry point of the sub-process: restore the butler and execute
        the quantum.

        Parameters
        ----------
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        quantum : `~lsst.daf.butler.Quantum`
            Quantum for this task execution.
        butler_pickle : `bytes`
            Data butler instance in pickled form.
        """
        restoredButler = pickle.loads(butler_pickle)
        quantumExecutor.execute(taskDef, quantum, restoredButler)

    def stop(self):
        """Stop the process.

        The process is terminated first; if it is still alive after ten
        checks spaced 0.1 second apart it is killed.
        """
        proc = self.process
        proc.terminate()
        # give it 1 second to finish or KILL
        checksLeft = 10
        while checksLeft > 0:
            time.sleep(0.1)
            if not proc.is_alive():
                return
            checksLeft -= 1
        _LOG.debug("Killing process %s", proc.name)
        proc.kill()

    def cleanup(self):
        """Release process resources, has to be called for each finished
        process.

        Nothing happens if there is no process or if it is still alive.
        """
        proc = self.process
        if not proc or proc.is_alive():
            return
        proc.close()
        self.process = None

    def __str__(self):
        qdata = self.qdata
        return f"<{qdata.taskDef} dataId={qdata.quantum.dataId}>"

131 

132 

class _JobList:
    """Simple list of `_Job` instances with a few convenience methods.

    Parameters
    ----------
    iterable : iterable of `~lsst.pipe.base.QuantumIterData`
        Sequence of Quanta to execute. This has to be ordered according to
        task dependencies.
    """
    def __init__(self, iterable):
        # One job per quantum, in the order given.
        self.jobs = [_Job(item) for item in iterable]

    def pending(self):
        """Return list of jobs that wait for execution.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return list(filter(lambda job: job.state == JobState.PENDING, self.jobs))

    def running(self):
        """Return list of jobs that are executing.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return list(filter(lambda job: job.state == JobState.RUNNING, self.jobs))

    def finishedIds(self):
        """Return set of jobs IDs that finished successfully (not failed).

        Job ID is the index of the corresponding quantum.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        return {job.qdata.index for job in self.jobs if job.state == JobState.FINISHED}

    def failedIds(self):
        """Return set of jobs IDs that failed for any reason.

        A job counts as failed when it is in ``FAILED``, ``FAILED_DEP`` or
        ``TIMED_OUT`` state.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        ids = set()
        for job in self.jobs:
            if job.state in (JobState.FAILED, JobState.FAILED_DEP, JobState.TIMED_OUT):
                ids.add(job.qdata.index)
        return ids

    def timedOutIds(self):
        """Return set of jobs IDs that timed out.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        return {job.qdata.index for job in self.jobs if job.state == JobState.TIMED_OUT}

    def cleanup(self):
        """Do periodic cleanup for jobs that did not finish correctly.

        If timed out jobs are killed but take too long to stop then regular
        cleanup will not work for them. Here we check all timed out jobs
        periodically and do cleanup if they managed to die by this time.
        """
        for job in self.jobs:
            if job.process is None:
                continue
            if job.state == JobState.TIMED_OUT:
                job.cleanup()

208 

209 

class MPGraphExecutorError(Exception):
    """Base exception for all errors raised by MPGraphExecutor."""

214 

215 

class MPTimeoutError(MPGraphExecutorError):
    """Exception raised when task execution times out.

    Being a subclass of `MPGraphExecutorError`, it is also caught by
    handlers for that more general exception.
    """

220 

221 

class MPGraphExecutor(QuantumGraphExecutor):
    """Implementation of QuantumGraphExecutor using same-host multiprocess
    execution of Quanta.

    Parameters
    ----------
    numProc : `int`
        Number of processes to use for executing tasks.
    timeout : `float`
        Time in seconds to wait for tasks to finish.
    quantumExecutor : `QuantumExecutor`
        Executor for single quantum. For multiprocess-style execution when
        ``numProc`` is greater than one this instance must support pickle.
    failFast : `bool`, optional
        If set to ``True`` then stop processing on first error from any task.
    executionGraphFixup : `ExecutionGraphFixup`, optional
        Instance used for modification of execution graph.
    """
    def __init__(self, numProc, timeout, quantumExecutor, *, failFast=False, executionGraphFixup=None):
        self.numProc = numProc
        self.timeout = timeout
        self.quantumExecutor = quantumExecutor
        self.failFast = failFast
        self.executionGraphFixup = executionGraphFixup

    def execute(self, graph, butler):
        # Docstring inherited from QuantumGraphExecutor.execute
        quantaIter = self._fixupQuanta(graph.traverse())
        # Sub-processes are only used when more than one process is requested.
        if self.numProc > 1:
            self._executeQuantaMP(quantaIter, butler)
        else:
            self._executeQuantaInProcess(quantaIter, butler)

    def _fixupQuanta(self, quantaIter):
        """Call fixup code to modify execution graph.

        Parameters
        ----------
        quantaIter : iterable of `~lsst.pipe.base.QuantumIterData`
            Quanta as originated from a quantum graph.

        Returns
        -------
        quantaIter : iterable of `~lsst.pipe.base.QuantumIterData`
            Possibly updated set of quanta, properly ordered for execution.
            When no fixup instance is configured the input is returned
            unchanged.

        Raises
        ------
        MPGraphExecutorError
            Raised if execution graph cannot be ordered after modification,
            i.e. it has dependency cycles.
        """
        if not self.executionGraphFixup:
            return quantaIter

        _LOG.debug("Call execution graph fixup method")
        quantaIter = self.executionGraphFixup.fixupQuanta(quantaIter)

        # need it correctly ordered as dependencies may have changed
        # after modification, so do topo-sort
        updatedQuanta = list(quantaIter)
        quanta = []
        ids = set()
        _LOG.debug("Re-ordering execution graph")
        while updatedQuanta:
            # find quantum that has all dependencies resolved already
            for i, qdata in enumerate(updatedQuanta):
                if ids.issuperset(qdata.dependencies):
                    _LOG.debug("Found next quanta to execute: %s", qdata)
                    del updatedQuanta[i]
                    ids.add(qdata.index)
                    # we could yield here but I want to detect cycles before
                    # returning anything from this method
                    quanta.append(qdata)
                    break
            else:
                # means remaining quanta have dependency cycle
                raise MPGraphExecutorError(
                    "Updated execution graph has dependency clycle.")

        return quanta

    def _executeQuantaInProcess(self, iterable, butler):
        """Execute all Quanta in current process.

        Parameters
        ----------
        iterable : iterable of `~lsst.pipe.base.QuantumIterData`
            Sequence of Quanta to execute. It is guaranteed that
            pre-requisites for a given Quantum will always appear before
            that Quantum.
        butler : `lsst.daf.butler.Butler`
            Data butler instance
        """
        for qdata in iterable:
            _LOG.debug("Executing %s", qdata)
            self.quantumExecutor.execute(qdata.taskDef, qdata.quantum, butler)

    @staticmethod
    def _stopRunningJobs(jobs):
        """Stop and clean up every job that is still in RUNNING state.

        Used on the fail-fast path so that sub-processes are not left
        running once the exception propagates out of the executor.

        Parameters
        ----------
        jobs : `_JobList`
            List of all jobs managed by the current execution.
        """
        for runningJob in jobs.running():
            runningJob.stop()
            runningJob.cleanup()

    def _executeQuantaMP(self, iterable, butler):
        """Execute all Quanta in separate processes.

        Parameters
        ----------
        iterable : iterable of `~lsst.pipe.base.QuantumIterData`
            Sequence of Quanta to execute. It is guaranteed that
            pre-requisites for a given Quantum will always appear before
            that Quantum.
        butler : `lsst.daf.butler.Butler`
            Data butler instance

        Raises
        ------
        MPGraphExecutorError
            Raised if a task does not support multiprocessing, or if any
            task failed (immediately when ``failFast`` is set, otherwise
            after all remaining tasks were processed).
        MPTimeoutError
            Raised if a task timed out and ``failFast`` is set, or at the
            end of processing if the set of failed job IDs equals the set
            of timed out job IDs.
        """

        disableImplicitThreading()  # To prevent thread contention

        # re-pack input quantum data into jobs list
        jobs = _JobList(iterable)

        # check that all tasks can run in sub-process
        for job in jobs.jobs:
            taskDef = job.qdata.taskDef
            if not taskDef.taskClass.canMultiprocess:
                raise MPGraphExecutorError(f"Task {taskDef.taskName} does not support multiprocessing;"
                                           " use single process")

        while jobs.pending() or jobs.running():

            _LOG.debug("#pendingJobs: %s", len(jobs.pending()))
            _LOG.debug("#runningJobs: %s", len(jobs.running()))

            # See if any jobs have finished
            for job in jobs.running():
                if not job.process.is_alive():
                    _LOG.debug("finished: %s", job)
                    # finished
                    exitcode = job.process.exitcode
                    if exitcode == 0:
                        job.state = JobState.FINISHED
                        job.cleanup()
                        _LOG.debug("success: %s", job)
                    else:
                        job.state = JobState.FAILED
                        job.cleanup()
                        _LOG.debug("failed: %s", job)
                        if self.failFast:
                            # Do not leave other sub-processes running
                            # behind the exception.
                            self._stopRunningJobs(jobs)
                            raise MPGraphExecutorError(
                                f"Task {job} failed, exit code={exitcode}."
                            )
                        else:
                            _LOG.error(
                                "Task %s failed; processing will continue for remaining tasks.", job
                            )
                else:
                    # check for timeout
                    now = time.time()
                    if now - job.started > self.timeout:
                        job.state = JobState.TIMED_OUT
                        _LOG.debug("Terminating job %s due to timeout", job)
                        job.stop()
                        job.cleanup()
                        if self.failFast:
                            # Do not leave other sub-processes running
                            # behind the exception.
                            self._stopRunningJobs(jobs)
                            raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
                        else:
                            _LOG.error(
                                "Timeout (%s sec) for task %s; task is killed, processing continues "
                                "for remaining tasks.", self.timeout, job
                            )

            # see if we can start more jobs; the ID sets and the running
            # count are computed once per pass and kept up to date below
            # instead of being rebuilt for every pending job
            failedIds = jobs.failedIds()
            finishedIds = jobs.finishedIds()
            numRunning = len(jobs.running())
            for job in jobs.pending():

                # check all dependencies
                dependencies = job.qdata.dependencies
                if dependencies & failedIds:
                    # upstream job has failed, skipping this
                    job.state = JobState.FAILED_DEP
                    # make the failure visible to jobs later in this pass
                    failedIds.add(job.qdata.index)
                    _LOG.error("Upstream job failed for task %s, skipping this task.", job)
                elif dependencies <= finishedIds:
                    # all dependencies have completed, can start new job
                    if numRunning < self.numProc:
                        _LOG.debug("Sumbitting %s", job)
                        job.start(butler, self.quantumExecutor)
                        numRunning += 1

            # Do cleanup for timed out jobs if necessary.
            jobs.cleanup()

            # Here we want to wait until one of the running jobs completes
            # but multiprocessing does not provide an API for that, for now
            # just sleep a little bit and go back to the loop.
            if jobs.running():
                time.sleep(0.1)

        if jobs.failedIds():
            # print list of failed jobs
            _LOG.error("Failed jobs:")
            for job in jobs.jobs:
                if job.state != JobState.FINISHED:
                    _LOG.error("  - %s: %s", job.state, job)

            # if any job failed raise an exception
            if jobs.failedIds() == jobs.timedOutIds():
                raise MPTimeoutError("One or more tasks timed out during execution.")
            else:
                raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")