Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22__all__ = ["MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError"] 

23 

24# ------------------------------- 

25# Imports of standard modules -- 

26# ------------------------------- 

27from enum import Enum 

28import logging 

29import multiprocessing 

30import pickle 

31import time 

32 

33# ----------------------------- 

34# Imports for other modules -- 

35# ----------------------------- 

36from .quantumGraphExecutor import QuantumGraphExecutor 

37from lsst.base import disableImplicitThreading 

38 

39_LOG = logging.getLogger(__name__.partition(".")[2]) 

40 

41 

class JobState(Enum):
    """Possible states for the executing task.

    Member values are consecutive integers starting at 1, in declaration
    order (the same values the functional ``Enum("JobState", "...")``
    form assigns).
    """

    # Job has not started yet.
    PENDING = 1
    # Job is currently executing.
    RUNNING = 2
    # Job finished successfully.
    FINISHED = 3
    # Job execution failed (process returned non-zero status).
    FAILED = 4
    # Job is killed due to too long execution time.
    TIMED_OUT = 5
    # One of the dependencies of this job has failed/timed out.
    FAILED_DEP = 6

50 

51 

class _Job:
    """Class representing a job running single task.

    Parameters
    ----------
    qdata : `~lsst.pipe.base.QuantumIterData`
        Quantum and some associated information.
    """
    def __init__(self, qdata):
        self.qdata = qdata
        # Process executing the task; `None` until `start` is called.
        self.process = None
        self.state = JobState.PENDING
        # Value of `time.time` taken right after the process was started;
        # `None` until `start` is called.
        self.started = None

    def start(self, butler, quantumExecutor):
        """Start process which runs the task.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        """
        # Butler can have live database connections which is a problem with
        # fork-type activation. Make a pickle of butler to pass that across
        # fork.
        butler_pickle = pickle.dumps(butler)
        taskDef = self.qdata.taskDef
        quantum = self.qdata.quantum
        self.process = multiprocessing.Process(
            target=self._executeJob,
            args=(quantumExecutor, taskDef, quantum, butler_pickle),
            name=f"task-{self.qdata.index}"
        )
        self.process.start()
        self.started = time.time()
        self.state = JobState.RUNNING

    @staticmethod
    def _executeJob(quantumExecutor, taskDef, quantum, butler_pickle):
        """Execute a job with arguments.

        This is a static method so that the process target is a plain
        function which does not carry a reference to the `_Job` instance
        (and through it ``qdata`` and the `multiprocessing.Process` object)
        into the child process.

        Parameters
        ----------
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        quantum : `~lsst.daf.butler.Quantum`
            Quantum for this task execution.
        butler_pickle : `bytes`
            Data butler instance in pickled form.
        """
        # The pickle is produced by `start` in the parent process, it is
        # not external input.
        butler = pickle.loads(butler_pickle)
        quantumExecutor.execute(taskDef, quantum, butler)

    def stop(self):
        """Stop the process.

        The process is asked to terminate first; if it is still alive after
        one second it is killed. Does nothing if the job was never started.
        """
        if self.process is None:
            return
        self.process.terminate()
        # give it 1 second to finish or KILL; join() returns as soon as the
        # process exits and also reaps it
        self.process.join(timeout=1.0)
        if self.process.is_alive():
            _LOG.debug("Killing process %s", self.process.name)
            self.process.kill()
            # bounded wait so that the killed child is reaped without any
            # risk of blocking the caller indefinitely
            self.process.join(timeout=1.0)

    def __str__(self):
        return f"<{self.qdata.taskDef} dataId={self.qdata.quantum.dataId}>"

123 

124 

class _JobList:
    """Simple list of _Job instances with few convenience methods.

    Parameters
    ----------
    iterable : iterable of `~lsst.pipe.base.QuantumIterData`
        Sequence of Quanta to execute. This has to be ordered according to
        task dependencies.
    """
    def __init__(self, iterable):
        self.jobs = [_Job(qdata) for qdata in iterable]

    def pending(self):
        """Return list of jobs that wait for execution.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return [job for job in self.jobs if job.state == JobState.PENDING]

    def running(self):
        """Return list of jobs that are executing.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return [job for job in self.jobs if job.state == JobState.RUNNING]

    def finishedIds(self):
        """Return set of jobs IDs that finished successfully (not failed).

        Job ID is the index of the corresponding quantum.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        return {job.qdata.index for job in self.jobs if job.state == JobState.FINISHED}

    def failedIds(self):
        """Return set of jobs IDs that failed for any reason.

        This includes jobs that failed, jobs that timed out, and jobs that
        were skipped because one of their dependencies failed.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        failedStates = (JobState.FAILED, JobState.FAILED_DEP, JobState.TIMED_OUT)
        return {job.qdata.index for job in self.jobs if job.state in failedStates}

    def timedOutIds(self):
        """Return set of jobs IDs that timed out.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        return {job.qdata.index for job in self.jobs if job.state == JobState.TIMED_OUT}

189 

190 

class MPGraphExecutorError(Exception):
    """Exception class for errors raised by MPGraphExecutor.
    """

195 

196 

class MPTimeoutError(MPGraphExecutorError):
    """Exception raised when task execution times out.

    This is a subclass of `MPGraphExecutorError`, so handlers for the
    latter also catch timeouts.
    """

201 

202 

class MPGraphExecutor(QuantumGraphExecutor):
    """Implementation of QuantumGraphExecutor using same-host multiprocess
    execution of Quanta.

    Parameters
    ----------
    numProc : `int`
        Number of processes to use for executing tasks.
    timeout : `float`
        Time in seconds to wait for tasks to finish.
    quantumExecutor : `QuantumExecutor`
        Executor for single quantum. For multiprocess-style execution when
        ``numProc`` is greater than one this instance must support pickle.
    failFast : `bool`, optional
        If set to ``True`` then stop processing on first error from any task.
    executionGraphFixup : `ExecutionGraphFixup`, optional
        Instance used for modification of execution graph.
    """
    def __init__(self, numProc, timeout, quantumExecutor, *, failFast=False, executionGraphFixup=None):
        self.numProc = numProc
        self.timeout = timeout
        self.quantumExecutor = quantumExecutor
        self.failFast = failFast
        self.executionGraphFixup = executionGraphFixup

    def execute(self, graph, butler):
        # Docstring inherited from QuantumGraphExecutor.execute
        quantaIter = self._fixupQuanta(graph.traverse())
        if self.numProc > 1:
            self._executeQuantaMP(quantaIter, butler)
        else:
            self._executeQuantaInProcess(quantaIter, butler)

    def _fixupQuanta(self, quantaIter):
        """Call fixup code to modify execution graph.

        Parameters
        ----------
        quantaIter : iterable of `~lsst.pipe.base.QuantumIterData`
            Quanta as originated from a quantum graph.

        Returns
        -------
        quantaIter : iterable of `~lsst.pipe.base.QuantumIterData`
            Possibly updated set of quanta, properly ordered for execution.
            The input is returned unchanged if no fixup instance is
            configured.

        Raises
        ------
        MPGraphExecutorError
            Raised if execution graph cannot be ordered after modification,
            i.e. it has dependency cycles.
        """
        if not self.executionGraphFixup:
            return quantaIter

        _LOG.debug("Call execution graph fixup method")
        quantaIter = self.executionGraphFixup.fixupQuanta(quantaIter)

        # need it correctly ordered as dependencies may have changed
        # after modification, so do topo-sort
        updatedQuanta = list(quantaIter)
        quanta = []
        ids = set()
        _LOG.debug("Re-ordering execution graph")
        while updatedQuanta:
            # find quantum that has all dependencies resolved already; the
            # scan restarts from the beginning after each pick, so the first
            # eligible quantum in the current order is always taken
            for i, qdata in enumerate(updatedQuanta):
                if ids.issuperset(qdata.dependencies):
                    _LOG.debug("Found next quanta to execute: %s", qdata)
                    # safe to delete while enumerating because we break
                    # out of the loop right after
                    del updatedQuanta[i]
                    ids.add(qdata.index)
                    # we could yield here but I want to detect cycles before
                    # returning anything from this method
                    quanta.append(qdata)
                    break
            else:
                # means remaining quanta have dependency cycle
                raise MPGraphExecutorError(
                    "Updated execution graph has dependency clycle.")

        return quanta

    def _executeQuantaInProcess(self, iterable, butler):
        """Execute all Quanta in current process.

        Parameters
        ----------
        iterable : iterable of `~lsst.pipe.base.QuantumIterData`
            Sequence of Quanta to execute. It is guaranteed that
            prerequisites for a given Quantum will always appear before
            that Quantum.
        butler : `lsst.daf.butler.Butler`
            Data butler instance
        """
        for qdata in iterable:
            _LOG.debug("Executing %s", qdata)
            self.quantumExecutor.execute(qdata.taskDef, qdata.quantum, butler)

    @staticmethod
    def _stopJobs(jobs):
        """Stop processes of all given jobs.

        Parameters
        ----------
        jobs : iterable of `_Job`
            Jobs whose processes have to be stopped.
        """
        for job in jobs:
            _LOG.debug("Stopping job %s", job)
            job.stop()

    def _executeQuantaMP(self, iterable, butler):
        """Execute all Quanta in separate processes.

        Parameters
        ----------
        iterable : iterable of `~lsst.pipe.base.QuantumIterData`
            Sequence of Quanta to execute. It is guaranteed that
            prerequisites for a given Quantum will always appear before
            that Quantum.
        butler : `lsst.daf.butler.Butler`
            Data butler instance

        Raises
        ------
        MPTimeoutError
            Raised if a task timed out: immediately when ``failFast`` is set,
            otherwise after all tasks were processed and every failure is a
            time out.
        MPGraphExecutorError
            Raised if a task does not support multiprocessing, or if a task
            failed: immediately when ``failFast`` is set, otherwise after all
            tasks were processed.
        """

        disableImplicitThreading()  # To prevent thread contention

        # re-pack input quantum data into jobs list
        jobs = _JobList(iterable)

        # check that all tasks can run in sub-process
        for job in jobs.jobs:
            taskDef = job.qdata.taskDef
            if not taskDef.taskClass.canMultiprocess:
                raise MPGraphExecutorError(f"Task {taskDef.taskName} does not support multiprocessing;"
                                           " use single process")

        while jobs.pending() or jobs.running():

            _LOG.debug("#pendingJobs: %s", len(jobs.pending()))
            _LOG.debug("#runningJobs: %s", len(jobs.running()))

            # See if any jobs have finished
            for job in jobs.running():
                proc = job.process
                if not proc.is_alive():
                    _LOG.debug("finished: %s", job)
                    # finished
                    if proc.exitcode == 0:
                        job.state = JobState.FINISHED
                        _LOG.debug("success: %s", job)
                    else:
                        job.state = JobState.FAILED
                        _LOG.debug("failed: %s", job)
                        if self.failFast:
                            # do not leave other child processes running
                            # behind when we bail out
                            self._stopJobs(jobs.running())
                            raise MPGraphExecutorError(
                                f"Task {job} failed, exit code={proc.exitcode}."
                            )
                        else:
                            _LOG.error(
                                "Task %s failed; processing will continue for remaining tasks.", job
                            )
                else:
                    # check for timeout
                    now = time.time()
                    if now - job.started > self.timeout:
                        job.state = JobState.TIMED_OUT
                        _LOG.debug("Terminating job %s due to timeout", job)
                        job.stop()
                        if self.failFast:
                            # do not leave other child processes running
                            # behind when we bail out
                            self._stopJobs(jobs.running())
                            raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
                        else:
                            _LOG.error(
                                "Timeout (%s sec) for task %s; task is killed, processing continues "
                                "for remaining tasks.", self.timeout, job
                            )

            # see if we can start more jobs; the ID sets and the running
            # count are computed once per pass and updated incrementally
            # instead of being rebuilt for every pending job
            failedIds = jobs.failedIds()
            finishedIds = jobs.finishedIds()
            numRunning = len(jobs.running())
            for job in jobs.pending():

                # check all dependencies
                dependencies = job.qdata.dependencies
                if dependencies & failedIds:
                    # upstream job has failed, skipping this
                    job.state = JobState.FAILED_DEP
                    # jobs later in this pass that depend on this one have
                    # to be skipped too
                    failedIds.add(job.qdata.index)
                    _LOG.error("Upstream job failed for task %s, skipping this task.", job)
                elif dependencies <= finishedIds:
                    # all dependencies have completed, can start new job
                    if numRunning < self.numProc:
                        _LOG.debug("Sumbitting %s", job)
                        job.start(butler, self.quantumExecutor)
                        numRunning += 1

            # Here we want to wait until one of the running jobs completes;
            # for now just sleep a little bit and go back to the loop.
            if jobs.running():
                time.sleep(0.1)

        if jobs.failedIds():
            # print list of failed jobs
            _LOG.error("Failed jobs:")
            for job in jobs.jobs:
                if job.state != JobState.FINISHED:
                    _LOG.error("  - %s: %s", job.state, job)

            # if any job failed raise an exception; jobs skipped because of
            # a failed dependency count as failed but not as timed out
            if jobs.failedIds() == jobs.timedOutIds():
                raise MPTimeoutError("One or more tasks timed out during execution.")
            else:
                raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")