Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22__all__ = ["MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError"] 

23 

24# ------------------------------- 

25# Imports of standard modules -- 

26# ------------------------------- 

27import copy 

28from enum import Enum 

29import logging 

30import multiprocessing 

31import time 

32 

33# ----------------------------- 

34# Imports for other modules -- 

35# ----------------------------- 

36from .quantumGraphExecutor import QuantumGraphExecutor 

37from lsst.base import disableImplicitThreading 

38 

39_LOG = logging.getLogger(__name__.partition(".")[2]) 

40 

41 

class JobState(Enum):
    """Possible states for the executing task."""

    # job has not started yet
    PENDING = 1
    # job is currently executing
    RUNNING = 2
    # job finished successfully
    FINISHED = 3
    # job execution failed (process returned non-zero status)
    FAILED = 4
    # job is killed due to too long execution time
    TIMED_OUT = 5
    # one of the dependencies of this job has failed/timed out
    FAILED_DEP = 6

50 

51 

class _Job:
    """Class representing a job running single task.

    Parameters
    ----------
    qdata : `~lsst.pipe.base.QuantumIterData`
        Quantum and some associated information.
    """
    def __init__(self, qdata):
        self.qdata = qdata
        # `multiprocessing.Process` executing the task; `None` until `start`
        # is called.
        self.process = None
        self.state = JobState.PENDING
        # Value of `time.time()` taken right after the process was started;
        # `None` until `start` is called.
        self.started = None

    def start(self, butler, quantumExecutor):
        """Start process which runs the task.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        """
        # Butler can have live database connections which is a problem with
        # fork-type activation. Make a copy of butler, this guarantees that
        # no database is open right after copy.
        butler = copy.copy(butler)
        taskDef = self.qdata.taskDef
        quantum = self.qdata.quantum
        self.process = multiprocessing.Process(
            target=quantumExecutor.execute, args=(taskDef, quantum, butler),
            name=f"task-{self.qdata.index}"
        )
        self.process.start()
        self.started = time.time()
        self.state = JobState.RUNNING

    def stop(self):
        """Stop the process.

        The process is first asked to terminate; if it is still alive after
        one second it is killed. In both cases the child is joined so that it
        is reaped by the parent. Does nothing if the job was never started.
        """
        process = self.process
        if process is None:
            # never started, nothing to stop
            return

        process.terminate()
        # give it 1 second to finish or KILL; join() returns as soon as the
        # process exits and also reaps it
        process.join(1.0)
        if process.is_alive():
            _LOG.debug("Killing process %s", process.name)
            process.kill()
            # reap the killed child so it does not linger as a zombie, but
            # do not block forever if it cannot be collected
            process.join(1.0)

    def __str__(self):
        return f"<{self.qdata.taskDef} dataId={self.qdata.quantum.dataId}>"

105 

106 

class _JobList:
    """Simple list of `_Job` instances with a few convenience methods.

    Parameters
    ----------
    iterable : iterable of `~lsst.pipe.base.QuantumIterData`
        Sequence of Quanta to execute. This has to be ordered according to
        task dependencies.
    """
    def __init__(self, iterable):
        self.jobs = list(map(_Job, iterable))

    def _jobsInState(self, *states):
        """Return jobs whose state is one of the given states.

        Parameters
        ----------
        *states : `JobState`
            States to select.

        Returns
        -------
        jobs : `list` [`_Job`]
            Matching jobs, in the same order as in ``self.jobs``.
        """
        return [entry for entry in self.jobs if entry.state in states]

    def pending(self):
        """Return list of jobs that wait for execution.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return self._jobsInState(JobState.PENDING)

    def running(self):
        """Return list of jobs that are executing.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return self._jobsInState(JobState.RUNNING)

    def finishedIds(self):
        """Return set of jobs IDs that finished successfully (not failed).

        Job ID is the index of the corresponding quantum.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        return {entry.qdata.index for entry in self._jobsInState(JobState.FINISHED)}

    def failedIds(self):
        """Return set of jobs IDs that failed for any reason.

        A job counts as failed if it failed itself, timed out, or was skipped
        because one of its dependencies failed.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        unsuccessful = self._jobsInState(JobState.FAILED, JobState.FAILED_DEP, JobState.TIMED_OUT)
        return {entry.qdata.index for entry in unsuccessful}

    def timedOutIds(self):
        """Return set of jobs IDs that timed out.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        return {entry.qdata.index for entry in self._jobsInState(JobState.TIMED_OUT)}

171 

172 

class MPGraphExecutorError(Exception):
    """Exception class for errors raised by MPGraphExecutor."""

177 

178 

class MPTimeoutError(MPGraphExecutorError):
    """Exception raised when task execution times out."""

183 

184 

class MPGraphExecutor(QuantumGraphExecutor):
    """Implementation of QuantumGraphExecutor using same-host multiprocess
    execution of Quanta.

    Parameters
    ----------
    numProc : `int`
        Number of processes to use for executing tasks.
    timeout : `float`
        Time in seconds to wait for tasks to finish.
    quantumExecutor : `QuantumExecutor`
        Executor for single quantum. For multiprocess-style execution when
        ``numProc`` is greater than one this instance must support pickle.
    failFast : `bool`, optional
        If set to ``True`` then stop processing on first error from any task.
    executionGraphFixup : `ExecutionGraphFixup`, optional
        Instance used for modification of execution graph.
    """
    def __init__(self, numProc, timeout, quantumExecutor, *, failFast=False, executionGraphFixup=None):
        self.numProc = numProc
        self.timeout = timeout
        self.quantumExecutor = quantumExecutor
        self.failFast = failFast
        self.executionGraphFixup = executionGraphFixup

    def execute(self, graph, butler):
        # Docstring inherited from QuantumGraphExecutor.execute
        quantaIter = self._fixupQuanta(graph.traverse())
        if self.numProc > 1:
            self._executeQuantaMP(quantaIter, butler)
        else:
            self._executeQuantaInProcess(quantaIter, butler)

    def _fixupQuanta(self, quantaIter):
        """Call fixup code to modify execution graph.

        Parameters
        ----------
        quantaIter : iterable of `~lsst.pipe.base.QuantumIterData`
            Quanta as originated from a quantum graph.

        Returns
        -------
        quantaIter : iterable of `~lsst.pipe.base.QuantumIterData`
            Possibly updated set of quanta, properly ordered for execution.
            If no fixup instance is configured the input is returned
            unchanged.

        Raises
        ------
        MPGraphExecutorError
            Raised if execution graph cannot be ordered after modification,
            i.e. it has dependency cycles.
        """
        if not self.executionGraphFixup:
            return quantaIter

        _LOG.debug("Call execution graph fixup method")
        quantaIter = self.executionGraphFixup.fixupQuanta(quantaIter)

        # need it correctly ordered as dependencies may have changed
        # after modification, so do topo-sort
        updatedQuanta = list(quantaIter)
        quanta = []
        ids = set()
        _LOG.debug("Re-ordering execution graph")
        while updatedQuanta:
            # find quantum that has all dependencies resolved already
            for i, qdata in enumerate(updatedQuanta):
                if ids.issuperset(qdata.dependencies):
                    _LOG.debug("Found next quanta to execute: %s", qdata)
                    del updatedQuanta[i]
                    ids.add(qdata.index)
                    # we could yield here but I want to detect cycles before
                    # returning anything from this method
                    quanta.append(qdata)
                    break
            else:
                # means remaining quanta have dependency cycle
                raise MPGraphExecutorError(
                    "Updated execution graph has dependency cycle.")

        return quanta

    def _executeQuantaInProcess(self, iterable, butler):
        """Execute all Quanta in current process.

        Parameters
        ----------
        iterable : iterable of `~lsst.pipe.base.QuantumIterData`
            Sequence of Quanta to execute. It is guaranteed that
            pre-requisites for a given Quantum will always appear before that
            Quantum.
        butler : `lsst.daf.butler.Butler`
            Data butler instance
        """
        for qdata in iterable:
            _LOG.debug("Executing %s", qdata)
            self.quantumExecutor.execute(qdata.taskDef, qdata.quantum, butler)

    def _executeQuantaMP(self, iterable, butler):
        """Execute all Quanta in separate processes.

        Parameters
        ----------
        iterable : iterable of `~lsst.pipe.base.QuantumIterData`
            Sequence of Quanta to execute. It is guaranteed that
            pre-requisites for a given Quantum will always appear before that
            Quantum.
        butler : `lsst.daf.butler.Butler`
            Data butler instance

        Raises
        ------
        MPTimeoutError
            Raised if a task timed out (immediately with ``failFast``,
            otherwise after all tasks were processed and every failure is a
            timeout).
        MPGraphExecutorError
            Raised if any task does not support multiprocessing, or if a task
            failed (immediately with ``failFast``, otherwise after all tasks
            were processed).
        """

        disableImplicitThreading()  # To prevent thread contention

        # re-pack input quantum data into jobs list
        jobs = _JobList(iterable)

        # check that all tasks can run in sub-process
        for job in jobs.jobs:
            taskDef = job.qdata.taskDef
            if not taskDef.taskClass.canMultiprocess:
                raise MPGraphExecutorError(f"Task {taskDef.taskName} does not support multiprocessing;"
                                           " use single process")

        while jobs.pending() or jobs.running():

            _LOG.debug("#pendingJobs: %s", len(jobs.pending()))
            _LOG.debug("#runningJobs: %s", len(jobs.running()))

            # See if any jobs have finished
            self._pollRunning(jobs)

            # see if we can start more jobs
            self._submitPending(jobs, butler)

            # Here we want to wait until one of the running jobs completes
            # but multiprocessing does not provide an API for that, for now
            # just sleep a little bit and go back to the loop.
            if jobs.running():
                time.sleep(0.1)

        failedIds = jobs.failedIds()
        if failedIds:
            # print list of failed jobs
            _LOG.error("Failed jobs:")
            for job in jobs.jobs:
                if job.state != JobState.FINISHED:
                    _LOG.error(" - %s: %s", job.state, job)

            # if any job failed raise an exception; it is a timeout error
            # only when every unsuccessful job timed out
            if failedIds == jobs.timedOutIds():
                raise MPTimeoutError("One or more tasks timed out during execution.")
            else:
                raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")

    @staticmethod
    def _stopRunning(jobs):
        """Stop every job that is still in RUNNING state.

        Parameters
        ----------
        jobs : `_JobList`
            All jobs known to the executor.
        """
        for job in jobs.running():
            _LOG.debug("Stopping job %s", job)
            job.stop()

    def _pollRunning(self, jobs):
        """Update state of running jobs that have exited or timed out.

        Parameters
        ----------
        jobs : `_JobList`
            All jobs known to the executor.

        Raises
        ------
        MPGraphExecutorError
            Raised if ``failFast`` is set and a job exited with non-zero
            exit code.
        MPTimeoutError
            Raised if ``failFast`` is set and a job ran longer than
            ``timeout`` seconds.
        """
        for job in jobs.running():
            proc = job.process
            if not proc.is_alive():
                _LOG.debug("finished: %s", job)
                # finished
                if proc.exitcode == 0:
                    job.state = JobState.FINISHED
                    _LOG.debug("success: %s", job)
                else:
                    job.state = JobState.FAILED
                    _LOG.debug("failed: %s", job)
                    if self.failFast:
                        # do not leave other sub-processes running behind
                        # once we give up
                        self._stopRunning(jobs)
                        raise MPGraphExecutorError(
                            f"Task {job} failed, exit code={proc.exitcode}."
                        )
                    _LOG.error(
                        "Task %s failed; processing will continue for remaining tasks.", job
                    )
            elif time.time() - job.started > self.timeout:
                # job is alive for too long
                job.state = JobState.TIMED_OUT
                _LOG.debug("Terminating job %s due to timeout", job)
                job.stop()
                if self.failFast:
                    # this job is no longer RUNNING, stop all the others
                    self._stopRunning(jobs)
                    raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
                _LOG.error(
                    "Timeout (%s sec) for task %s; task is killed, processing continues "
                    "for remaining tasks.", self.timeout, job
                )

    def _submitPending(self, jobs, butler):
        """Start pending jobs whose dependencies finished, skip those whose
        dependencies failed.

        Parameters
        ----------
        jobs : `_JobList`
            All jobs known to the executor.
        butler : `lsst.daf.butler.Butler`
            Data butler instance
        """
        # Compute these once per pass instead of once per pending job; they
        # are updated below whenever this method changes a job state.
        failedIds = jobs.failedIds()
        finishedIds = jobs.finishedIds()
        numRunning = len(jobs.running())

        for job in jobs.pending():

            # check all dependencies
            dependencies = job.qdata.dependencies
            if dependencies & failedIds:
                # upstream job has failed, skipping this
                job.state = JobState.FAILED_DEP
                # jobs later in the list that depend on this one must be
                # skipped in this same pass
                failedIds.add(job.qdata.index)
                _LOG.error("Upstream job failed for task %s, skipping this task.", job)
            elif dependencies <= finishedIds:
                # all dependencies have completed, can start new job
                if numRunning < self.numProc:
                    _LOG.debug("Submitting %s", job)
                    job.start(butler, self.quantumExecutor)
                    numRunning += 1

376 else: 

377 raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")