Coverage for python/lsst/ctrl/mpexec/mpGraphExecutor.py : 16%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Public names exported by this module.
__all__ = ["MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError"]
24# -------------------------------
25# Imports of standard modules --
26# -------------------------------
27import copy
28from enum import Enum
29import logging
30import multiprocessing
31import time
33# -----------------------------
34# Imports for other modules --
35# -----------------------------
36from .quantumGraphExecutor import QuantumGraphExecutor
37from lsst.base import disableImplicitThreading
# Module logger; the first dotted component of the module name is dropped
# from the logger name.
_LOG = logging.getLogger(__name__.partition(".")[2])


class JobState(Enum):
    """Possible states for the executing task.

    Member values are consecutive integers starting at 1, in declaration
    order, exactly as produced by the functional ``Enum`` API.
    """

    # Job has not started yet.
    PENDING = 1
    # Job is currently executing.
    RUNNING = 2
    # Job finished successfully.
    FINISHED = 3
    # Job execution failed (process returned non-zero status).
    FAILED = 4
    # Job is killed due to too long execution time.
    TIMED_OUT = 5
    # One of the dependencies of this job has failed/timed out.
    FAILED_DEP = 6
class _Job:
    """Class representing a job running single task.

    Parameters
    ----------
    qdata : `~lsst.pipe.base.QuantumIterData`
        Quantum and some associated information.
    """

    def __init__(self, qdata):
        self.qdata = qdata
        # Subprocess handle; stays `None` until `start` is called.
        self.process = None
        self.state = JobState.PENDING
        # Value of ``time.time()`` taken right after the process is started.
        self.started = None

    def start(self, butler, quantumExecutor):
        """Start process which runs the task.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        """
        # Butler can have live database connections which is a problem with
        # fork-type activation. Make a copy of butler, this guarantees that
        # no database is open right after copy.
        butler = copy.copy(butler)
        self.process = multiprocessing.Process(
            target=quantumExecutor.execute,
            args=(self.qdata.taskDef, self.qdata.quantum, butler),
            name=f"task-{self.qdata.index}",
        )
        self.process.start()
        self.started = time.time()
        self.state = JobState.RUNNING

    def stop(self):
        """Stop the process.

        The process is first asked to terminate; if it is still alive after
        one second it is killed. Calling this method on a job that was never
        started does nothing.
        """
        process = self.process
        if process is None:
            # Job was never started, there is nothing to stop.
            return

        process.terminate()
        # Give it up to 1 second to finish; join() returns as soon as the
        # process exits, so a well-behaved process does not cost a full wait.
        process.join(1.0)
        if process.is_alive():
            _LOG.debug("Killing process %s", process.name)
            process.kill()
            # Reap the killed process so that it does not linger as a
            # zombie; the wait is bounded so this call cannot block forever.
            process.join(1.0)

    def __str__(self):
        return f"<{self.qdata.taskDef} dataId={self.qdata.quantum.dataId}>"
class _JobList:
    """Simple list of `_Job` instances with few convenience methods.

    Parameters
    ----------
    iterable : iterable of `~lsst.pipe.base.QuantumIterData`
        Sequence of Quanta to execute. This has to be ordered according to
        task dependencies.
    """

    def __init__(self, iterable):
        self.jobs = [_Job(qdata) for qdata in iterable]

    def _idsInStates(self, *states):
        """Return IDs of all jobs whose state is one of ``states``.

        Job ID is the index of the corresponding quantum.

        Parameters
        ----------
        *states : `JobState`
            States to select.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        return {job.qdata.index for job in self.jobs if job.state in states}

    def pending(self):
        """Return list of jobs that wait for execution.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return [job for job in self.jobs if job.state == JobState.PENDING]

    def running(self):
        """Return list of jobs that are executing.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return [job for job in self.jobs if job.state == JobState.RUNNING]

    def finishedIds(self):
        """Return set of jobs IDs that finished successfully (not failed).

        Job ID is the index of the corresponding quantum.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        return self._idsInStates(JobState.FINISHED)

    def failedIds(self):
        """Return set of jobs IDs that failed for any reason.

        This includes jobs that failed, timed out, or were skipped because
        one of their dependencies failed.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        return self._idsInStates(JobState.FAILED, JobState.FAILED_DEP, JobState.TIMED_OUT)

    def timedOutIds(self):
        """Return set of jobs IDs that timed out.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        return self._idsInStates(JobState.TIMED_OUT)
class MPGraphExecutorError(Exception):
    """Exception class for errors raised by MPGraphExecutor."""
class MPTimeoutError(MPGraphExecutorError):
    """Exception raised when task execution times out."""
185class MPGraphExecutor(QuantumGraphExecutor):
186 """Implementation of QuantumGraphExecutor using same-host multiprocess
187 execution of Quanta.
189 Parameters
190 ----------
191 numProc : `int`
192 Number of processes to use for executing tasks.
193 timeout : `float`
194 Time in seconds to wait for tasks to finish.
195 quantumExecutor : `QuantumExecutor`
196 Executor for single quantum. For multiprocess-style execution when
197 ``numProc`` is greater than one this instance must support pickle.
198 failFast : `bool`, optional
199 If set to ``True`` then stop processing on first error from any task.
200 executionGraphFixup : `ExecutionGraphFixup`, optional
201 Instance used for modification of execution graph.
202 """
203 def __init__(self, numProc, timeout, quantumExecutor, *, failFast=False, executionGraphFixup=None):
204 self.numProc = numProc
205 self.timeout = timeout
206 self.quantumExecutor = quantumExecutor
207 self.failFast = failFast
208 self.executionGraphFixup = executionGraphFixup
210 def execute(self, graph, butler):
211 # Docstring inherited from QuantumGraphExecutor.execute
212 quantaIter = self._fixupQuanta(graph.traverse())
213 if self.numProc > 1:
214 self._executeQuantaMP(quantaIter, butler)
215 else:
216 self._executeQuantaInProcess(quantaIter, butler)
218 def _fixupQuanta(self, quantaIter):
219 """Call fixup code to modify execution graph.
221 Parameters
222 ----------
223 quantaIter : iterable of `~lsst.pipe.base.QuantumIterData`
224 Quanta as originated from a quantum graph.
226 Returns
227 -------
228 quantaIter : iterable of `~lsst.pipe.base.QuantumIterData`
229 Possibly updated set of quanta, properly ordered for execution.
231 Raises
232 ------
233 MPGraphExecutorError
234 Raised if execution graph cannot be ordered after modification,
235 i.e. it has dependency cycles.
236 """
237 if not self.executionGraphFixup:
238 return quantaIter
240 _LOG.debug("Call execution graph fixup method")
241 quantaIter = self.executionGraphFixup.fixupQuanta(quantaIter)
243 # need it correctly ordered as dependencies may have changed
244 # after modification, so do topo-sort
245 updatedQuanta = list(quantaIter)
246 quanta = []
247 ids = set()
248 _LOG.debug("Re-ordering execution graph")
249 while updatedQuanta:
250 # find quantum that has all dependencies resolved already
251 for i, qdata in enumerate(updatedQuanta):
252 if ids.issuperset(qdata.dependencies):
253 _LOG.debug("Found next quanta to execute: %s", qdata)
254 del updatedQuanta[i]
255 ids.add(qdata.index)
256 # we could yield here but I want to detect cycles before
257 # returning anything from this method
258 quanta.append(qdata)
259 break
260 else:
261 # means remaining quanta have dependency cycle
262 raise MPGraphExecutorError(
263 "Updated execution graph has dependency clycle.")
265 return quanta
267 def _executeQuantaInProcess(self, iterable, butler):
268 """Execute all Quanta in current process.
270 Parameters
271 ----------
272 iterable : iterable of `~lsst.pipe.base.QuantumIterData`
273 Sequence if Quanta to execute. It is guaranteed that re-requisites
274 for a given Quantum will always appear before that Quantum.
275 butler : `lsst.daf.butler.Butler`
276 Data butler instance
277 """
278 for qdata in iterable:
279 _LOG.debug("Executing %s", qdata)
280 self.quantumExecutor.execute(qdata.taskDef, qdata.quantum, butler)
282 def _executeQuantaMP(self, iterable, butler):
283 """Execute all Quanta in separate processes.
285 Parameters
286 ----------
287 iterable : iterable of `~lsst.pipe.base.QuantumIterData`
288 Sequence if Quanta to execute. It is guaranteed that re-requisites
289 for a given Quantum will always appear before that Quantum.
290 butler : `lsst.daf.butler.Butler`
291 Data butler instance
292 """
294 disableImplicitThreading() # To prevent thread contention
296 # re-pack input quantum data into jobs list
297 jobs = _JobList(iterable)
299 # check that all tasks can run in sub-process
300 for job in jobs.jobs:
301 taskDef = job.qdata.taskDef
302 if not taskDef.taskClass.canMultiprocess:
303 raise MPGraphExecutorError(f"Task {taskDef.taskName} does not support multiprocessing;"
304 " use single process")
306 while jobs.pending() or jobs.running():
308 _LOG.debug("#pendingJobs: %s", len(jobs.pending()))
309 _LOG.debug("#runningJobs: %s", len(jobs.running()))
311 # See if any jobs have finished
312 for job in jobs.running():
313 proc = job.process
314 if not proc.is_alive():
315 _LOG.debug("finished: %s", job)
316 # finished
317 if proc.exitcode == 0:
318 job.state = JobState.FINISHED
319 _LOG.debug("success: %s", job)
320 else:
321 job.state = JobState.FAILED
322 _LOG.debug("failed: %s", job)
323 if self.failFast:
324 raise MPGraphExecutorError(
325 f"Task {job} failed, exit code={proc.exitcode}."
326 )
327 else:
328 _LOG.error(
329 "Task %s failed; processing will continue for remaining tasks.", job
330 )
331 else:
332 # check for timeout
333 now = time.time()
334 if now - job.started > self.timeout:
335 job.state = JobState.TIMED_OUT
336 _LOG.debug("Terminating job %s due to timeout", job)
337 job.stop()
338 if self.failFast:
339 raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
340 else:
341 _LOG.error(
342 "Timeout (%s sec) for task %s; task is killed, processing continues "
343 "for remaining tasks.", self.timeout, job
344 )
346 # see if we can start more jobs
347 for job in jobs.pending():
349 # check all dependencies
350 if job.qdata.dependencies & jobs.failedIds():
351 # upstream job has failed, skipping this
352 job.state = JobState.FAILED_DEP
353 _LOG.error("Upstream job failed for task %s, skipping this task.", job)
354 elif job.qdata.dependencies <= jobs.finishedIds():
355 # all dependencies have completed, can start new job
356 if len(jobs.running()) < self.numProc:
357 _LOG.debug("Sumbitting %s", job)
358 job.start(butler, self.quantumExecutor)
360 # Here we want to wait until one of the running jobs completes
361 # but multiprocessing does not provide an API for that, for now
362 # just sleep a little bit and go back to the loop.
363 if jobs.running():
364 time.sleep(0.1)
366 if jobs.failedIds():
367 # print list of failed jobs
368 _LOG.error("Failed jobs:")
369 for job in jobs.jobs:
370 if job.state != JobState.FINISHED:
371 _LOG.error(" - %s: %s", job.state, job)
373 # if any job failed raise an exception
374 if jobs.failedIds() == jobs.timedOutIds():
375 raise MPTimeoutError("One or more tasks timed out during execution.")
376 else:
377 raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")