Coverage for python/lsst/ctrl/mpexec/mpGraphExecutor.py : 16%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of ctrl_mpexec.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Names exported by ``from <module> import *``.
__all__ = [
    "MPGraphExecutor",
    "MPGraphExecutorError",
    "MPTimeoutError",
]
24# -------------------------------
25# Imports of standard modules --
26# -------------------------------
27from enum import Enum
28import logging
29import multiprocessing
30import pickle
31import time
33# -----------------------------
34# Imports for other modules --
35# -----------------------------
36from .quantumGraphExecutor import QuantumGraphExecutor
37from lsst.base import disableImplicitThreading
# Module logger; its name is whatever follows the first "." in ``__name__``
# (an empty string, i.e. the root logger, if ``__name__`` has no dot).
_LOG = logging.getLogger(__name__.partition(".")[2])


# States that a job executing a task can be in:
#   PENDING    - job has not started yet
#   RUNNING    - job is currently executing
#   FINISHED   - job finished successfully
#   FAILED     - job execution failed (process returned non-zero status)
#   TIMED_OUT  - job was killed because it ran for too long
#   FAILED_DEP - one of the dependencies of this job has failed/timed out
JobState = Enum(
    "JobState",
    ["PENDING", "RUNNING", "FINISHED", "FAILED", "TIMED_OUT", "FAILED_DEP"],
)
class _Job:
    """Class representing a job running single task.

    Parameters
    ----------
    qdata : `~lsst.pipe.base.QuantumIterData`
        Quantum and some associated information.
    """
    def __init__(self, qdata):
        self.qdata = qdata
        # Child process executing the task; `None` until `start` is called.
        self.process = None
        self.state = JobState.PENDING
        # Value of ``time.time()`` taken right after the process was started;
        # `None` until `start` is called.
        self.started = None

    def start(self, butler, quantumExecutor):
        """Start process which runs the task.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        """
        # Butler can have live database connections which is a problem with
        # fork-type activation. Make a pickle of butler to pass that across
        # fork.
        butler_pickle = pickle.dumps(butler)
        taskDef = self.qdata.taskDef
        quantum = self.qdata.quantum
        # The target is a static method, so the process target does not hold
        # a reference to this job object; everything the child needs is
        # passed explicitly through ``args``.
        self.process = multiprocessing.Process(
            target=self._executeJob,
            args=(quantumExecutor, taskDef, quantum, butler_pickle),
            name=f"task-{self.qdata.index}"
        )
        self.process.start()
        self.started = time.time()
        self.state = JobState.RUNNING

    @staticmethod
    def _executeJob(quantumExecutor, taskDef, quantum, butler_pickle):
        """Execute a job with arguments.

        This is the target of the child process created by `start`.

        Parameters
        ----------
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        quantum : `~lsst.daf.butler.Quantum`
            Quantum for this task execution.
        butler_pickle : `bytes`
            Data butler instance in pickled form.
        """
        # The pickle was produced by `start` in the parent process.
        butler = pickle.loads(butler_pickle)
        quantumExecutor.execute(taskDef, quantum, butler)

    def stop(self):
        """Stop the process.

        The process is first asked to terminate; if it is still alive after
        one second it is killed. Does nothing if the job was never started.
        """
        if self.process is None:
            return
        self.process.terminate()
        # give it 1 second to finish or KILL; join() returns as soon as the
        # process exits, so there is no fixed delay when it stops promptly
        self.process.join(1.0)
        if self.process.is_alive():
            _LOG.debug("Killing process %s", self.process.name)
            self.process.kill()
            # collect the killed child so that it does not linger as a zombie
            self.process.join(1.0)

    def __str__(self):
        return f"<{self.qdata.taskDef} dataId={self.qdata.quantum.dataId}>"
class _JobList:
    """Simple list of `_Job` instances with a few convenience methods.

    Parameters
    ----------
    iterable : iterable of `~lsst.pipe.base.QuantumIterData`
        Sequence of Quanta to execute. This has to be ordered according to
        task dependencies.
    """
    def __init__(self, iterable):
        # one _Job per quantum, in the order given
        self.jobs = list(map(_Job, iterable))

    def _jobsInState(self, state):
        """Return list of jobs whose state equals ``state``."""
        return [candidate for candidate in self.jobs if candidate.state == state]

    def _idsInStates(self, *states):
        """Return set of quantum indices of jobs whose state is one of
        ``states``.
        """
        return {candidate.qdata.index for candidate in self.jobs if candidate.state in states}

    def pending(self):
        """Return list of jobs that wait for execution.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return self._jobsInState(JobState.PENDING)

    def running(self):
        """Return list of jobs that are executing.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return self._jobsInState(JobState.RUNNING)

    def finishedIds(self):
        """Return set of jobs IDs that finished successfully (not failed).

        Job ID is the index of the corresponding quantum.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        return self._idsInStates(JobState.FINISHED)

    def failedIds(self):
        """Return set of jobs IDs that failed for any reason.

        This covers jobs that failed themselves, jobs that timed out, and
        jobs skipped because of a failed dependency.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        return self._idsInStates(JobState.FAILED, JobState.FAILED_DEP, JobState.TIMED_OUT)

    def timedOutIds(self):
        """Return set of jobs IDs that timed out.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        return self._idsInStates(JobState.TIMED_OUT)
class MPGraphExecutorError(Exception):
    """Exception class for errors raised by MPGraphExecutor."""
class MPTimeoutError(MPGraphExecutorError):
    """Exception raised when task execution times out."""
class MPGraphExecutor(QuantumGraphExecutor):
    """Implementation of QuantumGraphExecutor using same-host multiprocess
    execution of Quanta.

    Parameters
    ----------
    numProc : `int`
        Number of processes to use for executing tasks.
    timeout : `float`
        Time in seconds to wait for tasks to finish.
    quantumExecutor : `QuantumExecutor`
        Executor for single quantum. For multiprocess-style execution when
        ``numProc`` is greater than one this instance must support pickle.
    failFast : `bool`, optional
        If set to ``True`` then stop processing on first error from any task.
    executionGraphFixup : `ExecutionGraphFixup`, optional
        Instance used for modification of execution graph.
    """
    def __init__(self, numProc, timeout, quantumExecutor, *, failFast=False, executionGraphFixup=None):
        self.numProc = numProc
        self.timeout = timeout
        self.quantumExecutor = quantumExecutor
        self.failFast = failFast
        self.executionGraphFixup = executionGraphFixup

    def execute(self, graph, butler):
        # Docstring inherited from QuantumGraphExecutor.execute
        quantaIter = self._fixupQuanta(graph.traverse())
        if self.numProc > 1:
            self._executeQuantaMP(quantaIter, butler)
        else:
            self._executeQuantaInProcess(quantaIter, butler)

    def _fixupQuanta(self, quantaIter):
        """Call fixup code to modify execution graph.

        Parameters
        ----------
        quantaIter : iterable of `~lsst.pipe.base.QuantumIterData`
            Quanta as originated from a quantum graph.

        Returns
        -------
        quantaIter : iterable of `~lsst.pipe.base.QuantumIterData`
            Possibly updated set of quanta, properly ordered for execution.
            The input is returned unchanged if no fixup instance is
            configured.

        Raises
        ------
        MPGraphExecutorError
            Raised if execution graph cannot be ordered after modification,
            i.e. it has dependency cycles.
        """
        if not self.executionGraphFixup:
            return quantaIter

        _LOG.debug("Call execution graph fixup method")
        quantaIter = self.executionGraphFixup.fixupQuanta(quantaIter)

        # need it correctly ordered as dependencies may have changed
        # after modification, so do topo-sort
        updatedQuanta = list(quantaIter)
        quanta = []
        # indices of quanta that are already placed in the output ordering
        ids = set()
        _LOG.debug("Re-ordering execution graph")
        while updatedQuanta:
            # find quantum that has all dependencies resolved already
            for i, qdata in enumerate(updatedQuanta):
                if ids.issuperset(qdata.dependencies):
                    _LOG.debug("Found next quanta to execute: %s", qdata)
                    # deleting while iterating is safe here because the
                    # loop is left immediately afterwards
                    del updatedQuanta[i]
                    ids.add(qdata.index)
                    # we could yield here but I want to detect cycles before
                    # returning anything from this method
                    quanta.append(qdata)
                    break
            else:
                # means remaining quanta have dependency cycle
                raise MPGraphExecutorError(
                    "Updated execution graph has dependency clycle.")

        return quanta

    def _executeQuantaInProcess(self, iterable, butler):
        """Execute all Quanta in current process.

        Parameters
        ----------
        iterable : iterable of `~lsst.pipe.base.QuantumIterData`
            Sequence of Quanta to execute. It is guaranteed that
            pre-requisites for a given Quantum will always appear before
            that Quantum.
        butler : `lsst.daf.butler.Butler`
            Data butler instance
        """
        for qdata in iterable:
            _LOG.debug("Executing %s", qdata)
            self.quantumExecutor.execute(qdata.taskDef, qdata.quantum, butler)

    @staticmethod
    def _stopRunningJobs(jobs):
        """Stop all jobs that are still in RUNNING state.

        Parameters
        ----------
        jobs : `_JobList`
            List of all jobs of the current execution.
        """
        for job in jobs.running():
            _LOG.debug("Stopping job %s", job)
            job.stop()

    def _executeQuantaMP(self, iterable, butler):
        """Execute all Quanta in separate processes.

        Parameters
        ----------
        iterable : iterable of `~lsst.pipe.base.QuantumIterData`
            Sequence of Quanta to execute. It is guaranteed that
            pre-requisites for a given Quantum will always appear before
            that Quantum.
        butler : `lsst.daf.butler.Butler`
            Data butler instance

        Raises
        ------
        MPGraphExecutorError
            Raised if any task does not support multiprocessing, or if one
            or more tasks failed.
        MPTimeoutError
            Raised if a task timed out; when ``failFast`` is not set this is
            raised at the end only if every failed job is a timed-out job.
        """

        disableImplicitThreading()  # To prevent thread contention

        # re-pack input quantum data into jobs list
        jobs = _JobList(iterable)

        # check that all tasks can run in sub-process
        for job in jobs.jobs:
            taskDef = job.qdata.taskDef
            if not taskDef.taskClass.canMultiprocess:
                raise MPGraphExecutorError(f"Task {taskDef.taskName} does not support multiprocessing;"
                                           " use single process")

        while jobs.pending() or jobs.running():

            _LOG.debug("#pendingJobs: %s", len(jobs.pending()))
            _LOG.debug("#runningJobs: %s", len(jobs.running()))

            # See if any jobs have finished
            for job in jobs.running():
                proc = job.process
                if not proc.is_alive():
                    _LOG.debug("finished: %s", job)
                    # finished
                    if proc.exitcode == 0:
                        job.state = JobState.FINISHED
                        _LOG.debug("success: %s", job)
                    else:
                        job.state = JobState.FAILED
                        _LOG.debug("failed: %s", job)
                        if self.failFast:
                            # do not leave other child processes running
                            # behind after we give up
                            self._stopRunningJobs(jobs)
                            raise MPGraphExecutorError(
                                f"Task {job} failed, exit code={proc.exitcode}."
                            )
                        else:
                            _LOG.error(
                                "Task %s failed; processing will continue for remaining tasks.", job
                            )
                else:
                    # check for timeout
                    now = time.time()
                    if now - job.started > self.timeout:
                        job.state = JobState.TIMED_OUT
                        _LOG.debug("Terminating job %s due to timeout", job)
                        job.stop()
                        if self.failFast:
                            # do not leave other child processes running
                            # behind after we give up
                            self._stopRunningJobs(jobs)
                            raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
                        else:
                            _LOG.error(
                                "Timeout (%s sec) for task %s; task is killed, processing continues "
                                "for remaining tasks.", self.timeout, job
                            )

            # see if we can start more jobs; compute the bookkeeping once
            # per pass and update it incrementally instead of rescanning the
            # whole job list for every pending job
            finishedIds = jobs.finishedIds()
            failedIds = jobs.failedIds()
            numRunning = len(jobs.running())
            for job in jobs.pending():

                # check all dependencies
                if job.qdata.dependencies & failedIds:
                    # upstream job has failed, skipping this
                    job.state = JobState.FAILED_DEP
                    # later jobs in this pass that depend on this one must
                    # be skipped as well
                    failedIds.add(job.qdata.index)
                    _LOG.error("Upstream job failed for task %s, skipping this task.", job)
                elif job.qdata.dependencies <= finishedIds:
                    # all dependencies have completed, can start new job
                    if numRunning < self.numProc:
                        _LOG.debug("Sumbitting %s", job)
                        job.start(butler, self.quantumExecutor)
                        numRunning += 1

            # Here we want to wait until one of the running jobs completes
            # but multiprocessing does not provide an API for that, for now
            # just sleep a little bit and go back to the loop.
            if jobs.running():
                time.sleep(0.1)

        if jobs.failedIds():
            # print list of failed jobs
            _LOG.error("Failed jobs:")
            for job in jobs.jobs:
                if job.state != JobState.FINISHED:
                    _LOG.error("  - %s: %s", job.state, job)

            # if any job failed raise an exception
            if jobs.failedIds() == jobs.timedOutIds():
                raise MPTimeoutError("One or more tasks timed out during execution.")
            else:
                raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")