Coverage for python/lsst/ctrl/mpexec/mpGraphExecutor.py : 16%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of ctrl_mpexec.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Public API of this module: the names exported by ``from ... import *``.
__all__ = [
    "MPGraphExecutor",
    "MPGraphExecutorError",
    "MPTimeoutError",
]
24# -------------------------------
25# Imports of standard modules --
26# -------------------------------
27from enum import Enum
28import logging
29import multiprocessing
30import pickle
31import time
33# -----------------------------
34# Imports for other modules --
35# -----------------------------
36from .quantumGraphExecutor import QuantumGraphExecutor
37from lsst.base import disableImplicitThreading
# Module-level logger. The logger name is the module name with everything up
# to and including the first "." removed; ``str.partition`` always returns a
# 3-tuple, so the last element is the part after the separator (an empty
# string when the module name contains no dot).
_LOG = logging.getLogger(__name__.partition(".")[-1])
class JobState(Enum):
    """Possible states for the executing task."""

    # Job has not started yet.
    PENDING = 1
    # Job is currently executing.
    RUNNING = 2
    # Job finished successfully.
    FINISHED = 3
    # Job execution failed (process returned non-zero status).
    FAILED = 4
    # Job is killed due to too long execution time.
    TIMED_OUT = 5
    # One of the dependencies of this job has failed/timed out.
    FAILED_DEP = 6
class _Job:
    """Class representing a job running single task.

    Parameters
    ----------
    qdata : `~lsst.pipe.base.QuantumIterData`
        Quantum and some associated information.
    """

    def __init__(self, qdata):
        self.qdata = qdata
        # `multiprocessing.Process` executing the task; `None` until `start`
        # is called and again after a successful `cleanup`.
        self.process = None
        self.state = JobState.PENDING
        # Value of `time.time` recorded right after the process was started.
        self.started = None

    def start(self, butler, quantumExecutor):
        """Start process which runs the task.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        """
        # Butler can have live database connections which is a problem with
        # fork-type activation. Make a pickle of butler to pass that across
        # fork.
        butler_pickle = pickle.dumps(butler)
        taskDef = self.qdata.taskDef
        quantum = self.qdata.quantum
        self.process = multiprocessing.Process(
            target=self._executeJob,
            args=(quantumExecutor, taskDef, quantum, butler_pickle),
            name=f"task-{self.qdata.index}"
        )
        self.process.start()
        self.started = time.time()
        self.state = JobState.RUNNING

    @staticmethod
    def _executeJob(quantumExecutor, taskDef, quantum, butler_pickle):
        """Execute a job with arguments.

        This is a static method on purpose: it is the ``target`` of the
        sub-process and needs nothing from the `_Job` instance, so the
        instance (and the `multiprocessing.Process` it references) is not
        dragged along with the target.

        Parameters
        ----------
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        quantum : `~lsst.daf.butler.Quantum`
            Quantum for this task execution.
        butler_pickle : `bytes`
            Data butler instance in pickled form.
        """
        # The pickle is produced by `start` in the parent process, it is not
        # external input.
        butler = pickle.loads(butler_pickle)
        quantumExecutor.execute(taskDef, quantum, butler)

    def stop(self):
        """Stop the process.

        The process is asked to terminate first; if it is still alive one
        second later it is killed. Does nothing if there is no process.
        """
        if self.process is None:
            return
        self.process.terminate()
        # give it 1 second to finish or KILL; join() returns as soon as the
        # process exits so we do not wait longer than necessary
        self.process.join(timeout=1.0)
        if self.process.is_alive():
            _LOG.debug("Killing process %s", self.process.name)
            self.process.kill()

    def cleanup(self):
        """Release processes resources, has to be called for each finished
        process.

        Nothing is released while the process is still alive; in that case
        ``self.process`` is kept so that cleanup can be retried later.
        """
        if self.process and not self.process.is_alive():
            self.process.close()
            self.process = None

    def __str__(self):
        return f"<{self.qdata.taskDef} dataId={self.qdata.quantum.dataId}>"
class _JobList:
    """Ordered collection of `_Job` instances with a few query helpers.

    Parameters
    ----------
    iterable : iterable of `~lsst.pipe.base.QuantumIterData`
        Sequence of Quanta to execute. This has to be ordered according to
        task dependencies.
    """

    def __init__(self, iterable):
        # One job per quantum, in the order given.
        self.jobs = list(map(_Job, iterable))

    def _jobsInStates(self, *states):
        """Return jobs whose state is one of ``states``, in list order."""
        return [candidate for candidate in self.jobs if candidate.state in states]

    def _idsInStates(self, *states):
        """Return quantum indices of jobs whose state is one of ``states``."""
        return {candidate.qdata.index for candidate in self._jobsInStates(*states)}

    def pending(self):
        """Return list of jobs that wait for execution.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return self._jobsInStates(JobState.PENDING)

    def running(self):
        """Return list of jobs that are executing.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return self._jobsInStates(JobState.RUNNING)

    def finishedIds(self):
        """Return set of jobs IDs that finished successfully (not failed).

        Job ID is the index of the corresponding quantum.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        return self._idsInStates(JobState.FINISHED)

    def failedIds(self):
        """Return set of jobs IDs that failed for any reason.

        This covers jobs that failed themselves, jobs skipped because of a
        failed dependency, and jobs that timed out.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        return self._idsInStates(JobState.FAILED, JobState.FAILED_DEP, JobState.TIMED_OUT)

    def timedOutIds(self):
        """Return set of jobs IDs that timed out.

        Returns
        -------
        jobsIds : `set` [`int`]
            Set of integer job IDs.
        """
        return self._idsInStates(JobState.TIMED_OUT)

    def cleanup(self):
        """Do periodic cleanup for jobs that did not finish correctly.

        If timed out jobs are killed but take too long to stop then regular
        cleanup will not work for them. Here we check all timed out jobs
        periodically and do cleanup if they managed to die by this time.
        """
        for candidate in self.jobs:
            if candidate.state != JobState.TIMED_OUT:
                continue
            # A process that is still attached has not been released yet.
            if candidate.process is not None:
                candidate.cleanup()
class MPGraphExecutorError(Exception):
    """Error reported by `MPGraphExecutor` when graph execution fails."""
class MPTimeoutError(MPGraphExecutorError):
    """Error reported when execution of a task exceeds the allowed time."""
class MPGraphExecutor(QuantumGraphExecutor):
    """Implementation of QuantumGraphExecutor using same-host multiprocess
    execution of Quanta.

    Parameters
    ----------
    numProc : `int`
        Number of processes to use for executing tasks.
    timeout : `float`
        Time in seconds to wait for tasks to finish.
    quantumExecutor : `QuantumExecutor`
        Executor for single quantum. For multiprocess-style execution when
        ``numProc`` is greater than one this instance must support pickle.
    failFast : `bool`, optional
        If set to ``True`` then stop processing on first error from any task.
    executionGraphFixup : `ExecutionGraphFixup`, optional
        Instance used for modification of execution graph.
    """

    def __init__(self, numProc, timeout, quantumExecutor, *, failFast=False, executionGraphFixup=None):
        self.numProc = numProc
        self.timeout = timeout
        self.quantumExecutor = quantumExecutor
        self.failFast = failFast
        self.executionGraphFixup = executionGraphFixup

    def execute(self, graph, butler):
        # Docstring inherited from QuantumGraphExecutor.execute
        quantaIter = self._fixupQuanta(graph.traverse())
        if self.numProc > 1:
            self._executeQuantaMP(quantaIter, butler)
        else:
            self._executeQuantaInProcess(quantaIter, butler)

    def _fixupQuanta(self, quantaIter):
        """Call fixup code to modify execution graph.

        Parameters
        ----------
        quantaIter : iterable of `~lsst.pipe.base.QuantumIterData`
            Quanta as originated from a quantum graph.

        Returns
        -------
        quantaIter : iterable of `~lsst.pipe.base.QuantumIterData`
            Possibly updated set of quanta, properly ordered for execution.
            The input is returned unchanged if no fixup instance was given.

        Raises
        ------
        MPGraphExecutorError
            Raised if execution graph cannot be ordered after modification,
            i.e. it has dependency cycles.
        """
        if not self.executionGraphFixup:
            return quantaIter

        _LOG.debug("Call execution graph fixup method")
        quantaIter = self.executionGraphFixup.fixupQuanta(quantaIter)

        # need it correctly ordered as dependencies may have changed
        # after modification, so do topo-sort
        updatedQuanta = list(quantaIter)
        quanta = []
        # indices of the quanta already placed in the ordered output
        ids = set()
        _LOG.debug("Re-ordering execution graph")
        while updatedQuanta:
            # find quantum that has all dependencies resolved already
            for i, qdata in enumerate(updatedQuanta):
                if ids.issuperset(qdata.dependencies):
                    _LOG.debug("Found next quanta to execute: %s", qdata)
                    # safe to delete while enumerating, we break right after
                    del updatedQuanta[i]
                    ids.add(qdata.index)
                    # we could yield here but I want to detect cycles before
                    # returning anything from this method
                    quanta.append(qdata)
                    break
            else:
                # means remaining quanta have dependency cycle
                raise MPGraphExecutorError(
                    "Updated execution graph has dependency cycle.")

        return quanta

    def _executeQuantaInProcess(self, iterable, butler):
        """Execute all Quanta in current process.

        Parameters
        ----------
        iterable : iterable of `~lsst.pipe.base.QuantumIterData`
            Sequence of Quanta to execute. It is guaranteed that
            pre-requisites for a given Quantum will always appear before that
            Quantum.
        butler : `lsst.daf.butler.Butler`
            Data butler instance
        """
        for qdata in iterable:
            _LOG.debug("Executing %s", qdata)
            self.quantumExecutor.execute(qdata.taskDef, qdata.quantum, butler)

    @staticmethod
    def _stopRunningJobs(jobs):
        """Stop all jobs that are still in running state and release their
        process resources.

        Used before raising in fail-fast mode so that sub-processes are not
        left running after the executor has given up.

        Parameters
        ----------
        jobs : `_JobList`
            All jobs known to the executor.
        """
        for job in jobs.running():
            if job.process.is_alive():
                _LOG.debug("Stopping running job %s", job)
                job.stop()
            job.cleanup()

    def _executeQuantaMP(self, iterable, butler):
        """Execute all Quanta in separate processes.

        Parameters
        ----------
        iterable : iterable of `~lsst.pipe.base.QuantumIterData`
            Sequence of Quanta to execute. It is guaranteed that
            pre-requisites for a given Quantum will always appear before that
            Quantum.
        butler : `lsst.daf.butler.Butler`
            Data butler instance

        Raises
        ------
        MPGraphExecutorError
            Raised if a task does not support multiprocessing, or if any
            task failed (immediately in fail-fast mode, otherwise after all
            remaining tasks were processed).
        MPTimeoutError
            Raised if a task timed out in fail-fast mode, or at the end of
            processing if every failed job is a timed out job.
        """

        disableImplicitThreading()  # To prevent thread contention

        # re-pack input quantum data into jobs list
        jobs = _JobList(iterable)

        # check that all tasks can run in sub-process
        for job in jobs.jobs:
            taskDef = job.qdata.taskDef
            if not taskDef.taskClass.canMultiprocess:
                raise MPGraphExecutorError(f"Task {taskDef.taskName} does not support multiprocessing;"
                                           " use single process")

        while jobs.pending() or jobs.running():

            _LOG.debug("#pendingJobs: %s", len(jobs.pending()))
            _LOG.debug("#runningJobs: %s", len(jobs.running()))

            # See if any jobs have finished
            for job in jobs.running():
                if not job.process.is_alive():
                    _LOG.debug("finished: %s", job)
                    # finished; exit code has to be read before cleanup()
                    # releases the process object
                    exitcode = job.process.exitcode
                    if exitcode == 0:
                        job.state = JobState.FINISHED
                        job.cleanup()
                        _LOG.debug("success: %s", job)
                    else:
                        job.state = JobState.FAILED
                        job.cleanup()
                        _LOG.debug("failed: %s", job)
                        if self.failFast:
                            # do not leave other sub-processes running
                            self._stopRunningJobs(jobs)
                            raise MPGraphExecutorError(
                                f"Task {job} failed, exit code={exitcode}."
                            )
                        else:
                            _LOG.error(
                                "Task %s failed; processing will continue for remaining tasks.", job
                            )
                else:
                    # check for timeout
                    now = time.time()
                    if now - job.started > self.timeout:
                        job.state = JobState.TIMED_OUT
                        _LOG.debug("Terminating job %s due to timeout", job)
                        job.stop()
                        # may be a no-op if the process is still dying, the
                        # periodic jobs.cleanup() below retries it
                        job.cleanup()
                        if self.failFast:
                            # do not leave other sub-processes running
                            self._stopRunningJobs(jobs)
                            raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
                        else:
                            _LOG.error(
                                "Timeout (%s sec) for task %s; task is killed, processing continues "
                                "for remaining tasks.", self.timeout, job
                            )

            # see if we can start more jobs; compute the ID sets and running
            # count once per pass and keep them up to date while iterating
            failedIds = jobs.failedIds()
            finishedIds = jobs.finishedIds()
            numRunning = len(jobs.running())
            for job in jobs.pending():

                # check all dependencies
                dependencies = job.qdata.dependencies
                if dependencies & failedIds:
                    # upstream job has failed, skipping this
                    job.state = JobState.FAILED_DEP
                    # jobs depending on this one must be skipped too
                    failedIds.add(job.qdata.index)
                    _LOG.error("Upstream job failed for task %s, skipping this task.", job)
                elif dependencies <= finishedIds:
                    # all dependencies have completed, can start new job
                    if numRunning < self.numProc:
                        _LOG.debug("Submitting %s", job)
                        job.start(butler, self.quantumExecutor)
                        numRunning += 1

            # Do cleanup for timed out jobs if necessary.
            jobs.cleanup()

            # Here we want to wait until one of the running jobs completes
            # but multiprocessing does not provide an API for that, for now
            # just sleep a little bit and go back to the loop.
            if jobs.running():
                time.sleep(0.1)

        if jobs.failedIds():
            # print list of failed jobs
            _LOG.error("Failed jobs:")
            for job in jobs.jobs:
                if job.state != JobState.FINISHED:
                    _LOG.error("  - %s: %s", job.state, job)

            # if any job failed raise an exception
            if jobs.failedIds() == jobs.timedOutIds():
                raise MPTimeoutError("One or more tasks timed out during execution.")
            else:
                raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")