Coverage for python/lsst/ctrl/mpexec/mpGraphExecutor.py : 16%

# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
__all__ = ["MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError"]

# -------------------------------
# Imports of standard modules --
# -------------------------------
from enum import Enum
import logging
import multiprocessing
import pickle
import sys
import time

from lsst.pipe.base.graph.graph import QuantumGraph

# -----------------------------
# Imports for other modules --
# -----------------------------
from .quantumGraphExecutor import QuantumGraphExecutor
from lsst.base import disableImplicitThreading
from lsst.daf.butler.cli.cliLog import CliLog

_LOG = logging.getLogger(__name__.partition(".")[2])
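# For example, if ``__name__`` is "lsst.ctrl.mpexec.mpGraphExecutor", then
# ``__name__.partition(".")[2]`` is "ctrl.mpexec.mpGraphExecutor", i.e. the
# logger name with the leading "lsst." component stripped.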

# Possible states for the executing task:
#  - PENDING: job has not started yet
#  - RUNNING: job is currently executing
#  - FINISHED: job finished successfully
#  - FAILED: job execution failed (process returned non-zero status)
#  - TIMED_OUT: job was killed because it exceeded the execution timeout
#  - FAILED_DEP: one of the dependencies of this job has failed/timed out
JobState = Enum("JobState", "PENDING RUNNING FINISHED FAILED TIMED_OUT FAILED_DEP")
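
# The functional Enum API above auto-assigns integer values starting at 1,
# so, for illustration:
#
#     JobState.PENDING.value == 1
#     JobState.FAILED.name == "FAILED"
#     job.state == JobState.RUNNING   # the state test used throughout below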


class _Job:
    """Class representing a job running a single task.

    Parameters
    ----------
    qnode : `~lsst.pipe.base.QuantumNode`
        Quantum and some associated information.
    """
    def __init__(self, qnode):
        self.qnode = qnode
        self.process = None
        self.state = JobState.PENDING
        self.started = None

    def start(self, butler, quantumExecutor, startMethod=None):
        """Start process which runs the task.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        startMethod : `str`, optional
            Start method from `multiprocessing` module.
        """
        # The butler can hold live database connections, which is a problem
        # with fork-type start methods. Pickle the butler explicitly so it is
        # passed across the fork in serialized form. The quantum has to be
        # unpickled after the butler, which is why both are pickled manually
        # here.
        butler_pickle = pickle.dumps(butler)
        quantum_pickle = pickle.dumps(self.qnode.quantum)
        taskDef = self.qnode.taskDef
        logConfigState = CliLog.configState
        mp_ctx = multiprocessing.get_context(startMethod)
        self.process = mp_ctx.Process(
            target=_Job._executeJob,
            args=(quantumExecutor, taskDef, quantum_pickle, butler_pickle, logConfigState),
            name=f"task-{self.qnode.nodeId.number}"
        )
        self.process.start()
        self.started = time.time()
        self.state = JobState.RUNNING

    @staticmethod
    def _executeJob(quantumExecutor, taskDef, quantum_pickle, butler_pickle, logConfigState):
        """Execute a job with arguments.

        Parameters
        ----------
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        quantum_pickle : `bytes`
            Quantum for this task execution in pickled form.
        butler_pickle : `bytes`
            Data butler instance in pickled form.
        logConfigState : `list`
            Logging configuration state from the parent process, replayed
            here if this new process does not have logging configured yet.
        """
        if logConfigState and not CliLog.configState:
            # This means we are in a new spawned Python process and have to
            # re-initialize logging.
            CliLog.replayConfigState(logConfigState)

        butler = pickle.loads(butler_pickle)
        quantum = pickle.loads(quantum_pickle)
        quantumExecutor.execute(taskDef, quantum, butler)

    def stop(self):
        """Stop the process.
        """
        self.process.terminate()
        # Give it one second to finish gracefully, then kill it outright.
        for i in range(10):
            time.sleep(0.1)
            if not self.process.is_alive():
                break
        else:
            _LOG.debug("Killing process %s", self.process.name)
            self.process.kill()

    def cleanup(self):
        """Release process resources; has to be called for each finished
        process.
        """
        if self.process and not self.process.is_alive():
            self.process.close()
            self.process = None

    def __str__(self):
        return f"<{self.qnode.taskDef} dataId={self.qnode.quantum.dataId}>"


class _JobList:
    """Simple list of _Job instances with a few convenience methods.

    Parameters
    ----------
    iterable : iterable of `~lsst.pipe.base.QuantumNode`
        Sequence of Quanta to execute. This has to be ordered according to
        task dependencies.
    """
    def __init__(self, iterable):
        self.jobs = [_Job(qnode) for qnode in iterable]

    def pending(self):
        """Return list of jobs that wait for execution.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return [job for job in self.jobs if job.state == JobState.PENDING]

    def running(self):
        """Return list of jobs that are executing.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return [job for job in self.jobs if job.state == JobState.RUNNING]

    def finishedNodes(self):
        """Return set of QuantumNodes that finished successfully (not failed).

        Returns
        -------
        nodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes that have successfully finished.
        """
        return set(job.qnode for job in self.jobs if job.state == JobState.FINISHED)

    def failedNodes(self):
        """Return set of QuantumNodes that failed for any reason.

        Returns
        -------
        nodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes that failed during processing.
        """
        return set(job.qnode for job in self.jobs
                   if job.state in (JobState.FAILED, JobState.FAILED_DEP, JobState.TIMED_OUT))

    def timedOutNodes(self):
        """Return set of QuantumNodes that timed out.

        Returns
        -------
        nodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes whose jobs timed out.
        """
        return set(job.qnode for job in self.jobs if job.state == JobState.TIMED_OUT)

    def cleanup(self):
        """Do periodic cleanup for jobs that did not finish correctly.

        Timed-out jobs are killed, but if they take too long to stop the
        regular cleanup does not work for them. Here we re-check all
        timed-out jobs periodically and clean up those that have managed to
        die by this time.
        """
        for job in self.jobs:
            if job.state == JobState.TIMED_OUT and job.process is not None:
                job.cleanup()


class MPGraphExecutorError(Exception):
    """Exception class for errors raised by MPGraphExecutor.
    """
    pass


class MPTimeoutError(MPGraphExecutorError):
    """Exception raised when task execution times out.
    """
    pass


class MPGraphExecutor(QuantumGraphExecutor):
    """Implementation of QuantumGraphExecutor using same-host multiprocess
    execution of Quanta.

    Parameters
    ----------
    numProc : `int`
        Number of processes to use for executing tasks.
    timeout : `float`
        Time in seconds to wait for tasks to finish.
    quantumExecutor : `QuantumExecutor`
        Executor for single quantum. For multiprocess-style execution when
        ``numProc`` is greater than one this instance must support pickle.
    startMethod : `str`, optional
        Start method from `multiprocessing` module, `None` selects the best
        one for the current platform.
    failFast : `bool`, optional
        If set to ``True`` then stop processing on first error from any task.
    executionGraphFixup : `ExecutionGraphFixup`, optional
        Instance used for modification of execution graph.
    """
    def __init__(self, numProc, timeout, quantumExecutor, *,
                 startMethod=None, failFast=False, executionGraphFixup=None):
        self.numProc = numProc
        self.timeout = timeout
        self.quantumExecutor = quantumExecutor
        self.failFast = failFast
        self.executionGraphFixup = executionGraphFixup

        # Default start method is "spawn" for macOS and "fork" for Linux;
        # None for all other platforms, which uses the multiprocessing
        # default.
        if startMethod is None:
            methods = dict(linux="fork", darwin="spawn")
            startMethod = methods.get(sys.platform)
        self.startMethod = startMethod
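
    # A minimal usage sketch (illustrative only; ``my_quantum_executor``,
    # ``qgraph`` and ``butler`` are placeholder names, not defined in this
    # module):
    #
    #     executor = MPGraphExecutor(numProc=4, timeout=3600,
    #                                quantumExecutor=my_quantum_executor)
    #     executor.execute(qgraph, butler)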

    def execute(self, graph, butler):
        # Docstring inherited from QuantumGraphExecutor.execute
        graph = self._fixupQuanta(graph)
        if self.numProc > 1:
            self._executeQuantaMP(graph, butler)
        else:
            self._executeQuantaInProcess(graph, butler)

    def _fixupQuanta(self, graph: QuantumGraph):
        """Call fixup code to modify execution graph.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` to modify.

        Returns
        -------
        graph : `QuantumGraph`
            Modified `QuantumGraph`.

        Raises
        ------
        MPGraphExecutorError
            Raised if execution graph cannot be ordered after modification,
            i.e. it has dependency cycles.
        """
        if not self.executionGraphFixup:
            return graph

        _LOG.debug("Call execution graph fixup method")
        graph = self.executionGraphFixup.fixupQuanta(graph)

        # Detect whether a cycle has now been created within the graph
        if graph.findCycle():
            raise MPGraphExecutorError(
                "Updated execution graph has dependency cycle.")

        return graph
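
    # Sketch of a fixup object (illustrative; ``NoOpFixup`` is not part of
    # this module): anything passed as ``executionGraphFixup`` only needs a
    # ``fixupQuanta(graph)`` method returning a possibly-modified
    # QuantumGraph, as called above. An identity fixup would be:
    #
    #     class NoOpFixup:
    #         def fixupQuanta(self, graph):
    #             return graph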

    def _executeQuantaInProcess(self, graph, butler):
        """Execute all Quanta in current process.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed.
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        """
        # Note that in the non-MP case any failed task raises an exception
        # that kills the whole run. In general we cannot guarantee exception
        # safety, so the easiest and safest thing is to let it die.
        count, totalCount = 0, len(graph)
        for qnode in graph:
            _LOG.debug("Executing %s", qnode)
            self.quantumExecutor.execute(qnode.taskDef, qnode.quantum, butler)
            count += 1
            _LOG.info("Executed %d quanta, %d remain out of total %d quanta.",
                      count, totalCount - count, totalCount)

    def _executeQuantaMP(self, graph, butler):
        """Execute all Quanta in separate processes.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed.
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        """
        disableImplicitThreading()  # To prevent thread contention

        _LOG.debug("Using %r for multiprocessing start method", self.startMethod)

        # Re-pack input quantum data into a jobs list
        jobs = _JobList(graph)

        # Check that all tasks can run in a sub-process
        for job in jobs.jobs:
            taskDef = job.qnode.taskDef
            if not taskDef.taskClass.canMultiprocess:
                raise MPGraphExecutorError(f"Task {taskDef.taskName} does not support multiprocessing;"
                                           " use single process")

        finished, failed = 0, 0
        while jobs.pending() or jobs.running():

            _LOG.debug("#pendingJobs: %s", len(jobs.pending()))
            _LOG.debug("#runningJobs: %s", len(jobs.running()))

            # See if any jobs have finished
            for job in jobs.running():
                if not job.process.is_alive():
                    _LOG.debug("finished: %s", job)
                    # finished
                    exitcode = job.process.exitcode
                    if exitcode == 0:
                        job.state = JobState.FINISHED
                        job.cleanup()
                        _LOG.debug("success: %s took %.3f seconds", job, time.time() - job.started)
                    else:
                        job.state = JobState.FAILED
                        job.cleanup()
                        _LOG.debug("failed: %s", job)
                        if self.failFast:
                            for stopJob in jobs.running():
                                if stopJob is not job:
                                    stopJob.stop()
                            raise MPGraphExecutorError(
                                f"Task {job} failed, exit code={exitcode}."
                            )
                        else:
                            _LOG.error(
                                "Task %s failed; processing will continue for remaining tasks.", job
                            )
                else:
                    # Check for timeout
                    now = time.time()
                    if now - job.started > self.timeout:
                        job.state = JobState.TIMED_OUT
                        _LOG.debug("Terminating job %s due to timeout", job)
                        job.stop()
                        job.cleanup()
                        if self.failFast:
                            raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
                        else:
                            _LOG.error(
                                "Timeout (%s sec) for task %s; task is killed, processing continues "
                                "for remaining tasks.", self.timeout, job
                            )

            # See if we can start more jobs
            for job in jobs.pending():

                # Check all dependencies
                if graph.determineInputsToQuantumNode(job.qnode) & jobs.failedNodes():
                    # An upstream job has failed, skip this one
                    job.state = JobState.FAILED_DEP
                    _LOG.error("Upstream job failed for task %s, skipping this task.", job)
                elif graph.determineInputsToQuantumNode(job.qnode) <= jobs.finishedNodes():
                    # ``<=`` is a subset test: all dependencies have
                    # completed, so a new job can start
                    if len(jobs.running()) < self.numProc:
                        _LOG.debug("Submitting %s", job)
                        job.start(butler, self.quantumExecutor, self.startMethod)

            # Do cleanup for timed out jobs if necessary.
            jobs.cleanup()

            # Print progress message if something changed.
            newFinished, newFailed = len(jobs.finishedNodes()), len(jobs.failedNodes())
            if (finished, failed) != (newFinished, newFailed):
                finished, failed = newFinished, newFailed
                totalCount = len(jobs.jobs)
                _LOG.info("Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
                          finished, failed, totalCount - finished - failed, totalCount)

            # Here we want to wait until one of the running jobs completes,
            # but multiprocessing does not provide an API for that; for now
            # just sleep a little bit and go back to the loop.
            if jobs.running():
                time.sleep(0.1)

        if jobs.failedNodes():
            # Print list of failed jobs
            _LOG.error("Failed jobs:")
            for job in jobs.jobs:
                if job.state != JobState.FINISHED:
                    _LOG.error(" - %s: %s", job.state, job)

            # If any job failed, raise an exception
            if jobs.failedNodes() == jobs.timedOutNodes():
                raise MPTimeoutError("One or more tasks timed out during execution.")
            else:
                raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")