Coverage for python/lsst/ctrl/mpexec/mpGraphExecutor.py : 16%

# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

__all__ = ["MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError"]

# -------------------------------
#  Imports of standard modules --
# -------------------------------
from enum import Enum
import logging
import multiprocessing
import pickle
import sys
import time

from lsst.pipe.base.graph.graph import QuantumGraph

# -----------------------------
#  Imports for other modules --
# -----------------------------
from .quantumGraphExecutor import QuantumGraphExecutor
from lsst.base import disableImplicitThreading
from lsst.daf.butler.cli.cliLog import CliLog

_LOG = logging.getLogger(__name__.partition(".")[2])
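# Note: partitioning ``__name__`` on the first dot strips the top-level
# package component, so the logger is named e.g. "ctrl.mpexec.mpGraphExecutor"
# rather than "lsst.ctrl.mpexec.mpGraphExecutor".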

# Possible states for the executing task:
#  - PENDING: job has not started yet
#  - RUNNING: job is currently executing
#  - FINISHED: job finished successfully
#  - FAILED: job execution failed (process returned non-zero status)
#  - TIMED_OUT: job was killed because it exceeded the execution timeout
#  - FAILED_DEP: one of the dependencies of this job has failed/timed out
JobState = Enum("JobState", "PENDING RUNNING FINISHED FAILED TIMED_OUT FAILED_DEP")
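# Typical transitions, as driven by MPGraphExecutor._executeQuantaMP below:
# PENDING -> RUNNING when a job's process is started; RUNNING -> FINISHED or
# FAILED depending on the process exit code; RUNNING -> TIMED_OUT when the
# configured timeout is exceeded; and PENDING -> FAILED_DEP when an upstream
# quantum has failed or timed out.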

class _Job:
    """Class representing a job running a single task.

    Parameters
    ----------
    qnode : `~lsst.pipe.base.QuantumNode`
        Quantum and some associated information.
    """
    def __init__(self, qnode):
        self.qnode = qnode
        self.process = None
        self.state = JobState.PENDING
        self.started = None

    def start(self, butler, quantumExecutor, startMethod=None):
        """Start a process which runs the task.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        quantumExecutor : `QuantumExecutor`
            Executor for a single quantum.
        startMethod : `str`, optional
            Start method from the `multiprocessing` module.
        """
        # Butler can have live database connections, which is a problem with
        # fork-type activation. Make a pickle of the butler to pass it across
        # the fork. Unpickling of the quantum has to happen after the butler,
        # which is why it is pickled manually here.
        butler_pickle = pickle.dumps(butler)
        quantum_pickle = pickle.dumps(self.qnode.quantum)
        taskDef = self.qnode.taskDef
        logConfigState = CliLog.configState
        mp_ctx = multiprocessing.get_context(startMethod)
        self.process = mp_ctx.Process(
            target=_Job._executeJob,
            args=(quantumExecutor, taskDef, quantum_pickle, butler_pickle, logConfigState),
            name=f"task-{self.qnode.nodeId.number}"
        )
        self.process.start()
        self.started = time.time()
        self.state = JobState.RUNNING

    @staticmethod
    def _executeJob(quantumExecutor, taskDef, quantum_pickle, butler_pickle, logConfigState):
        """Execute a job with arguments.

        Parameters
        ----------
        quantumExecutor : `QuantumExecutor`
            Executor for a single quantum.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        quantum_pickle : `bytes`
            Quantum for this task execution in pickled form.
        butler_pickle : `bytes`
            Data butler instance in pickled form.
        logConfigState : `list`
            Logging configuration state captured in the parent process.
        """
        if logConfigState and not CliLog.configState:
            # This means we are in a newly spawned Python process and have to
            # re-initialize logging.
            CliLog.replayConfigState(logConfigState)

        butler = pickle.loads(butler_pickle)
        quantum = pickle.loads(quantum_pickle)
        quantumExecutor.execute(taskDef, quantum, butler)

    def stop(self):
        """Stop the process."""
        self.process.terminate()
        # Give it one second to finish, otherwise kill it.
        for i in range(10):
            time.sleep(0.1)
            if not self.process.is_alive():
                break
        else:
            _LOG.debug("Killing process %s", self.process.name)
            self.process.kill()

    def cleanup(self):
        """Release process resources; has to be called for each finished
        process.
        """
        if self.process and not self.process.is_alive():
            self.process.close()
            self.process = None

    def __str__(self):
        return f"<{self.qnode.taskDef} dataId={self.qnode.quantum.dataId}>"

class _JobList:
    """Simple list of _Job instances with a few convenience methods.

    Parameters
    ----------
    iterable : iterable of `~lsst.pipe.base.QuantumNode`
        Sequence of quanta to execute. This has to be ordered according to
        task dependencies.
    """
    def __init__(self, iterable):
        self.jobs = [_Job(qnode) for qnode in iterable]

    def pending(self):
        """Return the list of jobs that are waiting for execution.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return [job for job in self.jobs if job.state == JobState.PENDING]

    def running(self):
        """Return the list of jobs that are currently executing.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return [job for job in self.jobs if job.state == JobState.RUNNING]

    def finishedNodes(self):
        """Return the set of QuantumNodes that finished successfully (did not fail).

        Returns
        -------
        nodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes that have successfully finished.
        """
        return set(job.qnode for job in self.jobs if job.state == JobState.FINISHED)

    def failedNodes(self):
        """Return the set of QuantumNodes that failed for any reason.

        Returns
        -------
        nodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes that failed during processing.
        """
        return set(job.qnode for job in self.jobs
                   if job.state in (JobState.FAILED, JobState.FAILED_DEP, JobState.TIMED_OUT))

    def timedOutIds(self):
        """Return the set of QuantumNodes whose jobs timed out.

        Returns
        -------
        nodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes that timed out during processing.
        """
        return set(job.qnode for job in self.jobs if job.state == JobState.TIMED_OUT)

    def cleanup(self):
        """Do periodic cleanup for jobs that did not finish correctly.

        If timed-out jobs are killed but take too long to stop, the regular
        cleanup does not work for them. Here we check all timed-out jobs
        periodically and do the cleanup if they have managed to die by this
        time.
        """
        for job in self.jobs:
            if job.state == JobState.TIMED_OUT and job.process is not None:
                job.cleanup()

class MPGraphExecutorError(Exception):
    """Exception class for errors raised by MPGraphExecutor."""
    pass


class MPTimeoutError(MPGraphExecutorError):
    """Exception raised when task execution times out."""
    pass

class MPGraphExecutor(QuantumGraphExecutor):
    """Implementation of QuantumGraphExecutor using same-host multiprocess
    execution of Quanta.

    Parameters
    ----------
    numProc : `int`
        Number of processes to use for executing tasks.
    timeout : `float`
        Time in seconds to wait for tasks to finish.
    quantumExecutor : `QuantumExecutor`
        Executor for a single quantum. For multiprocess-style execution when
        ``numProc`` is greater than one this instance must support pickle.
    startMethod : `str`, optional
        Start method from the `multiprocessing` module; `None` selects the
        best one for the current platform.
    failFast : `bool`, optional
        If set to ``True`` then stop processing on the first error from any
        task.
    executionGraphFixup : `ExecutionGraphFixup`, optional
        Instance used for modification of the execution graph.
    """
    def __init__(self, numProc, timeout, quantumExecutor, *,
                 startMethod=None, failFast=False, executionGraphFixup=None):
        self.numProc = numProc
        self.timeout = timeout
        self.quantumExecutor = quantumExecutor
        self.failFast = failFast
        self.executionGraphFixup = executionGraphFixup

        # We set the default start method to "spawn" for macOS and "fork" for
        # Linux; None for all other platforms, which uses the multiprocessing
        # default.
        if startMethod is None:
            methods = dict(linux="fork", darwin="spawn")
            startMethod = methods.get(sys.platform)
        self.startMethod = startMethod
        _LOG.info("Using %r for multiprocessing start method", self.startMethod)
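
    # A minimal usage sketch (illustrative only; ``myQuantumExecutor``,
    # ``myGraph`` and ``myButler`` are hypothetical objects supplied by the
    # caller, not names defined in this module):
    #
    #     executor = MPGraphExecutor(numProc=4, timeout=3600,
    #                                quantumExecutor=myQuantumExecutor)
    #     executor.execute(myGraph, myButler)
    #
    # With ``numProc=1`` quanta run sequentially in the current process; with
    # more than one process each quantum is executed in its own subprocess.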

    def execute(self, graph, butler):
        # Docstring inherited from QuantumGraphExecutor.execute
        graph = self._fixupQuanta(graph)
        if self.numProc > 1:
            self._executeQuantaMP(graph, butler)
        else:
            self._executeQuantaInProcess(graph, butler)

    def _fixupQuanta(self, graph: QuantumGraph):
        """Call fixup code to modify the execution graph.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` to modify.

        Returns
        -------
        graph : `QuantumGraph`
            Modified `QuantumGraph`.

        Raises
        ------
        MPGraphExecutorError
            Raised if the execution graph cannot be ordered after
            modification, i.e. it has dependency cycles.
        """
        if not self.executionGraphFixup:
            return graph

        _LOG.debug("Call execution graph fixup method")
        graph = self.executionGraphFixup.fixupQuanta(graph)

        # Detect whether a cycle has now been created within the graph.
        if graph.findCycle():
            raise MPGraphExecutorError(
                "Updated execution graph has dependency cycle.")

        return graph
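
    # A sketch of what a fixup hook might look like (hypothetical subclass;
    # the only interface assumed here is the ``fixupQuanta`` method called
    # above, which accepts a QuantumGraph and returns a possibly modified one):
    #
    #     class MyFixup(ExecutionGraphFixup):
    #         def fixupQuanta(self, graph):
    #             # e.g. add extra ordering constraints between quanta
    #             return graph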

    def _executeQuantaInProcess(self, graph, butler):
        """Execute all Quanta in the current process.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed.
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        """
        # Note that in the non-MP case any failed task will generate an
        # exception and kill the whole thing. In general we cannot guarantee
        # exception safety, so the easiest and safest thing is to let it die.
        count, totalCount = 0, len(graph)
        for qnode in graph:
            _LOG.debug("Executing %s", qnode)
            self.quantumExecutor.execute(qnode.taskDef, qnode.quantum, butler)
            count += 1
            _LOG.info("Executed %d quanta, %d remain out of total %d quanta.",
                      count, totalCount - count, totalCount)

    def _executeQuantaMP(self, graph, butler):
        """Execute all Quanta in separate processes.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed.
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        """
        disableImplicitThreading()  # To prevent thread contention

        # Re-pack the input quantum data into a jobs list.
        jobs = _JobList(graph)

        # Check that all tasks can run in a sub-process.
        for job in jobs.jobs:
            taskDef = job.qnode.taskDef
            if not taskDef.taskClass.canMultiprocess:
                raise MPGraphExecutorError(f"Task {taskDef.taskName} does not support multiprocessing;"
                                           " use single process")

        finished, failed = 0, 0
        while jobs.pending() or jobs.running():

            _LOG.debug("#pendingJobs: %s", len(jobs.pending()))
            _LOG.debug("#runningJobs: %s", len(jobs.running()))

            # See if any jobs have finished.
            for job in jobs.running():
                if not job.process.is_alive():
                    _LOG.debug("finished: %s", job)
                    # finished
                    exitcode = job.process.exitcode
                    if exitcode == 0:
                        job.state = JobState.FINISHED
                        job.cleanup()
                        _LOG.debug("success: %s took %.3f seconds", job, time.time() - job.started)
                    else:
                        job.state = JobState.FAILED
                        job.cleanup()
                        _LOG.debug("failed: %s", job)
                        if self.failFast:
                            for stopJob in jobs.running():
                                if stopJob is not job:
                                    stopJob.stop()
                            raise MPGraphExecutorError(
                                f"Task {job} failed, exit code={exitcode}."
                            )
                        else:
                            _LOG.error(
                                "Task %s failed; processing will continue for remaining tasks.", job
                            )
                else:
                    # Check for timeout.
                    now = time.time()
                    if now - job.started > self.timeout:
                        job.state = JobState.TIMED_OUT
                        _LOG.debug("Terminating job %s due to timeout", job)
                        job.stop()
                        job.cleanup()
                        if self.failFast:
                            raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
                        else:
                            _LOG.error(
                                "Timeout (%s sec) for task %s; task is killed, processing continues "
                                "for remaining tasks.", self.timeout, job
                            )

            # See if we can start more jobs.
            for job in jobs.pending():

                # Check all dependencies.
                if graph.determineInputsToQuantumNode(job.qnode) & jobs.failedNodes():
                    # An upstream job has failed; skip this one.
                    job.state = JobState.FAILED_DEP
                    _LOG.error("Upstream job failed for task %s, skipping this task.", job)
                elif graph.determineInputsToQuantumNode(job.qnode) <= jobs.finishedNodes():
                    # All dependencies have completed; we can start a new job.
                    if len(jobs.running()) < self.numProc:
                        _LOG.debug("Submitting %s", job)
                        job.start(butler, self.quantumExecutor, self.startMethod)

            # Do cleanup for timed-out jobs if necessary.
            jobs.cleanup()

            # Print a progress message if something changed.
            newFinished, newFailed = len(jobs.finishedNodes()), len(jobs.failedNodes())
            if (finished, failed) != (newFinished, newFailed):
                finished, failed = newFinished, newFailed
                totalCount = len(jobs.jobs)
                _LOG.info("Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
                          finished, failed, totalCount - finished - failed, totalCount)

            # Here we want to wait until one of the running jobs completes,
            # but multiprocessing does not provide an API for that; for now
            # just sleep a little bit and go back to the loop.
            if jobs.running():
                time.sleep(0.1)

        if jobs.failedNodes():
            # Print the list of failed jobs.
            _LOG.error("Failed jobs:")
            for job in jobs.jobs:
                if job.state != JobState.FINISHED:
                    _LOG.error(" - %s: %s", job.state, job)

            # If any job failed, raise an exception.
            if jobs.failedNodes() == jobs.timedOutIds():
                raise MPTimeoutError("One or more tasks timed out during execution.")
            else:
                raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")