Coverage for python/lsst/ctrl/mpexec/mpGraphExecutor.py: 16%

# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ["MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError"]

# -------------------------------
#  Imports of standard modules --
# -------------------------------
from enum import Enum
import logging
import multiprocessing
import pickle
import time

from lsst.pipe.base.graph.graph import QuantumGraph

# -----------------------------
#  Imports for other modules --
# -----------------------------
from .quantumGraphExecutor import QuantumGraphExecutor
from lsst.base import disableImplicitThreading
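
# partition(".")[2] drops the leading "lsst." package component, so log
# messages appear under "ctrl.mpexec.mpGraphExecutor".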
_LOG = logging.getLogger(__name__.partition(".")[2])


# Possible states for the executing task:
#  - PENDING: job has not started yet
#  - RUNNING: job is currently executing
#  - FAILED: job execution failed (process returned non-zero status)
#  - FINISHED: job finished successfully
#  - TIMED_OUT: job was killed because its execution time exceeded the timeout
#  - FAILED_DEP: one of the dependencies of this job has failed/timed out
JobState = Enum("JobState", "PENDING RUNNING FINISHED FAILED TIMED_OUT FAILED_DEP")
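# State transitions (as implemented below): PENDING -> RUNNING in _Job.start();
# RUNNING -> FINISHED/FAILED/TIMED_OUT in the MP loop; PENDING -> FAILED_DEP
# when an upstream job fails or times out.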


class _Job:
    """Class representing a job running a single task.

    Parameters
    ----------
    qnode : `~lsst.pipe.base.QuantumNode`
        Quantum and some associated information.
    """
    def __init__(self, qnode):
        self.qnode = qnode
        self.process = None
        self.state = JobState.PENDING
        self.started = None
        self.index = qnode.nodeId.number
        self.taskDef = qnode.taskDef

    def start(self, butler, quantumExecutor):
        """Start a process which runs the task.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        quantumExecutor : `QuantumExecutor`
            Executor for a single quantum.
        """
        # Butler can have live database connections, which are a problem with
        # fork-type activation. Pickle the butler to pass it across the fork.
        butler_pickle = pickle.dumps(butler)
        taskDef = self.taskDef
        quantum = self.qnode.quantum
        # Use the fork start method for multiprocessing on all platforms
        mp_ctx = multiprocessing.get_context("fork")
        self.process = mp_ctx.Process(
            target=self._executeJob,
            args=(quantumExecutor, taskDef, quantum, butler_pickle),
            name=f"task-{self.index}"
        )
        self.process.start()
        self.started = time.time()
        self.state = JobState.RUNNING

    def _executeJob(self, quantumExecutor, taskDef, quantum, butler_pickle):
        """Execute a job with arguments.

        Parameters
        ----------
        quantumExecutor : `QuantumExecutor`
            Executor for a single quantum.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        quantum : `~lsst.daf.butler.Quantum`
            Quantum for this task execution.
        butler_pickle : `bytes`
            Data butler instance in pickled form.
        """
        butler = pickle.loads(butler_pickle)
        quantumExecutor.execute(taskDef, quantum, butler)

    def stop(self):
        """Stop the process.
        """
        self.process.terminate()
        # Give it one second to finish, then kill it
        for i in range(10):
            time.sleep(0.1)
            if not self.process.is_alive():
                break
        else:
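            # The for-else branch runs only if the loop never hit ``break``,
            # i.e. the process is still alive after the grace period.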
            _LOG.debug("Killing process %s", self.process.name)
            self.process.kill()

    def cleanup(self):
        """Release process resources; has to be called for each finished
        process.
        """
        if self.process and not self.process.is_alive():
            self.process.close()
            self.process = None

    def __str__(self):
        return f"<{self.qnode.taskDef} dataId={self.qnode.quantum.dataId}>"


class _JobList:
    """Simple list of _Job instances with a few convenience methods.

    Parameters
    ----------
    iterable : iterable of `~lsst.pipe.base.QuantumNode`
        Sequence of Quanta to execute. This has to be ordered according to
        task dependencies.
    """
    def __init__(self, iterable):
        self.jobs = [_Job(qnode) for qnode in iterable]

    def pending(self):
        """Return the list of jobs that are waiting for execution.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return [job for job in self.jobs if job.state == JobState.PENDING]

    def running(self):
        """Return the list of jobs that are currently executing.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return [job for job in self.jobs if job.state == JobState.RUNNING]

    def finishedNodes(self):
        """Return the set of QuantumNodes that finished successfully (not failed).

        Returns
        -------
        QuantumNodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes that have finished successfully.
        """
        return set(job.qnode for job in self.jobs if job.state == JobState.FINISHED)

    def failedNodes(self):
        """Return the set of QuantumNodes that failed for any reason.

        Returns
        -------
        QuantumNodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes that failed during processing.
        """
        return set(job.qnode for job in self.jobs
                   if job.state in (JobState.FAILED, JobState.FAILED_DEP, JobState.TIMED_OUT))

    def timedOutIds(self):
        """Return the set of QuantumNodes whose jobs timed out.

        Returns
        -------
        QuantumNodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes that timed out during processing.
        """
        return set(job.qnode for job in self.jobs if job.state == JobState.TIMED_OUT)

    def cleanup(self):
        """Do periodic cleanup for jobs that did not finish correctly.

        If a timed-out job was killed but takes too long to stop, the regular
        cleanup will not work for it. Here we check all timed-out jobs
        periodically and clean up those that have managed to die by this time.
        """
        for job in self.jobs:
            if job.state == JobState.TIMED_OUT and job.process is not None:
                job.cleanup()


class MPGraphExecutorError(Exception):
    """Exception class for errors raised by MPGraphExecutor.
    """
    pass


class MPTimeoutError(MPGraphExecutorError):
    """Exception raised when task execution times out.
    """
    pass


class MPGraphExecutor(QuantumGraphExecutor):
    """Implementation of QuantumGraphExecutor using same-host multiprocess
    execution of Quanta.

    Parameters
    ----------
    numProc : `int`
        Number of processes to use for executing tasks.
    timeout : `float`
        Time in seconds to wait for a task to finish.
    quantumExecutor : `QuantumExecutor`
        Executor for a single quantum. For multiprocess-style execution when
        ``numProc`` is greater than one this instance must support pickle.
    failFast : `bool`, optional
        If set to ``True`` then stop processing on the first error from any task.
    executionGraphFixup : `ExecutionGraphFixup`, optional
        Instance used for modification of the execution graph.
    """
    def __init__(self, numProc, timeout, quantumExecutor, *, failFast=False, executionGraphFixup=None):
        self.numProc = numProc
        self.timeout = timeout
        self.quantumExecutor = quantumExecutor
        self.failFast = failFast
        self.executionGraphFixup = executionGraphFixup

    def execute(self, graph, butler):
        # Docstring inherited from QuantumGraphExecutor.execute
        graph = self._fixupQuanta(graph)
        if self.numProc > 1:
            self._executeQuantaMP(graph, butler)
        else:
            self._executeQuantaInProcess(graph, butler)

    def _fixupQuanta(self, graph: QuantumGraph):
        """Call fixup code to modify the execution graph.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` to modify.

        Returns
        -------
        graph : `QuantumGraph`
            Modified `QuantumGraph`.

        Raises
        ------
        MPGraphExecutorError
            Raised if the execution graph cannot be ordered after
            modification, i.e. it has dependency cycles.
        """
        if not self.executionGraphFixup:
            return graph

        _LOG.debug("Call execution graph fixup method")
        graph = self.executionGraphFixup.fixupQuanta(graph)

        # Detect whether the fixup created a cycle within the graph
        if graph.findCycle():
            raise MPGraphExecutorError(
                "Updated execution graph has dependency cycle.")

        return graph

    def _executeQuantaInProcess(self, graph, butler):
        """Execute all Quanta in the current process.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed.
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        """
        # Note that in the non-MP case any failed task will generate an
        # exception and kill the whole thing. In general we cannot guarantee
        # exception safety, so the easiest and safest thing is to let it die.
        count, totalCount = 0, len(graph)
        for qnode in graph:
            _LOG.debug("Executing %s", qnode)
            self.quantumExecutor.execute(qnode.taskDef, qnode.quantum, butler)
            count += 1
            _LOG.info("Executed %d quanta, %d remain out of total %d quanta.",
                      count, totalCount - count, totalCount)

    def _executeQuantaMP(self, graph, butler):
        """Execute all Quanta in separate processes.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed.
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        """
        disableImplicitThreading()  # To prevent thread contention

        # Re-pack the input quantum data into a jobs list
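        # _JobList requires its input to be ordered by task dependencies;
        # iterating the QuantumGraph directly is assumed to yield nodes in
        # such an order (see the _JobList docstring).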
        jobs = _JobList(graph)

        # Check that all tasks can run in a sub-process
        for job in jobs.jobs:
            taskDef = job.taskDef
            if not taskDef.taskClass.canMultiprocess:
                raise MPGraphExecutorError(f"Task {taskDef.taskName} does not support multiprocessing;"
                                           " use single process")

        finished, failed = 0, 0
        while jobs.pending() or jobs.running():

            _LOG.debug("#pendingJobs: %s", len(jobs.pending()))
            _LOG.debug("#runningJobs: %s", len(jobs.running()))

            # See if any jobs have finished
            for job in jobs.running():
                if not job.process.is_alive():
                    _LOG.debug("finished: %s", job)
                    # finished
                    exitcode = job.process.exitcode
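                    # A zero exitcode means success; a positive value is the
                    # process's exit status, and a negative value means it was
                    # terminated by a signal.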
                    if exitcode == 0:
                        job.state = JobState.FINISHED
                        job.cleanup()
                        _LOG.debug("success: %s", job)
                    else:
                        job.state = JobState.FAILED
                        job.cleanup()
                        _LOG.debug("failed: %s", job)
                        if self.failFast:
                            for stopJob in jobs.running():
                                if stopJob is not job:
                                    stopJob.stop()
                            raise MPGraphExecutorError(
                                f"Task {job} failed, exit code={exitcode}."
                            )
                        else:
                            _LOG.error(
                                "Task %s failed; processing will continue for remaining tasks.", job
                            )
                else:
                    # Check for timeout
                    now = time.time()
                    if now - job.started > self.timeout:
                        job.state = JobState.TIMED_OUT
                        _LOG.debug("Terminating job %s due to timeout", job)
                        job.stop()
                        job.cleanup()
                        if self.failFast:
                            raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
                        else:
                            _LOG.error(
                                "Timeout (%s sec) for task %s; task is killed, processing continues "
                                "for remaining tasks.", self.timeout, job
                            )

            # See if we can start more jobs
            for job in jobs.pending():

                # Check all dependencies
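                # determineInputsToQuantumNode returns the upstream
                # QuantumNodes feeding this node; a non-empty intersection
                # with the failed set means a dependency failed, while being
                # a subset of the finished set means all dependencies are done.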
                if graph.determineInputsToQuantumNode(job.qnode) & jobs.failedNodes():
                    # An upstream job has failed, skip this one
                    job.state = JobState.FAILED_DEP
                    _LOG.error("Upstream job failed for task %s, skipping this task.", job)
                elif graph.determineInputsToQuantumNode(job.qnode) <= jobs.finishedNodes():
                    # All dependencies have completed, we can start a new job
                    if len(jobs.running()) < self.numProc:
                        _LOG.debug("Submitting %s", job)
                        job.start(butler, self.quantumExecutor)

            # Do cleanup for timed-out jobs if necessary.
            jobs.cleanup()

            # Print a progress message if something changed.
            newFinished, newFailed = len(jobs.finishedNodes()), len(jobs.failedNodes())
            if (finished, failed) != (newFinished, newFailed):
                finished, failed = newFinished, newFailed
                totalCount = len(jobs.jobs)
                _LOG.info("Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
                          finished, failed, totalCount - finished - failed, totalCount)

            # Here we want to wait until one of the running jobs completes,
            # but multiprocessing does not provide an API for that; for now
            # just sleep a little bit and go back through the loop.
            if jobs.running():
                time.sleep(0.1)

        if jobs.failedNodes():
            # Print the list of failed jobs
            _LOG.error("Failed jobs:")
            for job in jobs.jobs:
                if job.state != JobState.FINISHED:
                    _LOG.error("  - %s: %s", job.state, job)

            # If any job failed, raise an exception; raise MPTimeoutError
            # only when every failure was a timeout.
            if jobs.failedNodes() == jobs.timedOutIds():
                raise MPTimeoutError("One or more tasks timed out during execution.")
            else:
                raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")