Coverage for python/lsst/ctrl/mpexec/mpGraphExecutor.py : 16%

# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ["MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError"]

# -------------------------------
#  Imports of standard modules --
# -------------------------------
from enum import Enum
import logging
import multiprocessing
import pickle
import time

from lsst.pipe.base.graph.graph import QuantumGraph

# -----------------------------
#  Imports for other modules --
# -----------------------------
from .quantumGraphExecutor import QuantumGraphExecutor
from lsst.base import disableImplicitThreading

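# The logger name drops the leading "lsst." component of the module path
# (``__name__.partition(".")[2]``), e.g. "ctrl.mpexec.mpGraphExecutor".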
_LOG = logging.getLogger(__name__.partition(".")[2])

# Possible states for the executing task:
# - PENDING: job has not started yet
# - RUNNING: job is currently executing
# - FINISHED: job finished successfully
# - FAILED: job execution failed (process returned non-zero status)
# - TIMED_OUT: job was killed because it exceeded the execution time limit
# - FAILED_DEP: one of the dependencies of this job has failed/timed out
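# State transitions, as implemented by the code below: PENDING -> RUNNING ->
# {FINISHED, FAILED, TIMED_OUT}, and PENDING -> FAILED_DEP when an upstream
# job fails or times out.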
JobState = Enum("JobState", "PENDING RUNNING FINISHED FAILED TIMED_OUT FAILED_DEP")


class _Job:
    """Class representing a job running a single task.

    Parameters
    ----------
    qnode : `~lsst.pipe.base.QuantumNode`
        Quantum and some associated information.
    """
    def __init__(self, qnode):
        self.qnode = qnode
        self.process = None
        self.state = JobState.PENDING
        self.started = None
        self.index = qnode.nodeId.number
        self.taskDef = qnode.taskDef

    def start(self, butler, quantumExecutor):
        """Start a process which runs the task.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        quantumExecutor : `QuantumExecutor`
            Executor for a single quantum.
        """
        # Butler can have live database connections, which is a problem with
        # fork-type process start. Pickle the butler here and pass the bytes
        # across the fork; the child process unpickles it in _executeJob.
        butler_pickle = pickle.dumps(butler)
        taskDef = self.taskDef
        quantum = self.qnode.quantum
        self.process = multiprocessing.Process(
            target=self._executeJob,
            args=(quantumExecutor, taskDef, quantum, butler_pickle),
            name=f"task-{self.index}"
        )
        self.process.start()
        self.started = time.time()
        self.state = JobState.RUNNING

    def _executeJob(self, quantumExecutor, taskDef, quantum, butler_pickle):
        """Execute a job with arguments.

        Parameters
        ----------
        quantumExecutor : `QuantumExecutor`
            Executor for a single quantum.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        quantum : `~lsst.daf.butler.Quantum`
            Quantum for this task execution.
        butler_pickle : `bytes`
            Data butler instance in pickled form.
        """
        butler = pickle.loads(butler_pickle)
        quantumExecutor.execute(taskDef, quantum, butler)

    def stop(self):
        """Stop the process.
        """
        self.process.terminate()
        # Give it one second to terminate gracefully, then kill it.
        for i in range(10):
            time.sleep(0.1)
            if not self.process.is_alive():
                break
        else:
            _LOG.debug("Killing process %s", self.process.name)
            self.process.kill()

    def cleanup(self):
        """Release process resources; has to be called for each finished
        process.
        """
        if self.process and not self.process.is_alive():
            self.process.close()
            self.process = None

    def __str__(self):
        return f"<{self.qnode.taskDef} dataId={self.qnode.quantum.dataId}>"


class _JobList:
    """Simple list of _Job instances with a few convenience methods.

    Parameters
    ----------
    iterable : iterable of `~lsst.pipe.base.QuantumNode`
        Sequence of Quanta to execute. This has to be ordered according to
        task dependencies.
    """
    def __init__(self, iterable):
        self.jobs = [_Job(qnode) for qnode in iterable]

    def pending(self):
        """Return the list of jobs that are waiting for execution.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return [job for job in self.jobs if job.state == JobState.PENDING]

    def running(self):
        """Return the list of jobs that are executing.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return [job for job in self.jobs if job.state == JobState.RUNNING]

    def finishedNodes(self):
        """Return the set of QuantumNodes that finished successfully.

        Returns
        -------
        nodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes that have successfully finished.
        """
        return set(job.qnode for job in self.jobs if job.state == JobState.FINISHED)

    def failedNodes(self):
        """Return the set of QuantumNodes that failed for any reason.

        Returns
        -------
        nodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes that failed during processing, including
            timed-out jobs and jobs skipped because of failed dependencies.
        """
        return set(job.qnode for job in self.jobs
                   if job.state in (JobState.FAILED, JobState.FAILED_DEP, JobState.TIMED_OUT))

    def timedOutIds(self):
        """Return the set of QuantumNodes whose jobs timed out.

        Returns
        -------
        nodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes whose jobs were killed due to timeout.
        """
        return set(job.qnode for job in self.jobs if job.state == JobState.TIMED_OUT)

    def cleanup(self):
        """Do periodic cleanup for jobs that did not finish correctly.

        If a timed-out job was killed but takes too long to stop, the regular
        cleanup at kill time does not work for it. Here we check all timed-out
        jobs periodically and clean up those that have managed to die by now.
        """
        for job in self.jobs:
            if job.state == JobState.TIMED_OUT and job.process is not None:
                job.cleanup()


class MPGraphExecutorError(Exception):
    """Exception class for errors raised by MPGraphExecutor.
    """
    pass


class MPTimeoutError(MPGraphExecutorError):
    """Exception raised when task execution times out.
    """
    pass


class MPGraphExecutor(QuantumGraphExecutor):
    """Implementation of QuantumGraphExecutor using same-host multiprocess
    execution of Quanta.

    Parameters
    ----------
    numProc : `int`
        Number of processes to use for executing tasks.
    timeout : `float`
        Time in seconds to wait for tasks to finish.
    quantumExecutor : `QuantumExecutor`
        Executor for a single quantum. For multiprocess-style execution when
        ``numProc`` is greater than one this instance must support pickle.
    failFast : `bool`, optional
        If set to ``True`` then stop processing on the first error from any
        task.
    executionGraphFixup : `ExecutionGraphFixup`, optional
        Instance used for modification of the execution graph.
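
    Examples
    --------
    A minimal usage sketch; ``quantumExecutor``, ``graph``, and ``butler``
    are assumed to already exist, and the values shown are illustrative::

        executor = MPGraphExecutor(numProc=4, timeout=3600,
                                   quantumExecutor=quantumExecutor)
        executor.execute(graph, butler)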
    """
    def __init__(self, numProc, timeout, quantumExecutor, *, failFast=False, executionGraphFixup=None):
        self.numProc = numProc
        self.timeout = timeout
        self.quantumExecutor = quantumExecutor
        self.failFast = failFast
        self.executionGraphFixup = executionGraphFixup

    def execute(self, graph, butler):
        # Docstring inherited from QuantumGraphExecutor.execute
        graph = self._fixupQuanta(graph)
        if self.numProc > 1:
            self._executeQuantaMP(graph, butler)
        else:
            self._executeQuantaInProcess(graph, butler)

    def _fixupQuanta(self, graph: QuantumGraph):
        """Call fixup code to modify the execution graph.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` to modify.

        Returns
        -------
        graph : `QuantumGraph`
            Modified `QuantumGraph`.

        Raises
        ------
        MPGraphExecutorError
            Raised if the execution graph cannot be ordered after
            modification, i.e. it has dependency cycles.
        """
        if not self.executionGraphFixup:
            return graph

        _LOG.debug("Call execution graph fixup method")
        graph = self.executionGraphFixup.fixupQuanta(graph)

        # Detect if the fixup created a cycle within the graph.
        if graph.findCycle():
            raise MPGraphExecutorError(
                "Updated execution graph has dependency cycle.")

        return graph

    def _executeQuantaInProcess(self, graph, butler):
        """Execute all Quanta in the current process.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed.
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        """
        # Note that in the non-MP case any failed task will generate an
        # exception and kill the whole thing. In general we cannot guarantee
        # exception safety, so the easiest and safest thing is to let it die.
        count, totalCount = 0, len(graph)
        for qnode in graph:
            _LOG.debug("Executing %s", qnode)
            self.quantumExecutor.execute(qnode.taskDef, qnode.quantum, butler)
            count += 1
            _LOG.info("Executed %d quanta, %d remain out of total %d quanta.",
                      count, totalCount - count, totalCount)

    def _executeQuantaMP(self, graph, butler):
        """Execute all Quanta in separate processes.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed.
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        """
        disableImplicitThreading()  # To prevent thread contention

        # Re-pack input quantum data into a list of jobs.
        jobs = _JobList(graph)

        # Check that all tasks can run in a sub-process.
        for job in jobs.jobs:
            taskDef = job.taskDef
            if not taskDef.taskClass.canMultiprocess:
                raise MPGraphExecutorError(f"Task {taskDef.taskName} does not support multiprocessing;"
                                           " use single process")

        finished, failed = 0, 0
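
        # Main scheduling loop: reap finished processes, enforce the timeout
        # on running jobs, skip pending jobs whose dependencies failed, and
        # start pending jobs whose dependencies have all finished, until no
        # pending or running jobs remain.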
        while jobs.pending() or jobs.running():

            _LOG.debug("#pendingJobs: %s", len(jobs.pending()))
            _LOG.debug("#runningJobs: %s", len(jobs.running()))

            # See if any jobs have finished.
            for job in jobs.running():
                if not job.process.is_alive():
                    _LOG.debug("finished: %s", job)
                    # finished
                    exitcode = job.process.exitcode
                    if exitcode == 0:
                        job.state = JobState.FINISHED
                        job.cleanup()
                        _LOG.debug("success: %s", job)
                    else:
                        job.state = JobState.FAILED
                        job.cleanup()
                        _LOG.debug("failed: %s", job)
                        if self.failFast:
                            for stopJob in jobs.running():
                                if stopJob is not job:
                                    stopJob.stop()
                            raise MPGraphExecutorError(
                                f"Task {job} failed, exit code={exitcode}."
                            )
                        else:
                            _LOG.error(
                                "Task %s failed; processing will continue for remaining tasks.", job
                            )
                else:
                    # Check for timeout.
                    now = time.time()
                    if now - job.started > self.timeout:
                        job.state = JobState.TIMED_OUT
                        _LOG.debug("Terminating job %s due to timeout", job)
                        job.stop()
                        job.cleanup()
                        if self.failFast:
                            raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
                        else:
                            _LOG.error(
                                "Timeout (%s sec) for task %s; task is killed, processing continues "
                                "for remaining tasks.", self.timeout, job
                            )

            # See if we can start more jobs.
            for job in jobs.pending():

                # Check all dependencies.
                if graph.determineInputsToQuantumNode(job.qnode) & jobs.failedNodes():
                    # An upstream job has failed; skip this one.
                    job.state = JobState.FAILED_DEP
                    _LOG.error("Upstream job failed for task %s, skipping this task.", job)
                elif graph.determineInputsToQuantumNode(job.qnode) <= jobs.finishedNodes():
                    # All dependencies have completed; start a new job if
                    # there is a free process slot.
                    if len(jobs.running()) < self.numProc:
                        _LOG.debug("Submitting %s", job)
                        job.start(butler, self.quantumExecutor)

            # Do cleanup for timed-out jobs if necessary.
            jobs.cleanup()

            # Print a progress message if something changed.
            newFinished, newFailed = len(jobs.finishedNodes()), len(jobs.failedNodes())
            if (finished, failed) != (newFinished, newFailed):
                finished, failed = newFinished, newFailed
                totalCount = len(jobs.jobs)
                _LOG.info("Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
                          finished, failed, totalCount - finished - failed, totalCount)

            # Here we want to wait until one of the running jobs completes,
            # but multiprocessing does not provide a convenient API for that,
            # so for now just sleep a little bit and go back to the loop.
            if jobs.running():
                time.sleep(0.1)

        if jobs.failedNodes():
            # Print the list of failed jobs.
            _LOG.error("Failed jobs:")
            for job in jobs.jobs:
                if job.state != JobState.FINISHED:
                    _LOG.error(" - %s: %s", job.state, job)

            # If any job failed, raise an exception.
            if jobs.failedNodes() == jobs.timedOutIds():
                raise MPTimeoutError("One or more tasks timed out during execution.")
            else:
                raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")