Coverage for python/lsst/ctrl/mpexec/mpGraphExecutor.py : 17%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of ctrl_mpexec.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Names exported by this module.
__all__ = ["MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError"]
24# -------------------------------
25# Imports of standard modules --
26# -------------------------------
27from enum import Enum
28import logging
29import multiprocessing
30import pickle
31import time
33from lsst.pipe.base.graph.graph import QuantumGraph
35# -----------------------------
36# Imports for other modules --
37# -----------------------------
38from .quantumGraphExecutor import QuantumGraphExecutor
39from lsst.base import disableImplicitThreading
# Strip the leading package component from the logger name, e.g.
# "lsst.ctrl.mpexec.mpGraphExecutor" -> "ctrl.mpexec.mpGraphExecutor".
_LOG = logging.getLogger(__name__.partition(".")[2])


# Possible states for the executing task:
#  - PENDING: job has not started yet
#  - RUNNING: job is currently executing
#  - FINISHED: job finished successfully
#  - FAILED: job execution failed (process returned non-zero status)
#  - TIMED_OUT: job is killed due to too long execution time
#  - FAILED_DEP: one of the dependencies of this job has failed/timed out
JobState = Enum("JobState", "PENDING RUNNING FINISHED FAILED TIMED_OUT FAILED_DEP")
class _Job:
    """Class representing a job running single task.

    Parameters
    ----------
    qnode : `~lsst.pipe.base.QuantumNode`
        Quantum and some associated information.
    """
    def __init__(self, qnode):
        self.qnode = qnode
        self.process = None
        self.state = JobState.PENDING
        self.started = None
        self.index = qnode.nodeId.number
        self.taskDef = qnode.taskDef

    def start(self, butler, quantumExecutor):
        """Start a subprocess which runs the task.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        """
        # Live database connections held by the butler are a problem with
        # fork-type process activation, so ship a pickled copy of the
        # butler across the process boundary instead.
        pickled_butler = pickle.dumps(butler)
        self.process = multiprocessing.Process(
            target=self._executeJob,
            args=(quantumExecutor, self.taskDef, self.qnode.quantum, pickled_butler),
            name=f"task-{self.index}",
        )
        self.process.start()
        self.started = time.time()
        self.state = JobState.RUNNING

    def _executeJob(self, quantumExecutor, taskDef, quantum, butler_pickle):
        """Subprocess entry point: rebuild the butler and run one quantum.

        Parameters
        ----------
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        quantum : `~lsst.daf.butler.Quantum`
            Quantum for this task execution.
        butler_pickle : `bytes`
            Data butler instance in pickled form.
        """
        quantumExecutor.execute(taskDef, quantum, pickle.loads(butler_pickle))

    def stop(self):
        """Stop the subprocess, escalating to kill if it does not exit."""
        self.process.terminate()
        # Give the process about one second to terminate gracefully; if it
        # is still alive after that, kill it outright.
        attempts = 10
        while attempts:
            time.sleep(0.1)
            if not self.process.is_alive():
                return
            attempts -= 1
        _LOG.debug("Killing process %s", self.process.name)
        self.process.kill()

    def cleanup(self):
        """Release process resources; has to be called for each finished
        process.
        """
        if self.process is not None and not self.process.is_alive():
            self.process.close()
            self.process = None

    def __str__(self):
        return f"<{self.qnode.taskDef} dataId={self.qnode.quantum.dataId}>"
class _JobList:
    """Simple list of _Job instances with a few convenience methods.

    Parameters
    ----------
    iterable : iterable of `~lsst.pipe.base.QuantumNode`
        Sequence of Quanta to execute; has to be ordered according to
        task dependencies.
    """
    def __init__(self, iterable):
        self.jobs = [_Job(node) for node in iterable]

    def pending(self):
        """Return the list of jobs that wait for execution.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return [job for job in self.jobs if job.state is JobState.PENDING]

    def running(self):
        """Return the list of jobs that are currently executing.

        Returns
        -------
        jobs : `list` [`_Job`]
            List of jobs.
        """
        return [job for job in self.jobs if job.state is JobState.RUNNING]

    def finishedNodes(self):
        """Return the set of QuantumNodes that finished successfully.

        Returns
        -------
        QuantumNodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes that have successfully finished.
        """
        return {job.qnode for job in self.jobs if job.state is JobState.FINISHED}

    def failedNodes(self):
        """Return the set of QuantumNodes that failed for any reason.

        Returns
        -------
        QuantumNodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes that failed during processing, including
            timed-out jobs and jobs skipped due to failed dependencies.
        """
        badStates = (JobState.FAILED, JobState.FAILED_DEP, JobState.TIMED_OUT)
        return {job.qnode for job in self.jobs if job.state in badStates}

    def timedOutIds(self):
        """Return the set of QuantumNodes whose jobs timed out.

        Returns
        -------
        QuantumNodes : `set` [`~lsst.pipe.base.QuantumNode`]
            Set of QuantumNodes; note that despite the method name these
            are QuantumNodes, not integer job IDs.
        """
        return {job.qnode for job in self.jobs if job.state is JobState.TIMED_OUT}

    def cleanup(self):
        """Do periodic cleanup for jobs that did not finish correctly.

        If timed out jobs are killed but take too long to stop then regular
        cleanup will not work for them. Here we check all timed out jobs
        periodically and do cleanup if they managed to die by this time.
        """
        for job in self.jobs:
            if job.state is JobState.TIMED_OUT and job.process is not None:
                job.cleanup()
class MPGraphExecutorError(Exception):
    """Exception class for errors raised by MPGraphExecutor."""
class MPTimeoutError(MPGraphExecutorError):
    """Exception raised when task execution times out."""
class MPGraphExecutor(QuantumGraphExecutor):
    """Implementation of QuantumGraphExecutor using same-host multiprocess
    execution of Quanta.

    Parameters
    ----------
    numProc : `int`
        Number of processes to use for executing tasks.
    timeout : `float`
        Time in seconds to wait for tasks to finish.
    quantumExecutor : `QuantumExecutor`
        Executor for single quantum. For multiprocess-style execution when
        ``numProc`` is greater than one this instance must support pickle.
    failFast : `bool`, optional
        If set to ``True`` then stop processing on first error from any task.
    executionGraphFixup : `ExecutionGraphFixup`, optional
        Instance used for modification of execution graph.
    """
    def __init__(self, numProc, timeout, quantumExecutor, *, failFast=False, executionGraphFixup=None):
        self.numProc = numProc
        self.timeout = timeout
        self.quantumExecutor = quantumExecutor
        self.failFast = failFast
        self.executionGraphFixup = executionGraphFixup

    def execute(self, graph, butler):
        # Docstring inherited from QuantumGraphExecutor.execute
        graph = self._fixupQuanta(graph)
        if self.numProc > 1:
            self._executeQuantaMP(graph, butler)
        else:
            self._executeQuantaInProcess(graph, butler)

    def _fixupQuanta(self, graph: QuantumGraph):
        """Call fixup code to modify execution graph.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` to modify

        Returns
        -------
        graph : `QuantumGraph`
            Modified `QuantumGraph`.

        Raises
        ------
        MPGraphExecutorError
            Raised if execution graph cannot be ordered after modification,
            i.e. it has dependency cycles.
        """
        if not self.executionGraphFixup:
            return graph

        _LOG.debug("Call execution graph fixup method")
        graph = self.executionGraphFixup.fixupQuanta(graph)

        # Detect if there is now a cycle created within the graph
        if graph.findCycle():
            raise MPGraphExecutorError(
                "Updated execution graph has dependency cycle.")

        return graph

    def _executeQuantaInProcess(self, graph, butler):
        """Execute all Quanta in current process.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed
        butler : `lsst.daf.butler.Butler`
            Data butler instance
        """
        for qnode in graph:
            _LOG.debug("Executing %s", qnode)
            self.quantumExecutor.execute(qnode.taskDef, qnode.quantum, butler)

    def _stopOtherJobs(self, jobs, failedJob):
        """Terminate every running job except ``failedJob``.

        Used on the fail-fast paths so that no orphan subprocesses are left
        behind when an exception is about to be raised.

        Parameters
        ----------
        jobs : `_JobList`
            List of all jobs.
        failedJob : `_Job`
            The job that triggered the fail-fast exit; it is not stopped
            here because it has already finished or been stopped.
        """
        for stopJob in jobs.running():
            if stopJob is not failedJob:
                stopJob.stop()

    def _executeQuantaMP(self, graph, butler):
        """Execute all Quanta in separate processes.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed.
        butler : `lsst.daf.butler.Butler`
            Data butler instance

        Raises
        ------
        MPGraphExecutorError
            Raised if any task does not support multiprocessing, if a task
            fails while ``failFast`` is set, or at the end of execution if
            one or more tasks failed.
        MPTimeoutError
            Raised if a task times out while ``failFast`` is set, or at the
            end of execution if every failure was a timeout.
        """

        disableImplicitThreading()  # To prevent thread contention

        # re-pack input quantum data into jobs list
        jobs = _JobList(graph)

        # check that all tasks can run in sub-process
        for job in jobs.jobs:
            taskDef = job.taskDef
            if not taskDef.taskClass.canMultiprocess:
                raise MPGraphExecutorError(f"Task {taskDef.taskName} does not support multiprocessing;"
                                           " use single process")

        while jobs.pending() or jobs.running():

            _LOG.debug("#pendingJobs: %s", len(jobs.pending()))
            _LOG.debug("#runningJobs: %s", len(jobs.running()))

            # See if any jobs have finished
            for job in jobs.running():
                if not job.process.is_alive():
                    _LOG.debug("finished: %s", job)
                    # finished
                    exitcode = job.process.exitcode
                    if exitcode == 0:
                        job.state = JobState.FINISHED
                        job.cleanup()
                        _LOG.debug("success: %s", job)
                    else:
                        job.state = JobState.FAILED
                        job.cleanup()
                        _LOG.debug("failed: %s", job)
                        if self.failFast:
                            self._stopOtherJobs(jobs, job)
                            raise MPGraphExecutorError(
                                f"Task {job} failed, exit code={exitcode}."
                            )
                        else:
                            _LOG.error(
                                "Task %s failed; processing will continue for remaining tasks.", job
                            )
                else:
                    # check for timeout
                    now = time.time()
                    if now - job.started > self.timeout:
                        job.state = JobState.TIMED_OUT
                        _LOG.debug("Terminating job %s due to timeout", job)
                        job.stop()
                        job.cleanup()
                        if self.failFast:
                            # Stop the remaining jobs before raising, same
                            # as the failure branch above, so that no
                            # subprocess outlives this exception.
                            self._stopOtherJobs(jobs, job)
                            raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
                        else:
                            _LOG.error(
                                "Timeout (%s sec) for task %s; task is killed, processing continues "
                                "for remaining tasks.", self.timeout, job
                            )

            # see if we can start more jobs
            for job in jobs.pending():

                # check all dependencies; compute the input set once
                inputNodes = graph.determineInputsToQuantumNode(job.qnode)
                if inputNodes & jobs.failedNodes():
                    # upstream job has failed, skipping this
                    job.state = JobState.FAILED_DEP
                    _LOG.error("Upstream job failed for task %s, skipping this task.", job)
                elif inputNodes <= jobs.finishedNodes():
                    # all dependencies have completed, can start new job
                    if len(jobs.running()) < self.numProc:
                        _LOG.debug("Submitting %s", job)
                        job.start(butler, self.quantumExecutor)

            # Do cleanup for timed out jobs if necessary.
            jobs.cleanup()

            # Here we want to wait until one of the running jobs completes
            # but multiprocessing does not provide an API for that, for now
            # just sleep a little bit and go back to the loop.
            if jobs.running():
                time.sleep(0.1)

        if jobs.failedNodes():
            # print list of failed jobs
            _LOG.error("Failed jobs:")
            for job in jobs.jobs:
                if job.state != JobState.FINISHED:
                    _LOG.error(" - %s: %s", job.state, job)

            # if any job failed raise an exception
            if jobs.failedNodes() == jobs.timedOutIds():
                raise MPTimeoutError("One or more tasks timed out during execution.")
            else:
                raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")