Coverage for python/lsst/ctrl/mpexec/mpGraphExecutor.py: 13%
212 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-05 18:04 -0800
1# This file is part of ctrl_mpexec.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Public API of this module.
__all__ = ["MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError"]
24# -------------------------------
25# Imports of standard modules --
26# -------------------------------
27from enum import Enum
28import gc
29import logging
30import multiprocessing
31import pickle
32import sys
33import time
35from lsst.pipe.base.graph.graph import QuantumGraph
36from lsst.pipe.base import InvalidQuantumError
38# -----------------------------
39# Imports for other modules --
40# -----------------------------
41from .quantumGraphExecutor import QuantumGraphExecutor
42from lsst.base import disableImplicitThreading
43from lsst.daf.butler.cli.cliLog import CliLog
_LOG = logging.getLogger(__name__.partition(".")[2])


class JobState(Enum):
    """Possible states for an executing task."""

    PENDING = 1      # job has not started yet
    RUNNING = 2      # job is currently executing
    FINISHED = 3     # job finished successfully
    FAILED = 4       # job execution failed (process returned non-zero status)
    TIMED_OUT = 5    # job is killed due to too long execution time
    FAILED_DEP = 6   # one of the dependencies of this job has failed/timed out
class _Job:
    """Class representing a job running single task.

    Parameters
    ----------
    qnode: `~lsst.pipe.base.QuantumNode`
        Quantum and some associated information.
    """
    def __init__(self, qnode):
        self.qnode = qnode
        # `multiprocessing.Process` once started, reset to None by cleanup().
        self.process = None
        self._state = JobState.PENDING
        # Wall-clock time at which start() launched the process.
        self.started = None

    @property
    def state(self):
        """Job processing state (JobState)"""
        return self._state

    def start(self, butler, quantumExecutor, startMethod=None):
        """Start process which runs the task.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        startMethod : `str`, optional
            Start method from `multiprocessing` module.
        """
        # Unpickling of quantum has to happen after butler, this is why
        # it is pickled manually here.
        pickledQuantum = pickle.dumps(self.qnode.quantum)
        context = multiprocessing.get_context(startMethod)
        self.process = context.Process(
            target=_Job._executeJob,
            args=(quantumExecutor, self.qnode.taskDef, pickledQuantum, butler, CliLog.configState),
            name=f"task-{self.qnode.nodeId.number}",
        )
        self.process.start()
        self.started = time.time()
        self._state = JobState.RUNNING

    @staticmethod
    def _executeJob(quantumExecutor, taskDef, quantum_pickle, butler, logConfigState):
        """Execute a job with arguments; runs in the child process.

        Parameters
        ----------
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        quantum_pickle : `bytes`
            Quantum for this task execution in pickled form.
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        logConfigState : `object`
            Logging configuration captured from the parent process
            (``CliLog.configState``).
        """
        if logConfigState and not CliLog.configState:
            # We are in a freshly spawned Python process and have to
            # re-initialize logging from the parent's configuration.
            CliLog.replayConfigState(logConfigState)

        # Reset the connection pool to avoid sharing database connections
        # with the parent process.
        if butler is not None:
            butler.registry.resetConnectionPool()

        quantumExecutor.execute(taskDef, pickle.loads(quantum_pickle), butler)

    def stop(self):
        """Stop the process.
        """
        self.process.terminate()
        # Poll up to ten times (~1 second total) for the process to exit,
        # then fall back to a hard kill.
        for _ in range(10):
            time.sleep(0.1)
            if not self.process.is_alive():
                return
        _LOG.debug("Killing process %s", self.process.name)
        self.process.kill()

    def cleanup(self):
        """Release processes resources, has to be called for each finished
        process.
        """
        if self.process and not self.process.is_alive():
            self.process.close()
            self.process = None

    def __str__(self):
        return f"<{self.qnode.taskDef} dataId={self.qnode.quantum.dataId}>"
class _JobList:
    """Simple list of _Job instances with few convenience methods.

    Parameters
    ----------
    iterable : iterable of `~lsst.pipe.base.QuantumNode`
        Sequence of Quanta to execute. This has to be ordered according to
        task dependencies.
    """
    def __init__(self, iterable):
        self.jobs = [_Job(qnode) for qnode in iterable]
        # Jobs not yet started; a copy so that ``jobs`` keeps the full list.
        self.pending = self.jobs[:]
        # Jobs currently executing in a sub-process.
        self.running = []
        self.finishedNodes = set()
        self.failedNodes = set()
        self.timedOutNodes = set()

    def submit(self, job, butler, quantumExecutor, startMethod=None):
        """Submit one more job for execution.

        Parameters
        ----------
        job : `_Job`
            Job to submit.
        butler : `lsst.daf.butler.Butler`
            Data butler instance.
        quantumExecutor : `QuantumExecutor`
            Executor for single quantum.
        startMethod : `str`, optional
            Start method from `multiprocessing` module.
        """
        # this will raise if job is not in pending list
        self.pending.remove(job)
        job.start(butler, quantumExecutor, startMethod)
        self.running.append(job)

    def setJobState(self, job, state):
        """Update job state.

        Parameters
        ----------
        job : `_Job`
            Job to update.
        state : `JobState`
            New job state; only FINISHED, FAILED, TIMED_OUT, or FAILED_DEP
            is acceptable.

        Raises
        ------
        ValueError
            Raised if ``state`` is not one of the allowed values.
        """
        allowedStates = (
            JobState.FINISHED,
            JobState.FAILED,
            JobState.TIMED_OUT,
            JobState.FAILED_DEP,
        )
        # Explicit check instead of ``assert`` so invalid states are caught
        # even when Python runs with optimizations enabled (-O).
        if state not in allowedStates:
            raise ValueError(f"State {state} not allowed here")

        # remove job from pending/running lists
        if job.state == JobState.PENDING:
            self.pending.remove(job)
        elif job.state == JobState.RUNNING:
            self.running.remove(job)

        qnode = job.qnode
        # it should not be in any of these, but just in case
        self.finishedNodes.discard(qnode)
        self.failedNodes.discard(qnode)
        self.timedOutNodes.discard(qnode)

        job._state = state
        if state == JobState.FINISHED:
            self.finishedNodes.add(qnode)
        else:
            # FAILED, FAILED_DEP and TIMED_OUT all count as failures; timed
            # out jobs are additionally remembered separately.
            self.failedNodes.add(qnode)
            if state == JobState.TIMED_OUT:
                self.timedOutNodes.add(qnode)

    def cleanup(self):
        """Do periodic cleanup for jobs that did not finish correctly.

        If timed out jobs are killed but take too long to stop then regular
        cleanup will not work for them. Here we check all timed out jobs
        periodically and do cleanup if they managed to die by this time.
        """
        for job in self.jobs:
            if job.state == JobState.TIMED_OUT and job.process is not None:
                job.cleanup()
class MPGraphExecutorError(Exception):
    """Exception class for errors raised by MPGraphExecutor."""
class MPTimeoutError(MPGraphExecutorError):
    """Exception raised when task execution times out."""
class MPGraphExecutor(QuantumGraphExecutor):
    """Implementation of QuantumGraphExecutor using same-host multiprocess
    execution of Quanta.

    Parameters
    ----------
    numProc : `int`
        Number of processes to use for executing tasks.
    timeout : `float`
        Time in seconds to wait for tasks to finish.
    quantumExecutor : `QuantumExecutor`
        Executor for single quantum. For multiprocess-style execution when
        ``numProc`` is greater than one this instance must support pickle.
    startMethod : `str`, optional
        Start method from `multiprocessing` module, `None` selects the best
        one for current platform.
    failFast : `bool`, optional
        If set to ``True`` then stop processing on first error from any task.
    executionGraphFixup : `ExecutionGraphFixup`, optional
        Instance used for modification of execution graph.
    """
    def __init__(self, numProc, timeout, quantumExecutor, *,
                 startMethod=None, failFast=False, executionGraphFixup=None):
        self.numProc = numProc
        self.timeout = timeout
        self.quantumExecutor = quantumExecutor
        self.failFast = failFast
        self.executionGraphFixup = executionGraphFixup

        # We set default start method as spawn for MacOS and fork for Linux;
        # None for all other platforms to use multiprocessing default.
        if startMethod is None:
            methods = dict(linux="fork", darwin="spawn")
            startMethod = methods.get(sys.platform)
        self.startMethod = startMethod

    def execute(self, graph, butler):
        # Docstring inherited from QuantumGraphExecutor.execute
        graph = self._fixupQuanta(graph)
        if self.numProc > 1:
            self._executeQuantaMP(graph, butler)
        else:
            self._executeQuantaInProcess(graph, butler)

    def _fixupQuanta(self, graph: QuantumGraph):
        """Call fixup code to modify execution graph.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` to modify

        Returns
        -------
        graph : `QuantumGraph`
            Modified `QuantumGraph`.

        Raises
        ------
        MPGraphExecutorError
            Raised if execution graph cannot be ordered after modification,
            i.e. it has dependency cycles.
        """
        if not self.executionGraphFixup:
            return graph

        _LOG.debug("Call execution graph fixup method")
        graph = self.executionGraphFixup.fixupQuanta(graph)

        # Detect if there is now a cycle created within the graph
        if graph.findCycle():
            raise MPGraphExecutorError(
                "Updated execution graph has dependency cycle.")

        return graph

    def _executeQuantaInProcess(self, graph, butler):
        """Execute all Quanta in current process.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed
        butler : `lsst.daf.butler.Butler`
            Data butler instance

        Raises
        ------
        MPGraphExecutorError
            Raised on first failure when ``failFast`` is set, otherwise after
            all remaining quanta have been attempted if any failed.
        """
        successCount, totalCount = 0, len(graph)
        failedNodes = set()
        for qnode in graph:

            # Any failed inputs mean that the quantum has to be skipped.
            inputNodes = graph.determineInputsToQuantumNode(qnode)
            if inputNodes & failedNodes:
                _LOG.error(
                    "Upstream job failed for task <%s dataId=%s>, skipping this task.",
                    qnode.taskDef,
                    qnode.quantum.dataId,
                )
                failedNodes.add(qnode)
                continue

            _LOG.debug("Executing %s", qnode)
            try:
                self.quantumExecutor.execute(qnode.taskDef, qnode.quantum, butler)
                successCount += 1
            except Exception as exc:
                failedNodes.add(qnode)
                if self.failFast:
                    raise MPGraphExecutorError(
                        f"Task <{qnode.taskDef} dataId={qnode.quantum.dataId}> failed."
                    ) from exc
                else:
                    # Note that there could be exception safety issues, which
                    # we presently ignore.
                    _LOG.error(
                        "Task <%s dataId=%s> failed; processing will continue for remaining tasks.",
                        qnode.taskDef,
                        qnode.quantum.dataId,
                        exc_info=exc,
                    )
            finally:
                # sqlalchemy has some objects that can last until a garbage
                # collection cycle is run, which can happen at unpredictable
                # times, run a collection loop here explicitly.
                gc.collect()

        _LOG.info(
            "Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
            successCount,
            len(failedNodes),
            totalCount - successCount - len(failedNodes),
            totalCount,
        )

        # Raise an exception if there were any failures.
        if failedNodes:
            raise MPGraphExecutorError("One or more tasks failed during execution.")

    def _executeQuantaMP(self, graph, butler):
        """Execute all Quanta in separate processes.

        Parameters
        ----------
        graph : `QuantumGraph`
            `QuantumGraph` that is to be executed.
        butler : `lsst.daf.butler.Butler`
            Data butler instance

        Raises
        ------
        MPGraphExecutorError
            Raised if any task does not support multiprocessing or if any
            task failed.
        MPTimeoutError
            Raised if every failure was a timeout (or immediately on timeout
            when ``failFast`` is set).
        """

        disableImplicitThreading()  # To prevent thread contention

        _LOG.debug("Using %r for multiprocessing start method", self.startMethod)

        # re-pack input quantum data into jobs list
        jobs = _JobList(graph)

        # check that all tasks can run in sub-process
        for job in jobs.jobs:
            taskDef = job.qnode.taskDef
            if not taskDef.taskClass.canMultiprocess:
                raise MPGraphExecutorError(f"Task {taskDef.taskName} does not support multiprocessing;"
                                           " use single process")

        finishedCount, failedCount = 0, 0
        while jobs.pending or jobs.running:

            _LOG.debug("#pendingJobs: %s", len(jobs.pending))
            _LOG.debug("#runningJobs: %s", len(jobs.running))

            # See if any jobs have finished. Iterate over a snapshot because
            # setJobState() removes the job from ``jobs.running``; mutating a
            # list while iterating it would silently skip the next element.
            for job in list(jobs.running):
                if not job.process.is_alive():
                    _LOG.debug("finished: %s", job)
                    # finished
                    exitcode = job.process.exitcode
                    if exitcode == 0:
                        jobs.setJobState(job, JobState.FINISHED)
                        job.cleanup()
                        _LOG.debug("success: %s took %.3f seconds", job, time.time() - job.started)
                    else:
                        jobs.setJobState(job, JobState.FAILED)
                        job.cleanup()
                        _LOG.debug("failed: %s", job)
                        if self.failFast or exitcode == InvalidQuantumError.EXIT_CODE:
                            # stop all other running jobs before bailing out
                            for stopJob in jobs.running:
                                if stopJob is not job:
                                    stopJob.stop()
                            raise MPGraphExecutorError(
                                f"Task {job} failed, exit code={exitcode}."
                            )
                        else:
                            _LOG.error(
                                "Task %s failed; processing will continue for remaining tasks.", job
                            )
                else:
                    # check for timeout
                    now = time.time()
                    if now - job.started > self.timeout:
                        jobs.setJobState(job, JobState.TIMED_OUT)
                        _LOG.debug("Terminating job %s due to timeout", job)
                        job.stop()
                        job.cleanup()
                        if self.failFast:
                            raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
                        else:
                            _LOG.error(
                                "Timeout (%s sec) for task %s; task is killed, processing continues "
                                "for remaining tasks.", self.timeout, job
                            )

            # Fail jobs whose inputs failed, this may need several iterations
            # if the order is not right, will be done in the next loop.
            if jobs.failedNodes:
                # Snapshot: setJobState() removes the job from ``jobs.pending``.
                for job in list(jobs.pending):
                    jobInputNodes = graph.determineInputsToQuantumNode(job.qnode)
                    if jobInputNodes & jobs.failedNodes:
                        jobs.setJobState(job, JobState.FAILED_DEP)
                        _LOG.error("Upstream job failed for task %s, skipping this task.", job)

            # see if we can start more jobs
            if len(jobs.running) < self.numProc:
                # Snapshot: submit() removes the job from ``jobs.pending``.
                for job in list(jobs.pending):
                    jobInputNodes = graph.determineInputsToQuantumNode(job.qnode)
                    if jobInputNodes <= jobs.finishedNodes:
                        # all dependencies have completed, can start new job
                        if len(jobs.running) < self.numProc:
                            _LOG.debug("Submitting %s", job)
                            jobs.submit(job, butler, self.quantumExecutor, self.startMethod)
                        if len(jobs.running) >= self.numProc:
                            # Cannot start any more jobs, wait until something
                            # finishes.
                            break

            # Do cleanup for timed out jobs if necessary.
            jobs.cleanup()

            # Print progress message if something changed.
            newFinished, newFailed = len(jobs.finishedNodes), len(jobs.failedNodes)
            if (finishedCount, failedCount) != (newFinished, newFailed):
                finishedCount, failedCount = newFinished, newFailed
                totalCount = len(jobs.jobs)
                _LOG.info("Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
                          finishedCount, failedCount, totalCount - finishedCount - failedCount, totalCount)

            # Here we want to wait until one of the running jobs completes
            # but multiprocessing does not provide an API for that, for now
            # just sleep a little bit and go back to the loop.
            if jobs.running:
                time.sleep(0.1)

        if jobs.failedNodes:
            # print list of failed jobs
            _LOG.error("Failed jobs:")
            for job in jobs.jobs:
                if job.state != JobState.FINISHED:
                    _LOG.error("  - %s: %s", job.state.name, job)

            # if any job failed raise an exception
            if jobs.failedNodes == jobs.timedOutNodes:
                raise MPTimeoutError("One or more tasks timed out during execution.")
            else:
                raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")