3 from __future__
import print_function
4 from builtins
import object
16 from lsst.pipe.base
import CmdLineTask, TaskRunner
17 from .pool
import startPool, Pool, NODE, abortOnError, setBatchType
20 __all__ = [
"Batch",
"PbsBatch",
"SlurmBatch",
"SmpBatch",
"BATCH_TYPES",
"BatchArgumentParser",
21 "BatchCmdLineTask",
"BatchPoolTask", ]
27 _quote_pos = re.compile(
'(?=[^-0-9a-zA-Z_./\n])')
31 r"""Quote the argument for the shell.
40 return _quote_pos.sub(
'\\\\', arg).replace(
'\n',
"'\n'")
46 """Convert a list of shell arguments to a shell command-line"""
47 return ' '.join([
shQuote(a)
for a
in args])
51 """Collect Linux-specific process statistics
53 Parses the /proc/self/status file (N.B. Linux-specific!) into a dict
57 with open(
"/proc/self/status")
as f:
59 key, _, value = line.partition(
":")
60 result[key] = value.strip()
65 """Print the process statistics to the log"""
66 from lsst.log
import Log
67 log = Log.getDefaultLogger()
68 log.info(
"Process stats for %s: %s" % (NODE,
processStats()))
72 """Base class for batch submission"""
74 def __init__(self, outputDir=None, numNodes=0, numProcsPerNode=0, numCores=0, queue=None, jobName=None,
75 walltime=0.0, dryrun=
False, doExec=
False, mpiexec=
"", submit=
None, options=
None,
79 @param outputDir: output directory, or None
80 @param numNodes: number of nodes
81 @param numProcsPerNode: number of processors per node
82 @param numCores: number of cores (Slurm, SMP only)
83 @param queue: name of queue, or None
84 @param jobName: name of job, or None
85 @param walltime: maximum wall clock time for job
86 @param dryrun: Dry run (only print actions that would be taken)?
87 @param doExec: exec the script instead of submitting to batch system?
88 @param mpiexec: options for mpiexec
89 @param submit: command-line options for batch submission (e.g., for qsub, sbatch)
90 @param options: options to append to script header (e.g., #PBS or #SBATCH)
91 @param verbose: produce verbose output?
93 if (numNodes <= 0
or numProcsPerNode <= 0)
and numCores <= 0:
94 raise RuntimeError(
"Must specify numNodes+numProcs or numCores")
114 """Return preamble string for script to be submitted
116 Most batch systems allow you to embed submission options as comments here.
118 raise NotImplementedError(
"Not implemented for base class")
121 """Return execution string for script to be submitted"""
123 "umask %03o" % UMASK,
124 "cd %s" % pipes.quote(os.getcwd()),
127 script += [
"echo \"mpiexec is at: $(which mpiexec)\"",
129 "echo 'umask: ' $(umask)",
134 script += [
"mpiexec %s %s" % (self.
mpiexec, command)]
139 return "\n".join(script)
142 """!Create script to be submitted
144 @param command: command to run
145 @param walltime: maximum wall clock time, overrides value to constructor
146 @return name of script on filesystem
148 fd, scriptName = tempfile.mkstemp()
149 with os.fdopen(fd,
"w")
as f:
157 os.chmod(scriptName, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
161 """!Return command to submit script
163 @param scriptName: name of script on filesystem
165 raise NotImplementedError(
"No implementation for base class")
167 def run(self, command, walltime=None):
168 """!Run the batch system
170 Creates and submits the script to execute the provided command
172 @param command: command to run
173 @param walltime: maximum wall clock time, overrides value to constructor
174 @return name of script on filesystem
176 scriptName = self.
createScript(command, walltime=walltime)
179 print(
"Would run: %s" % command)
181 os.execl(scriptName, scriptName)
188 """Batch submission with PBS"""
194 raise RuntimeError(
"Non-positive walltime: %s (did you forget '--time'?)" % (walltime,))
197 "Number of nodes (--nodes=%d) or number of processors per node (--procs=%d) not set" %
200 raise RuntimeError(
"PBS does not support setting the number of cores")
204 "#PBS -l walltime=%d" % walltime
if walltime
is not None else "",
207 "#PBS -q %s" % self.
queue if self.
queue is not None else "",
209 "#PBS -W umask=%03o" % UMASK,
213 return "qsub %s -V %s" % (self.
submit if self.
submit is not None else "", scriptName)
217 """Batch submission with Slurm"""
223 raise RuntimeError(
"Non-positive walltime: %s (did you forget '--time'?)" % (walltime,))
226 "Number of nodes (--nodes=%d) and number of processors per node (--procs=%d) not set OR "
229 raise RuntimeError(
"Must set either --nodes,--procs or --cores: not both")
232 filename = os.path.join(outputDir, (self.
jobName if self.
jobName is not None else "slurm") +
".o%j")
233 return "\n".join([(
"#SBATCH --nodes=%d" % self.
numNodes)
if self.
numNodes > 0
else "",
237 "#SBATCH --time=%d" % max(walltime/60.0 + 0.5, 1)
if walltime
is not None else "",
238 "#SBATCH --job-name=%s" % self.
jobName if self.
jobName is not None else "",
239 "#SBATCH -p %s" % self.
queue if self.
queue is not None else "",
240 "#SBATCH --output=%s" % filename,
241 "#SBATCH --error=%s" % filename,
246 return "sbatch %s %s" % (self.
submit if self.
submit is not None else "", scriptName)
250 """Not-really-Batch submission with multiple cores on the current node
252 The job is run immediately.
256 super(SmpBatch, self).
__init__(*args, **kwargs)
263 raise RuntimeError(
"SMP does not support the --nodes and --procs command-line options; "
264 "use --cores to specify the number of cores to use")
274 return "exec %s" % scriptName
277 BATCH_TYPES = {
'none' :
None,
286 """An argument parser to get relevant parameters for batch submission
288 We want to be able to display the help for a 'parent' ArgumentParser
289 along with the batch-specific options we introduce in this class, but
290 we don't want to swallow the parent (i.e., ArgumentParser(parents=[parent]))
291 because we want to save the list of arguments that this particular
292 BatchArgumentParser doesn't parse, so they can be passed on to a different
293 program (though we also want to parse them to check that they can be parsed).
297 super(BatchArgumentParser, self).
__init__(*args, **kwargs)
299 group = self.add_argument_group(
"Batch submission options")
300 group.add_argument(
"--queue", help=
"Queue name")
301 group.add_argument(
"--job", help=
"Job name")
302 group.add_argument(
"--nodes", type=int, default=0, help=
"Number of nodes")
303 group.add_argument(
"--procs", type=int, default=0, help=
"Number of processors per node")
304 group.add_argument(
"--cores", type=int, default=0, help=
"Number of cores (Slurm/SMP only)")
305 group.add_argument(
"--time", type=float, default=0,
306 help=
"Expected execution time per element (sec)")
307 group.add_argument(
"--batch-type", dest=
"batchType", choices=list(BATCH_TYPES.keys()), default=
"smp",
308 help=
"Batch system to use")
309 group.add_argument(
"--batch-verbose", dest=
"batchVerbose", action=
"store_true", default=
False,
310 help=(
"Enable verbose output in batch script "
311 "(including system environment information at batch start)?"))
312 group.add_argument(
"--batch-output", dest=
"batchOutput", help=
"Output directory")
313 group.add_argument(
"--batch-submit", dest=
"batchSubmit", help=
"Batch submission command-line flags")
314 group.add_argument(
"--batch-options", dest=
"batchOptions", help=
"Header options for batch script")
315 group.add_argument(
"--batch-profile", dest=
"batchProfile", action=
"store_true", default=
False,
316 help=
"Enable profiling on batch job?")
317 group.add_argument(
"--batch-stats", dest=
"batchStats", action=
"store_true", default=
False,
318 help=
"Print process stats on completion (Linux only)?")
319 group.add_argument(
"--dry-run", dest=
"dryrun", default=
False, action=
"store_true",
321 group.add_argument(
"--do-exec", dest=
"doExec", default=
False, action=
"store_true",
322 help=
"Exec script instead of submit to batch system?")
323 group.add_argument(
"--mpiexec", default=
"", help=
"mpiexec options")
325 def parse_args(self, config=None, args=None, namespace=None, **kwargs):
326 args, leftover = super(BatchArgumentParser, self).parse_known_args(args=args, namespace=namespace)
329 if len(leftover) > 0:
332 self.error(
"Unrecognised arguments: %s" % leftover)
333 args.parent = self._parent.parse_args(config, args=leftover, **kwargs)
334 args.leftover = leftover
339 """Create a Batch object from the command-line arguments"""
341 argMapping = {
'outputDir':
'batchOutput',
343 'numProcsPerNode':
'procs',
350 'mpiexec':
'mpiexec',
351 'submit':
'batchSubmit',
352 'options':
'batchOptions',
353 'verbose':
'batchVerbose',
356 if BATCH_TYPES[args.batchType]
is None:
360 kwargs = {k: getattr(args, v)
for k, v
in argMapping.items()}
361 return BATCH_TYPES[args.batchType](**kwargs)
364 text =
"""This is a script for queue submission of a wrapped script.
366 Use this program name and ignore that for the wrapped script (it will be
367 passed on to the batch system). Arguments for *both* this wrapper script or the
368 wrapped script are valid (if it is required for the wrapped script, it
369 is required for the wrapper as well).
371 *** Batch system submission wrapper:
374 text += super(BatchArgumentParser, self).
format_help()
381 text += self._parent.format_help()
386 prog = self._parent.prog
387 self._parent.prog = self.prog
388 usage = self._parent.format_usage()
389 self._parent.prog = prog
395 """Generate bash script to regenerate the current environment"""
397 for key, val
in os.environ.items():
398 if key
in (
"DISPLAY",):
400 if val.startswith(
"() {"):
407 if key.startswith(
"BASH_FUNC_")
and key.endswith(
"()"):
410 output +=
"{key} {val}\nexport -f {key}\n".format(key=key, val=val)
413 output +=
"export {key}='{val}'\n".format(key=key, val=val.replace(
"'",
"'\"'\"'"))
421 taskParser = cls._makeArgumentParser(doBatch=
True, add_help=
False)
423 batchArgs = batchParser.parse_args(config=cls.ConfigClass(), args=args, override=cls.applyOverrides,
426 if not cls.RunnerClass(cls, batchArgs.parent).precall(batchArgs.parent):
427 taskParser.error(
"Error in task preparation")
431 if batchArgs.batch
is None:
432 sys.argv = [sys.argv[0]] + batchArgs.leftover
434 return cls.parseAndRun()
436 numCores = batchArgs.cores
if batchArgs.cores > 0
else batchArgs.nodes*batchArgs.procs
437 walltime = cls.batchWallTime(batchArgs.time, batchArgs.parent, numCores)
439 command = cls.batchCommand(batchArgs)
440 batchArgs.batch.run(command, walltime=walltime)
444 """!Return walltime request for batch job
446 Subclasses should override if the walltime should be calculated
447 differently (e.g., addition of some serial time).
450 @param time: Requested time per iteration
451 @param parsedCmd: Results of argument parsing
452 @param numCores: Number of cores
454 numTargets = len(cls.RunnerClass.getTargetList(parsedCmd))
455 return time*numTargets/float(numCores)
459 """!Return command to run CmdLineTask
462 @param args: Parsed batch job arguments (from BatchArgumentParser)
464 job = args.job
if args.job
is not None else "job"
465 module = cls.__module__
466 script = (
"import os; os.umask(%#05o); " +
467 "import lsst.base; lsst.base.disableImplicitThreading(); " +
468 "import lsst.ctrl.pool.log; lsst.ctrl.pool.log.jobLog(\"%s\"); ") % (UMASK, job)
471 script += (
"import lsst.ctrl.pool.parallel; import atexit; " +
472 "atexit.register(lsst.ctrl.pool.parallel.printProcessStats); ")
474 script +=
"import %s; %s.%s.parseAndRun();" % (module, module, cls.__name__)
476 profilePre =
"import cProfile; import os; cProfile.run(\"\"\""
477 profilePost =
"\"\"\", filename=\"profile-" + job +
"-%s-%d.dat\" % (os.uname()[1], os.getpid()))"
479 return (
"python -c '" + (profilePre
if args.batchProfile
else "") + script +
480 (profilePost
if args.batchProfile
else "") +
"' " +
shCommandFromArgs(args.leftover))
482 @contextlib.contextmanager
484 """!Provide a context manager for logging an operation
486 @param operation: description of operation (string)
487 @param catch: Catch all exceptions?
488 @param trace: Log a traceback of caught exception?
490 Note that if 'catch' is True, all exceptions are swallowed, but there may
491 be other side-effects such as undefined variables.
493 self.log.info(
"%s: Start %s" % (NODE, operation))
498 cls, e, _ = sys.exc_info()
499 self.log.warn(
"%s: Caught %s while %s: %s" % (NODE, cls.__name__, operation, e))
501 self.log.info(
"%s: Traceback:\n%s" % (NODE, traceback.format_exc()))
505 self.log.info(
"%s: Finished %s" % (NODE, operation))
509 """Starts a BatchCmdLineTask with an MPI process pool
511 Use this subclass of BatchCmdLineTask if you want to use the Pool directly.
516 """Run with a MPI process pool"""
518 super(BatchPoolTask, cls).
parseAndRun(*args, **kwargs)
523 """Run a Task individually on a list of inputs using the MPI process pool"""
528 Warn if the user specified multiprocessing.
530 TaskRunner.__init__(self, *args, **kwargs)
532 self.log.warn(
"Multiprocessing arguments (-j %d) ignored since using batch processing" %
537 """Run the task on all targets
539 Sole input is the result of parsing the command-line with the ArgumentParser.
541 Output is None if 'precall' failed; otherwise it is a list of the results of calling ourself
542 on each element of the target list from the 'getTargetList' method.
546 import multiprocessing
547 self.prepareForMultiProcessing()
550 if self.precall(parsedCmd):
551 targetList = self.getTargetList(parsedCmd)
552 if len(targetList) > 0:
553 parsedCmd.log.info(
"Processing %d targets with a pool of %d processes..." %
554 (len(targetList), pool.size))
556 resultList = pool.map(self, targetList)
558 parsedCmd.log.warn(
"Not running the task because there is no data to process; "
559 "you may preview data using \"--show data\"")
566 """Run the Task on a single target
568 Strips out the process pool 'cache' argument.
570 'args' are those arguments provided by the getTargetList method.
572 Brings down the entire job if an exception is not caught (i.e., --doraise).
574 return TaskRunner.__call__(self, args)
578 """Runs the BatchCmdLineTask in parallel
580 Use this subclass of BatchCmdLineTask if you don't need to use the Pool
581 directly, but just want to iterate over many objects (like a multi-node
582 version of the '-j' command-line argument).
584 RunnerClass = BatchTaskRunner
587 def _makeArgumentParser(cls, *args, **kwargs):
588 """Build an ArgumentParser
590 Removes the batch-specific parts in order to delegate to the parent classes.
592 kwargs.pop(
"doBatch",
False)
593 kwargs.pop(
"add_help",
False)
594 return super(BatchCmdLineTask, cls)._makeArgumentParser(*args, **kwargs)
598 """Parse an argument list and run the command
600 This is the entry point when we run in earnest, so start the process pool
601 so that the worker nodes don't go any further.
604 results = super(BatchParallelTask, cls).
parseAndRun(*args, **kwargs)
def run
Run the batch system.
def submitCommand
Return command to submit script.
def batchWallTime
Return walltime request for batch job.
def startPool
Start a process pool.
def logOperation
Provide a context manager for logging an operation.
def createScript
Create script to be submitted.
def batchCommand
Return command to run CmdLineTask.