3 from __future__
import print_function
4 from builtins
import object
17 from .pool
import startPool, Pool, NODE, abortOnError, setBatchType
# Public names exported by a star-import of this module.
__all__ = [
    "Batch",
    "PbsBatch",
    "SlurmBatch",
    "SmpBatch",
    "BATCH_TYPES",
    "BatchArgumentParser",
    "BatchCmdLineTask",
    "BatchPoolTask",
]
27 _quote_pos = re.compile(
'(?=[^-0-9a-zA-Z_./\n])')
31 r"""Quote the argument for the shell. 40 return _quote_pos.sub(
'\\\\', arg).replace(
'\n',
"'\n'")
def shCommandFromArgs(args):
    """Convert a list of shell arguments to a shell command-line

    Every argument is passed through shQuote and the quoted results are
    joined with single spaces.
    """
    quoted = [shQuote(word) for word in args]
    return ' '.join(quoted)
51 """Collect Linux-specific process statistics 53 Parses the /proc/self/status file (N.B. Linux-specific!) into a dict 57 with open(
"/proc/self/status")
as f:
59 key, _, value = line.partition(
":")
60 result[key] = value.strip()
65 """Print the process statistics to the log""" 67 log = Log.getDefaultLogger()
68 log.info(
"Process stats for %s: %s" % (NODE,
processStats()))
72 """Base class for batch submission""" 74 def __init__(self, outputDir=None, numNodes=0, numProcsPerNode=0, numCores=0, queue=None, jobName=None,
75 walltime=0.0, dryrun=False, doExec=False, mpiexec="", submit=None, options=None,
79 @param outputDir: output directory, or None 80 @param numNodes: number of nodes 81 @param numProcsPerNode: number of processors per node 82 @param numCores: number of cores (Slurm, SMP only) 83 @param queue: name of queue, or None 84 @param jobName: name of job, or None 85 @param walltime: maximum wall clock time for job 86 @param dryrun: Dry run (only print actions that would be taken)? 87 @param doExec: exec the script instead of submitting to batch system? 88 @param mpiexec: options for mpiexec 89 @param submit: command-line options for batch submission (e.g., for qsub, sbatch) 90 @param options: options to append to script header (e.g., #PBS or #SBATCH) 91 @param verbose: produce verbose output? 93 if (numNodes <= 0
or numProcsPerNode <= 0)
and numCores <= 0:
94 raise RuntimeError(
"Must specify numNodes+numProcs or numCores")
114 """Return preamble string for script to be submitted 116 Most batch systems allow you to embed submission options as comments here. 118 raise NotImplementedError(
"Not implemented for base class")
121 """Return execution string for script to be submitted""" 123 "umask %03o" % UMASK,
124 "cd %s" % pipes.quote(os.getcwd()),
127 script += [
"echo \"mpiexec is at: $(which mpiexec)\"",
129 "echo 'umask: ' $(umask)",
134 script += [
"mpiexec %s %s" % (self.
mpiexec, command)]
139 return "\n".join(script)
142 """!Create script to be submitted 144 @param command: command to run 145 @param walltime: maximum wall clock time, overrides value to constructor 146 @return name of script on filesystem 148 fd, scriptName = tempfile.mkstemp()
149 with os.fdopen(fd,
"w")
as f:
157 os.chmod(scriptName, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
def submitCommand(self, scriptName):
    """!Return command to submit script

    The base class has no implementation and always raises.

    @param scriptName: name of script on filesystem
    """
    reason = "No implementation for base class"
    raise NotImplementedError(reason)
167 def run(self, command, walltime=None):
168 """!Run the batch system 170 Creates and submits the script to execute the provided command 172 @param command: command to run 173 @param walltime: maximum wall clock time, overrides value to constructor 174 @return name of script on filesystem 176 scriptName = self.
createScript(command, walltime=walltime)
179 print(
"Would run: %s" % command)
181 os.execl(scriptName, scriptName)
188 """Batch submission with PBS""" 194 raise RuntimeError(
"Non-positive walltime: %s (did you forget '--time'?)" % (walltime,))
197 "Number of nodes (--nodes=%d) or number of processors per node (--procs=%d) not set" %
200 raise RuntimeError(
"PBS does not support setting the number of cores")
204 "#PBS -l walltime=%d" % walltime
if walltime
is not None else "",
207 "#PBS -q %s" % self.
queue if self.
queue is not None else "",
209 "#PBS -W umask=%03o" % UMASK,
213 return "qsub %s -V %s" % (self.
submit if self.
submit is not None else "", scriptName)
217 """Batch submission with Slurm""" 223 raise RuntimeError(
"Non-positive walltime: %s (did you forget '--time'?)" % (walltime,))
226 "Number of nodes (--nodes=%d) and number of processors per node (--procs=%d) not set OR " 229 raise RuntimeError(
"Must set either --nodes,--procs or --cores: not both")
232 filename = os.path.join(outputDir, (self.
jobName if self.
jobName is not None else "slurm") +
".o%j")
233 return "\n".join([(
"#SBATCH --nodes=%d" % self.
numNodes)
if self.
numNodes > 0
else "",
237 "#SBATCH --time=%d" % max(walltime/60.0 + 0.5, 1)
if walltime
is not None else "",
238 "#SBATCH --job-name=%s" % self.
jobName if self.
jobName is not None else "",
239 "#SBATCH -p %s" % self.
queue if self.
queue is not None else "",
240 "#SBATCH --output=%s" % filename,
241 "#SBATCH --error=%s" % filename,
246 return "sbatch %s %s" % (self.
submit if self.
submit is not None else "", scriptName)
250 """Not-really-Batch submission with multiple cores on the current node 252 The job is run immediately. 256 super(SmpBatch, self).
__init__(*args, **kwargs)
263 raise RuntimeError(
"SMP does not support the --nodes and --procs command-line options; " 264 "use --cores to specify the number of cores to use")
274 return "exec %s" % scriptName
277 BATCH_TYPES = {
'none' :
None,
286 """An argument parser to get relevant parameters for batch submission 288 We want to be able to display the help for a 'parent' ArgumentParser 289 along with the batch-specific options we introduce in this class, but 290 we don't want to swallow the parent (i.e., ArgumentParser(parents=[parent])) 291 because we want to save the list of arguments that this particular 292 BatchArgumentParser doesn't parse, so they can be passed on to a different 293 program (though we also want to parse them to check that they can be parsed). 297 super(BatchArgumentParser, self).
__init__(*args, **kwargs)
299 group = self.add_argument_group(
"Batch submission options")
300 group.add_argument(
"--queue", help=
"Queue name")
301 group.add_argument(
"--job", help=
"Job name")
302 group.add_argument(
"--nodes", type=int, default=0, help=
"Number of nodes")
303 group.add_argument(
"--procs", type=int, default=0, help=
"Number of processors per node")
304 group.add_argument(
"--cores", type=int, default=0, help=
"Number of cores (Slurm/SMP only)")
305 group.add_argument(
"--time", type=float, default=0,
306 help=
"Expected execution time per element (sec)")
307 group.add_argument(
"--batch-type", dest=
"batchType", choices=list(BATCH_TYPES.keys()), default=
"smp",
308 help=
"Batch system to use")
309 group.add_argument(
"--batch-verbose", dest=
"batchVerbose", action=
"store_true", default=
False,
310 help=(
"Enable verbose output in batch script " 311 "(including system environment information at batch start)?"))
312 group.add_argument(
"--batch-output", dest=
"batchOutput", help=
"Output directory")
313 group.add_argument(
"--batch-submit", dest=
"batchSubmit", help=
"Batch submission command-line flags")
314 group.add_argument(
"--batch-options", dest=
"batchOptions", help=
"Header options for batch script")
315 group.add_argument(
"--batch-profile", dest=
"batchProfile", action=
"store_true", default=
False,
316 help=
"Enable profiling on batch job?")
317 group.add_argument(
"--batch-stats", dest=
"batchStats", action=
"store_true", default=
False,
318 help=
"Print process stats on completion (Linux only)?")
319 group.add_argument(
"--dry-run", dest=
"dryrun", default=
False, action=
"store_true",
321 group.add_argument(
"--do-exec", dest=
"doExec", default=
False, action=
"store_true",
322 help=
"Exec script instead of submit to batch system?")
323 group.add_argument(
"--mpiexec", default=
"", help=
"mpiexec options")
325 def parse_args(self, config=None, args=None, namespace=None, **kwargs):
326 args, leftover = super(BatchArgumentParser, self).parse_known_args(args=args, namespace=namespace)
329 if len(leftover) > 0:
332 self.error(
"Unrecognised arguments: %s" % leftover)
334 args.leftover = leftover
339 """Create a Batch object from the command-line arguments""" 341 argMapping = {
'outputDir':
'batchOutput',
343 'numProcsPerNode':
'procs',
350 'mpiexec':
'mpiexec',
351 'submit':
'batchSubmit',
352 'options':
'batchOptions',
353 'verbose':
'batchVerbose',
356 if BATCH_TYPES[args.batchType]
is None:
360 kwargs = {k: getattr(args, v)
for k, v
in argMapping.items()}
361 return BATCH_TYPES[args.batchType](**kwargs)
364 text =
"""This is a script for queue submission of a wrapped script. 366 Use this program name and ignore that for the wrapped script (it will be 367 passed on to the batch system). Arguments for *both* this wrapper script or the 368 wrapped script are valid (if it is required for the wrapped script, it 369 is required for the wrapper as well). 371 *** Batch system submission wrapper: 374 text += super(BatchArgumentParser, self).
format_help()
395 """Generate bash script to regenerate the current environment""" 397 for key, val
in os.environ.items():
398 if key
in (
"DISPLAY",):
400 if val.startswith(
"() {"):
407 if key.startswith(
"BASH_FUNC_")
and key.endswith(
"()"):
410 output +=
"{key} {val}\nexport -f {key}\n".format(key=key, val=val)
413 output +=
"export {key}='{val}'\n".format(key=key, val=val.replace(
"'",
"'\"'\"'"))
421 taskParser = cls._makeArgumentParser(doBatch=
True, add_help=
False)
423 batchArgs = batchParser.parse_args(config=cls.ConfigClass(), args=args, override=cls.applyOverrides,
426 if not cls.RunnerClass(cls, batchArgs.parent).precall(batchArgs.parent):
427 taskParser.error(
"Error in task preparation")
431 if batchArgs.batch
is None:
432 sys.argv = [sys.argv[0]] + batchArgs.leftover
434 return cls.parseAndRun()
436 numCores = batchArgs.cores
if batchArgs.cores > 0
else batchArgs.nodes*batchArgs.procs
437 walltime = cls.
batchWallTime(batchArgs.time, batchArgs.parent, numCores)
440 batchArgs.batch.run(command, walltime=walltime)
def batchWallTime(cls, time, parsedCmd, numCores):
    """!Return walltime request for batch job

    The request is the time per iteration multiplied by the number of
    targets in the runner's target list, divided by the number of cores.

    Subclasses should override if the walltime should be calculated
    differently (e.g., addition of some serial time).

    @param cls: Class
    @param time: Requested time per iteration
    @param parsedCmd: Results of argument parsing
    @param numCores: Number of cores
    """
    targetList = cls.RunnerClass.getTargetList(parsedCmd)
    totalTime = time*len(targetList)
    return totalTime/float(numCores)
459 """!Return command to run CmdLineTask 462 @param args: Parsed batch job arguments (from BatchArgumentParser) 464 job = args.job
if args.job
is not None else "job" 465 module = cls.__module__
466 script = (
"import os; os.umask(%#05o); " +
467 "import lsst.base; lsst.base.disableImplicitThreading(); " +
468 "import lsst.ctrl.pool.log; lsst.ctrl.pool.log.jobLog(\"%s\"); ") % (UMASK, job)
471 script += (
"import lsst.ctrl.pool.parallel; import atexit; " +
472 "atexit.register(lsst.ctrl.pool.parallel.printProcessStats); ")
474 script +=
"import %s; %s.%s.parseAndRun();" % (module, module, cls.__name__)
476 profilePre =
"import cProfile; import os; cProfile.run(\"\"\"" 477 profilePost =
"\"\"\", filename=\"profile-" + job +
"-%s-%d.dat\" % (os.uname()[1], os.getpid()))" 479 return (
"python -c '" + (profilePre
if args.batchProfile
else "") + script +
480 (profilePost
if args.batchProfile
else "") +
"' " +
shCommandFromArgs(args.leftover) +
483 @contextlib.contextmanager
485 """!Provide a context manager for logging an operation 487 @param operation: description of operation (string) 488 @param catch: Catch all exceptions? 489 @param trace: Log a traceback of caught exception? 491 Note that if 'catch' is True, all exceptions are swallowed, but there may 492 be other side-effects such as undefined variables. 494 self.log.info(
"%s: Start %s" % (NODE, operation))
499 cls, e, _ = sys.exc_info()
500 self.log.warn(
"%s: Caught %s while %s: %s" % (NODE, cls.__name__, operation, e))
502 self.log.info(
"%s: Traceback:\n%s" % (NODE, traceback.format_exc()))
506 self.log.info(
"%s: Finished %s" % (NODE, operation))
510 """Starts a BatchCmdLineTask with an MPI process pool 512 Use this subclass of BatchCmdLineTask if you want to use the Pool directly. 517 """Run with a MPI process pool""" 519 super(BatchPoolTask, cls).
parseAndRun(*args, **kwargs)
524 """Run a Task individually on a list of inputs using the MPI process pool""" 529 Warn if the user specified multiprocessing. 531 TaskRunner.__init__(self, *args, **kwargs)
533 self.log.warn(
"Multiprocessing arguments (-j %d) ignored since using batch processing" %
538 """Run the task on all targets 540 Sole input is the result of parsing the command-line with the ArgumentParser. 542 Output is None if 'precall' failed; otherwise it is a list of calling ourself 543 on each element of the target list from the 'getTargetList' method. 547 import multiprocessing
548 self.prepareForMultiProcessing()
551 if self.precall(parsedCmd):
552 targetList = self.getTargetList(parsedCmd)
553 if len(targetList) > 0:
554 parsedCmd.log.info(
"Processing %d targets with a pool of %d processes..." %
555 (len(targetList), pool.size))
557 resultList = pool.map(self, targetList)
559 parsedCmd.log.warn(
"Not running the task because there is no data to process; " 560 "you may preview data using \"--show data\"")
def __call__(self, cache, args):
    """Run the Task on a single target

    The process pool's 'cache' argument is accepted but not passed on.

    'args' are those arguments provided by the getTargetList method; they
    are forwarded unchanged to TaskRunner.__call__.

    Brings down the entire job if an exception is not caught (i.e., --doraise).
    """
    result = TaskRunner.__call__(self, args)
    return result
579 """Runs the BatchCmdLineTask in parallel 581 Use this subclass of BatchCmdLineTask if you don't need to use the Pool 582 directly, but just want to iterate over many objects (like a multi-node 583 version of the '-j' command-line argument). 585 RunnerClass = BatchTaskRunner
def _makeArgumentParser(cls, *args, **kwargs):
    """Build an ArgumentParser

    The batch-specific keywords ("doBatch" and "add_help") are discarded
    before delegating to the parent classes.
    """
    for batchKeyword in ("doBatch", "add_help"):
        kwargs.pop(batchKeyword, False)
    return super(BatchCmdLineTask, cls)._makeArgumentParser(*args, **kwargs)
599 """Parse an argument list and run the command 601 This is the entry point when we run in earnest, so start the process pool 602 so that the worker nodes don't go any further. 605 results = super(BatchParallelTask, cls).
parseAndRun(*args, **kwargs)
def parseAndRun(cls, *args, **kwargs)
def __call__(self, cache, args)
def preamble(self, walltime=None)
def parseAndRun(cls, *args, **kwargs)
def batchCommand(cls, args)
Return command to run CmdLineTask.
def __init__(self, outputDir=None, numNodes=0, numProcsPerNode=0, numCores=0, queue=None, jobName=None, walltime=0.0, dryrun=False, doExec=False, mpiexec="", submit=None, options=None, verbose=False)
Constructor.
def __init__(self, parent=None, *args, **kwargs)
def preamble(self, walltime=None)
def submitCommand(self, scriptName)
def startPool(comm=None, root=0, killSlaves=True)
Start a process pool.
def parseAndSubmit(cls, args=None, **kwargs)
def createScript(self, command, walltime=None)
Create script to be submitted.
def __init__(self, *args, **kwargs)
def preamble(self, walltime=None)
def submitCommand(self, scriptName)
def parse_args(self, config=None, args=None, namespace=None, **kwargs)
def submitCommand(self, scriptName)
Return command to submit script.
def batchWallTime(cls, time, parsedCmd, numCores)
Return walltime request for batch job.
def preamble(self, command, walltime=None)
def submitCommand(self, scriptName)
def setBatchType(batchType)
def logOperation(self, operation, catch=False, trace=True)
Provide a context manager for logging an operation.
def __init__(self, *args, **kwargs)
def run(self, command, walltime=None)
Run the batch system.
def makeBatch(self, args)
def execution(self, command)
def shCommandFromArgs(args)