3 from __future__
import print_function
4 from builtins
import object
17 from .pool
import startPool, Pool, NODE, abortOnError, setBatchType
20 __all__ = [
"Batch",
"PbsBatch",
"SlurmBatch",
"SmpBatch",
"BATCH_TYPES",
"BatchArgumentParser",
21 "BatchCmdLineTask",
"BatchPoolTask", ]
# Zero-width lookahead matching the position in front of every character that
# is neither shell-safe (alphanumerics, '-', '_', '.', '/') nor a newline.
# Newlines are excluded here because shQuote treats them separately.
_quote_pos = re.compile('(?=[^-0-9a-zA-Z_./\n])')


def shQuote(arg):
    r"""Quote the argument for the shell.

    Every character outside the shell-safe set is prefixed with a backslash
    and each newline is wrapped in single quotes. An empty argument is
    returned as ``''`` so that it survives as a zero-length word instead of
    disappearing from the command line.

    >>> shQuote('foo bar')
    'foo\\ bar'
    >>> shQuote('')
    "''"

    @param arg: argument to quote (string)
    @return the quoted argument
    """
    if not arg:
        # Nothing to escape, but the shell still needs to see a word here.
        return "''"
    return _quote_pos.sub('\\\\', arg).replace('\n', "'\n'")
def shCommandFromArgs(args):
    """Convert a list of shell arguments to a shell command-line

    @param args: iterable of argument strings
    @return single string: each argument passed through shQuote, joined by spaces
    """
    return ' '.join(shQuote(a) for a in args)
def processStats():
    """Collect Linux-specific process statistics

    Parses the /proc/self/status file (N.B. Linux-specific!) into a dict.
    Each line is split at its first colon: the text before it is the key and
    the text after it, with surrounding whitespace stripped, is the value.

    @return dict mapping field name to value (both strings)
    """
    result = {}
    with open("/proc/self/status") as f:
        for line in f:
            key, _, value = line.partition(":")
            result[key] = value.strip()
    return result
def printProcessStats():
    """Print the process statistics to the log

    Writes one info-level message to the default logger containing NODE and
    the dict returned by processStats().
    """
    log = Log.getDefaultLogger()
    log.info("Process stats for %s: %s" % (NODE, processStats()))
72 """Base class for batch submission""" 74 def __init__(self, outputDir=None, numNodes=0, numProcsPerNode=0, numCores=0, queue=None, jobName=None,
75 walltime=0.0, dryrun=False, doExec=False, mpiexec="", submit=None, options=None,
79 @param outputDir: output directory, or None 80 @param numNodes: number of nodes 81 @param numProcsPerNode: number of processors per node 82 @param numCores: number of cores (Slurm, SMP only) 83 @param queue: name of queue, or None 84 @param jobName: name of job, or None 85 @param walltime: maximum wall clock time for job 86 @param dryrun: Dry run (only print actions that would be taken)? 87 @param doExec: exec the script instead of submitting to batch system? 88 @param mpiexec: options for mpiexec 89 @param submit: command-line options for batch submission (e.g., for qsub, sbatch) 90 @param options: options to append to script header (e.g., #PBS or #SBATCH) 91 @param verbose: produce verbose output? 93 if (numNodes <= 0
or numProcsPerNode <= 0)
and numCores <= 0:
94 raise RuntimeError(
"Must specify numNodes+numProcs or numCores")
114 """Return preamble string for script to be submitted 116 Most batch systems allow you to embed submission options as comments here. 118 raise NotImplementedError(
"Not implemented for base class")
121 """Return execution string for script to be submitted""" 123 "umask %03o" % UMASK,
124 "cd %s" % pipes.quote(os.getcwd()),
127 script += [
"echo \"mpiexec is at: $(which mpiexec)\"",
129 "echo 'umask: ' $(umask)",
134 script += [
"mpiexec %s %s" % (self.
mpiexec, command)]
139 return "\n".join(script)
142 """!Create script to be submitted 144 @param command: command to run 145 @param walltime: maximum wall clock time, overrides value to constructor 146 @return name of script on filesystem 148 fd, scriptName = tempfile.mkstemp()
149 with os.fdopen(fd,
"w")
as f:
157 os.chmod(scriptName, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
161 """!Return command to submit script 163 @param scriptName: name of script on filesystem 165 raise NotImplementedError(
"No implementation for base class")
167 def run(self, command, walltime=None):
168 """!Run the batch system 170 Creates and submits the script to execute the provided command 172 @param command: command to run 173 @param walltime: maximum wall clock time, overrides value to constructor 174 @return name of script on filesystem 176 scriptName = self.
createScript(command, walltime=walltime)
179 print(
"Would run: %s" % command)
181 os.execl(scriptName, scriptName)
188 """Batch submission with PBS""" 194 raise RuntimeError(
"Non-positive walltime: %s (did you forget '--time'?)" % (walltime,))
197 "Number of nodes (--nodes=%d) or number of processors per node (--procs=%d) not set" %
200 raise RuntimeError(
"PBS does not support setting the number of cores")
204 "#PBS -l walltime=%d" % walltime
if walltime
is not None else "",
207 "#PBS -q %s" % self.
queue if self.
queue is not None else "",
209 "#PBS -W umask=%03o" % UMASK,
213 return "qsub %s -V %s" % (self.
submit if self.
submit is not None else "", scriptName)
217 """Batch submission with Slurm""" 221 """Format walltime (in seconds) as days-hours:minutes""" 225 days = walltime//secInDay
226 walltime -= days*secInDay
227 hours = walltime//secInHour
228 walltime -= hours*secInHour
229 minutes = walltime//secInMinute
230 walltime -= minutes*secInMinute
233 return "%d-%d:%d" % (days, hours, minutes)
239 raise RuntimeError(
"Non-positive walltime: %s (did you forget '--time'?)" % (walltime,))
242 "Number of nodes (--nodes=%d) and number of processors per node (--procs=%d) not set OR " 245 raise RuntimeError(
"Must set either --nodes,--procs or --cores: not both")
248 filename = os.path.join(outputDir, (self.
jobName if self.
jobName is not None else "slurm") +
".o%j")
249 return "\n".join([(
"#SBATCH --nodes=%d" % self.
numNodes)
if self.
numNodes > 0
else "",
254 "#SBATCH --job-name=%s" % self.
jobName if self.
jobName is not None else "",
255 "#SBATCH -p %s" % self.
queue if self.
queue is not None else "",
256 "#SBATCH --output=%s" % filename,
257 "#SBATCH --error=%s" % filename,
262 return "sbatch %s %s" % (self.
submit if self.
submit is not None else "", scriptName)
266 """Not-really-Batch submission with multiple cores on the current node 268 The job is run immediately. 272 super(SmpBatch, self).
__init__(*args, **kwargs)
279 raise RuntimeError(
"SMP does not support the --nodes and --procs command-line options; " 280 "use --cores to specify the number of cores to use")
290 return "exec %s" % scriptName
293 BATCH_TYPES = {
'none' :
None,
302 """An argument parser to get relevant parameters for batch submission 304 We want to be able to display the help for a 'parent' ArgumentParser 305 along with the batch-specific options we introduce in this class, but 306 we don't want to swallow the parent (i.e., ArgumentParser(parents=[parent])) 307 because we want to save the list of arguments that this particular 308 BatchArgumentParser doesn't parse, so they can be passed on to a different 309 program (though we also want to parse them to check that they can be parsed). 313 super(BatchArgumentParser, self).
__init__(*args, **kwargs)
315 group = self.add_argument_group(
"Batch submission options")
316 group.add_argument(
"--queue", help=
"Queue name")
317 group.add_argument(
"--job", help=
"Job name")
318 group.add_argument(
"--nodes", type=int, default=0, help=
"Number of nodes")
319 group.add_argument(
"--procs", type=int, default=0, help=
"Number of processors per node")
320 group.add_argument(
"--cores", type=int, default=0, help=
"Number of cores (Slurm/SMP only)")
321 group.add_argument(
"--time", type=float, default=0,
322 help=
"Expected execution time per element (sec)")
323 group.add_argument(
"--batch-type", dest=
"batchType", choices=list(BATCH_TYPES.keys()), default=
"smp",
324 help=
"Batch system to use")
325 group.add_argument(
"--batch-verbose", dest=
"batchVerbose", action=
"store_true", default=
False,
326 help=(
"Enable verbose output in batch script " 327 "(including system environment information at batch start)?"))
328 group.add_argument(
"--batch-output", dest=
"batchOutput", help=
"Output directory")
329 group.add_argument(
"--batch-submit", dest=
"batchSubmit", help=
"Batch submission command-line flags")
330 group.add_argument(
"--batch-options", dest=
"batchOptions", help=
"Header options for batch script")
331 group.add_argument(
"--batch-profile", dest=
"batchProfile", action=
"store_true", default=
False,
332 help=
"Enable profiling on batch job?")
333 group.add_argument(
"--batch-stats", dest=
"batchStats", action=
"store_true", default=
False,
334 help=
"Print process stats on completion (Linux only)?")
335 group.add_argument(
"--dry-run", dest=
"dryrun", default=
False, action=
"store_true",
337 group.add_argument(
"--do-exec", dest=
"doExec", default=
False, action=
"store_true",
338 help=
"Exec script instead of submit to batch system?")
339 group.add_argument(
"--mpiexec", default=
"", help=
"mpiexec options")
341 def parse_args(self, config=None, args=None, namespace=None, **kwargs):
342 args, leftover = super(BatchArgumentParser, self).parse_known_args(args=args, namespace=namespace)
345 if len(leftover) > 0:
348 self.error(
"Unrecognised arguments: %s" % leftover)
350 args.leftover = leftover
355 """Create a Batch object from the command-line arguments""" 357 argMapping = {
'outputDir':
'batchOutput',
359 'numProcsPerNode':
'procs',
366 'mpiexec':
'mpiexec',
367 'submit':
'batchSubmit',
368 'options':
'batchOptions',
369 'verbose':
'batchVerbose',
372 if BATCH_TYPES[args.batchType]
is None:
376 kwargs = {k: getattr(args, v)
for k, v
in argMapping.items()}
377 return BATCH_TYPES[args.batchType](**kwargs)
380 text =
"""This is a script for queue submission of a wrapped script. 382 Use this program name and ignore that for the wrapped script (it will be 383 passed on to the batch system). Arguments for *both* this wrapper script or the 384 wrapped script are valid (if it is required for the wrapped script, it 385 is required for the wrapper as well). 387 *** Batch system submission wrapper: 390 text += super(BatchArgumentParser, self).
format_help()
411 """Generate bash script to regenerate the current environment""" 413 for key, val
in os.environ.items():
414 if key
in (
"DISPLAY",):
416 if val.startswith(
"() {"):
423 if key.startswith(
"BASH_FUNC_")
and key.endswith(
"()"):
426 output +=
"{key} {val}\nexport -f {key}\n".format(key=key, val=val)
429 output +=
"export {key}='{val}'\n".format(key=key, val=val.replace(
"'",
"'\"'\"'"))
437 taskParser = cls._makeArgumentParser(doBatch=
True, add_help=
False)
439 batchArgs = batchParser.parse_args(config=cls.ConfigClass(), args=args, override=cls.applyOverrides,
442 if not cls.RunnerClass(cls, batchArgs.parent).precall(batchArgs.parent):
443 taskParser.error(
"Error in task preparation")
447 if batchArgs.batch
is None:
448 sys.argv = [sys.argv[0]] + batchArgs.leftover
450 return cls.parseAndRun()
452 numCores = batchArgs.cores
if batchArgs.cores > 0
else batchArgs.nodes*batchArgs.procs
453 walltime = cls.
batchWallTime(batchArgs.time, batchArgs.parent, numCores)
456 batchArgs.batch.run(command, walltime=walltime)
@classmethod
def batchWallTime(cls, time, parsedCmd, numCores):
    """!Return walltime request for batch job

    Subclasses should override if the walltime should be calculated
    differently (e.g., addition of some serial time).

    @param cls: Class
    @param time: Requested time per iteration
    @param parsedCmd: Results of argument parsing
    @param numCores: Number of cores
    @return time multiplied by the number of targets reported by
        cls.RunnerClass.getTargetList(parsedCmd), divided by numCores
    """
    numTargets = len(cls.RunnerClass.getTargetList(parsedCmd))
    # float() keeps the division a true division even for integer inputs.
    return time*numTargets/float(numCores)
475 """!Return command to run CmdLineTask 478 @param args: Parsed batch job arguments (from BatchArgumentParser) 480 job = args.job
if args.job
is not None else "job" 481 module = cls.__module__
482 script = (
"import os; os.umask(%#05o); " +
483 "import lsst.base; lsst.base.disableImplicitThreading(); " +
484 "import lsst.ctrl.pool.log; lsst.ctrl.pool.log.jobLog(\"%s\"); ") % (UMASK, job)
487 script += (
"import lsst.ctrl.pool.parallel; import atexit; " +
488 "atexit.register(lsst.ctrl.pool.parallel.printProcessStats); ")
490 script +=
"import %s; %s.%s.parseAndRun();" % (module, module, cls.__name__)
492 profilePre =
"import cProfile; import os; cProfile.run(\"\"\"" 493 profilePost =
"\"\"\", filename=\"profile-" + job +
"-%s-%d.dat\" % (os.uname()[1], os.getpid()))" 495 return (
"python -c '" + (profilePre
if args.batchProfile
else "") + script +
496 (profilePost
if args.batchProfile
else "") +
"' " +
shCommandFromArgs(args.leftover) +
499 @contextlib.contextmanager
501 """!Provide a context manager for logging an operation 503 @param operation: description of operation (string) 504 @param catch: Catch all exceptions? 505 @param trace: Log a traceback of caught exception? 507 Note that if 'catch' is True, all exceptions are swallowed, but there may 508 be other side-effects such as undefined variables. 510 self.log.info(
"%s: Start %s" % (NODE, operation))
515 cls, e, _ = sys.exc_info()
516 self.log.warn(
"%s: Caught %s while %s: %s" % (NODE, cls.__name__, operation, e))
518 self.log.info(
"%s: Traceback:\n%s" % (NODE, traceback.format_exc()))
522 self.log.info(
"%s: Finished %s" % (NODE, operation))
526 """Starts a BatchCmdLineTask with an MPI process pool 528 Use this subclass of BatchCmdLineTask if you want to use the Pool directly. 533 """Run with a MPI process pool""" 535 super(BatchPoolTask, cls).
parseAndRun(*args, **kwargs)
540 """Run a Task individually on a list of inputs using the MPI process pool""" 545 Warn if the user specified multiprocessing. 547 TaskRunner.__init__(self, *args, **kwargs)
549 self.log.warn(
"Multiprocessing arguments (-j %d) ignored since using batch processing" %
554 """Run the task on all targets 556 Sole input is the result of parsing the command-line with the ArgumentParser. 558 Output is None if 'precall' failed; otherwise it is a list of calling ourself 559 on each element of the target list from the 'getTargetList' method. 563 import multiprocessing
564 self.prepareForMultiProcessing()
567 if self.precall(parsedCmd):
568 targetList = self.getTargetList(parsedCmd)
569 if len(targetList) > 0:
570 parsedCmd.log.info(
"Processing %d targets with a pool of %d processes..." %
571 (len(targetList), pool.size))
573 resultList = pool.map(self, targetList)
575 parsedCmd.log.warn(
"Not running the task because there is no data to process; " 576 "you may preview data using \"--show data\"")
583 """Run the Task on a single target 585 Strips out the process pool 'cache' argument. 587 'args' are those arguments provided by the getTargetList method. 589 Brings down the entire job if an exception is not caught (i.e., --doraise). 591 return TaskRunner.__call__(self, args)
595 """Runs the BatchCmdLineTask in parallel 597 Use this subclass of BatchCmdLineTask if you don't need to use the Pool 598 directly, but just want to iterate over many objects (like a multi-node 599 version of the '-j' command-line argument). 601 RunnerClass = BatchTaskRunner
604 def _makeArgumentParser(cls, *args, **kwargs):
605 """Build an ArgumentParser 607 Removes the batch-specific parts in order to delegate to the parent classes. 609 kwargs.pop(
"doBatch",
False)
610 kwargs.pop(
"add_help",
False)
611 return super(BatchCmdLineTask, cls)._makeArgumentParser(*args, **kwargs)
615 """Parse an argument list and run the command 617 This is the entry point when we run in earnest, so start the process pool 618 so that the worker nodes don't go any further. 621 results = super(BatchParallelTask, cls).
parseAndRun(*args, **kwargs)
def parseAndRun(cls, args, kwargs)
def __call__(self, cache, args)
def preamble(self, walltime=None)
def parseAndRun(cls, args, kwargs)
def formatWalltime(walltime)
def batchCommand(cls, args)
Return command to run CmdLineTask.
def __init__(self, outputDir=None, numNodes=0, numProcsPerNode=0, numCores=0, queue=None, jobName=None, walltime=0.0, dryrun=False, doExec=False, mpiexec="", submit=None, options=None, verbose=False)
Constructor.
def __init__(self, parent=None, args, kwargs)
def preamble(self, walltime=None)
def submitCommand(self, scriptName)
def startPool(comm=None, root=0, killSlaves=True)
Start a process pool.
def parseAndSubmit(cls, args=None, kwargs)
def createScript(self, command, walltime=None)
Create script to be submitted.
def __init__(self, args, kwargs)
def preamble(self, walltime=None)
def submitCommand(self, scriptName)
def parse_args(self, config=None, args=None, namespace=None, kwargs)
def submitCommand(self, scriptName)
Return command to submit script.
def batchWallTime(cls, time, parsedCmd, numCores)
Return walltime request for batch job.
def preamble(self, command, walltime=None)
def submitCommand(self, scriptName)
def setBatchType(batchType)
def logOperation(self, operation, catch=False, trace=True)
Provide a context manager for logging an operation.
def __init__(self, args, kwargs)
def run(self, command, walltime=None)
Run the batch system.
def makeBatch(self, args)
def execution(self, command)
def shCommandFromArgs(args)