22 from __future__
import absolute_import, division
28 from builtins
import str
29 from builtins
import object
32 from lsst.base import disableImplicitThreading
33 import lsst.afw.table
as afwTable
34 from .task
import Task, TaskError
35 from .struct
import Struct
36 from .argumentParser
import ArgumentParser
40 __all__ = [
"CmdLineTask",
"TaskRunner",
"ButlerInitializedTaskRunner"]
def _runPool(pool, timeout, function, iterable):
    """Wrapper around ``pool.map_async``, to handle timeout.

    This is required so as to trigger an immediate interrupt on the
    KeyboardInterrupt (Ctrl-C); see
    http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool

    Further wraps the function in ``_poolFunctionWrapper`` to catch exceptions
    that don't inherit from `Exception`.
    """
    # Submit asynchronously, then block with an explicit timeout so that
    # Ctrl-C is delivered promptly instead of being swallowed by the pool.
    asyncResult = pool.map_async(function, iterable)
    return asyncResult.get(timeout)
55 @contextlib.contextmanager
57 """Context manager for profiling with cProfile. 63 Filename to which to write profile (profiling disabled if `None` or empty). 64 log : `lsst.log.Log`, optional 65 Log object for logging the profile operations. 67 If profiling is enabled, the context manager returns the cProfile.Profile object (otherwise 68 it returns None), which allows additional control over profiling. You can obtain this using 69 the "as" clause, e.g.: 71 with profile(filename) as prof: 74 The output cumulative profile can be printed with a command-line like:: 76 python -c 'import pstats; pstats.Stats("<filename>").sort_stats("cumtime").print_stats(30)' 82 from cProfile
import Profile
85 log.info(
"Enabling cProfile profiling")
89 profile.dump_stats(filename)
91 log.info(
"cProfile stats written to %s" % filename)
95 """Run a command-line task, using `multiprocessing` if requested. 99 TaskClass : `lsst.pipe.base.Task` subclass 100 The class of the task to run. 101 parsedCmd : `argparse.Namespace` 102 The parsed command-line arguments, as returned by the task's argument parser's 103 `~lsst.pipe.base.ArgumentParser.parse_args` method. 107 Do not store ``parsedCmd``, as this instance is pickled (if multiprocessing) and parsedCmd may 108 contain non-picklable elements. It certainly contains more data than we need to send to each 109 instance of the task. 110 doReturnResults : `bool`, optional 111 Should run return the collected result from each invocation of the task? This is only intended for 112 unit tests and similar use. It can easily exhaust memory (if the task returns enough data and you 113 call it enough times) and it will fail when using multiprocessing if the returned data cannot be 116 Note that even if ``doReturnResults`` is False a struct with a single member "exitStatus" is returned, 117 with value 0 or 1 to be returned to the unix shell. 122 If multiprocessing is requested (and the task supports it) but the multiprocessing library cannot be 127 Each command-line task (subclass of `lsst.pipe.base.CmdLineTask`) has a task runner. By default it is this 128 class, but some tasks require a subclass. See the manual :ref:`creating-a-command-line-task` for more 129 information. See `CmdLineTask.parseAndRun` to see how a task runner is used. 131 You may use this task runner for your command-line task if your task has a run method that takes exactly 132 one argument: a butler data reference. Otherwise you must provide a task-specific subclass of this runner 133 for your task's ``RunnerClass`` that overrides `TaskRunner.getTargetList` and possibly 134 `TaskRunner.__call__`. See `TaskRunner.getTargetList` for details. 136 This design matches the common pattern for command-line tasks: the run method takes a single data 137 reference, of some suitable name. 
Additional arguments are rare, and if present, require a subclass of 138 `TaskRunner` that calls these additional arguments by name. 140 Instances of this class must be picklable in order to be compatible with multiprocessing. If 141 multiprocessing is requested (``parsedCmd.numProcesses > 1``) then `run` calls `prepareForMultiProcessing` 142 to jettison optional non-picklable elements. If your task runner is not compatible with multiprocessing 143 then indicate this in your task by setting class variable ``canMultiprocess=False``. 145 Due to a `python bug`__, handling a `KeyboardInterrupt` properly `requires specifying a timeout`__. This 146 timeout (in sec) can be specified as the ``timeout`` element in the output from 147 `~lsst.pipe.base.ArgumentParser` (the ``parsedCmd``), if available, otherwise we use `TaskRunner.TIMEOUT`. 149 By default, we disable "implicit" threading -- ie, as provided by underlying numerical libraries such as 150 MKL or BLAS. This is designed to avoid thread contention both when a single command line task spawns 151 multiple processes and when multiple users are running on a shared system. Users can override this 152 behaviour by setting the ``LSST_ALLOW_IMPLICIT_THREADS`` environment variable. 154 .. __: http://bugs.python.org/issue8296 155 .. __: http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool 159 """Default timeout (seconds) for multiprocessing.""" 161 def __init__(self, TaskClass, parsedCmd, doReturnResults=False):
171 self.
timeout = getattr(parsedCmd,
'timeout',
None)
176 if not TaskClass.canMultiprocess:
177 self.
log.warn(
"This task does not support multiprocessing; using one process")
181 """Prepare this instance for multiprocessing 183 Optional non-picklable elements are removed. 185 This is only called if the task is run under multiprocessing. 190 """Run the task on all targets. 194 parsedCmd : `argparse.Namespace` 195 Parsed command `argparse.Namespace`. 200 A list of results returned by `TaskRunner.__call__`, or an empty list if `TaskRunner.__call__` 201 is not called (e.g. if `TaskRunner.precall` returns `False`). See `TaskRunner.__call__` 206 The task is run under multiprocessing if `TaskRunner.numProcesses` is more than 1; otherwise 207 processing is serial. 210 disableImplicitThreading()
212 import multiprocessing
214 pool = multiprocessing.Pool(processes=self.
numProcesses, maxtasksperchild=1)
215 mapFunc = functools.partial(_runPool, pool, self.
timeout)
221 profileName = parsedCmd.profile
if hasattr(parsedCmd,
"profile")
else None 224 if len(targetList) > 0:
225 with
profile(profileName, log):
227 resultList = list(mapFunc(self, targetList))
229 log.warn(
"Not running the task because there is no data to process; " 230 "you may preview data using \"--show data\"")
def getTargetList(parsedCmd, **kwargs):
    """Get a list of (dataRef, kwargs) pairs for `TaskRunner.__call__`.

    Parameters
    ----------
    parsedCmd : `argparse.Namespace`
        The parsed command object returned by
        `lsst.pipe.base.argumentParser.ArgumentParser.parse_args`.
    **kwargs
        Any additional keyword arguments. In the default `TaskRunner` this is
        an empty dict, but having it simplifies overriding `TaskRunner` for
        tasks whose run method takes additional arguments (see case (1)
        below).

    Returns
    -------
    `list` of `tuple`
        One ``(dataRef, kwargs)`` pair per data reference in
        ``parsedCmd.id.refList``; each element is sent to
        `TaskRunner.__call__`.

    Notes
    -----
    The default implementation of `TaskRunner.getTargetList` and
    `TaskRunner.__call__` works for any command-line task whose run method
    takes exactly one argument: a data reference. Otherwise you must provide
    a variant of TaskRunner that overrides `TaskRunner.getTargetList` and
    possibly `TaskRunner.__call__`. There are two cases.

    **Case 1**

    If your command-line task has a ``run`` method that takes one data
    reference followed by additional arguments, then you need only override
    `TaskRunner.getTargetList` to return the additional arguments as an
    argument dict. To make this easier, your overridden version of
    `~TaskRunner.getTargetList` may call `TaskRunner.getTargetList` with the
    extra arguments as keyword arguments. For example, the following adds an
    argument dict containing a single key: "calExpList", whose value is the
    list of data IDs for the calexp ID argument::

        def getTargetList(parsedCmd):
            return TaskRunner.getTargetList(
                parsedCmd,
                calExpList=parsedCmd.calexp.idList
            )

    It is equivalent to this slightly longer version::

        def getTargetList(parsedCmd):
            argDict = dict(calExpList=parsedCmd.calexp.idList)
            return [(dataId, argDict) for dataId in parsedCmd.id.idList]

    **Case 2**

    If your task does not meet condition (1) then you must override both
    TaskRunner.getTargetList and `TaskRunner.__call__`. You may do this
    however you see fit, so long as `TaskRunner.getTargetList` returns a
    list, each of whose elements is sent to `TaskRunner.__call__`, which
    runs your task.
    """
    # NOTE: every pair shares the same kwargs dict; callers must not mutate it.
    return [(ref, kwargs) for ref in parsedCmd.id.refList]
289 """Create a Task instance. 294 Parsed command-line options (used for extra task args by some task runners). 296 Args tuple passed to `TaskRunner.__call__` (used for extra task arguments by some task runners). 300 ``makeTask`` can be called with either the ``parsedCmd`` argument or ``args`` argument set to None, 301 but it must construct identical Task instances in either case. 303 Subclasses may ignore this method entirely if they reimplement both `TaskRunner.precall` and 304 `TaskRunner.__call__`. 308 def _precallImpl(self, task, parsedCmd):
309 """The main work of `precall`. 311 We write package versions, schemas and configs, or compare these to existing files on disk if present. 313 if not parsedCmd.noVersions:
314 task.writePackageVersions(parsedCmd.butler, clobber=parsedCmd.clobberVersions)
319 """Hook for code that should run exactly once, before multiprocessing. 323 Must return True if `TaskRunner.__call__` should subsequently be called. 327 Implementations must take care to ensure that no unpicklable 328 attributes are added to the TaskRunner itself, for compatibility 329 with multiprocessing. 331 The default implementation writes package versions, schemas and configs, or compares them to existing 332 files on disk if present. 334 task = self.
makeTask(parsedCmd=parsedCmd)
341 except Exception
as e:
342 task.log.fatal(
"Failed in task initialization: %s", e)
343 if not isinstance(e, TaskError):
344 traceback.print_exc(file=sys.stderr)
349 """Run the Task on a single target. 354 Arguments for Task.run() 358 struct : `lsst.pipe.base.Struct` 359 Contains these fields if ``doReturnResults`` is `True`: 361 - ``dataRef``: the provided data reference. 362 - ``metadata``: task metadata after execution of run. 363 - ``result``: result returned by task run, or `None` if the task fails. 364 - ``exitStatus``: 0 if the task completed successfully, 1 otherwise. 366 If ``doReturnResults`` is `False` the struct contains: 368 - ``exitStatus``: 0 if the task completed successfully, 1 otherwise. 372 This default implementation assumes that the ``args`` is a tuple 373 containing a data reference and a dict of keyword arguments. 377 If you override this method and wish to return something when ``doReturnResults`` is `False`, 378 then it must be picklable to support multiprocessing and it should be small enough that pickling 379 and unpickling do not add excessive overhead. 381 dataRef, kwargs = args
383 self.
log = Log.getDefaultLogger()
384 if hasattr(dataRef,
"dataId"):
385 self.
log.MDC(
"LABEL", str(dataRef.dataId))
386 elif isinstance(dataRef, (list, tuple)):
387 self.
log.MDC(
"LABEL", str([ref.dataId
for ref
in dataRef
if hasattr(ref,
"dataId")]))
392 result = task.run(dataRef, **kwargs)
395 result = task.run(dataRef, **kwargs)
396 except Exception
as e:
402 eName = type(e).__name__
403 if hasattr(dataRef,
"dataId"):
404 task.log.fatal(
"Failed on dataId=%s: %s: %s", dataRef.dataId, eName, e)
405 elif isinstance(dataRef, (list, tuple)):
406 task.log.fatal(
"Failed on dataIds=[%s]: %s: %s",
407 ", ".join(str(ref.dataId)
for ref
in dataRef), eName, e)
409 task.log.fatal(
"Failed on dataRef=%s: %s: %s", dataRef, eName, e)
411 if not isinstance(e, TaskError):
412 traceback.print_exc(file=sys.stderr)
418 task.writeMetadata(dataRef)
421 self.
log.MDCRemove(
"LABEL")
425 exitStatus=exitStatus,
427 metadata=task.metadata,
432 exitStatus=exitStatus,
437 """A `TaskRunner` for `CmdLineTask`\ s that require a ``butler`` keyword argument to be passed to 442 """A variant of the base version that passes a butler argument to the task's constructor. 446 parsedCmd : `argparse.Namespace` 447 Parsed command-line options, as returned by the `~lsst.pipe.base.ArgumentParser`; if specified 448 then args is ignored. 450 Other arguments; if ``parsedCmd`` is `None` then this must be specified. 455 Raised if ``parsedCmd`` and ``args`` are both `None`. 457 if parsedCmd
is not None:
458 butler = parsedCmd.butler
459 elif args
is not None:
460 dataRef, kwargs = args
461 butler = dataRef.butlerSubset.butler
463 raise RuntimeError(
"parsedCmd or args must be specified")
468 """Base class for command-line tasks: tasks that may be executed from the command-line. 472 See :ref:`task-framework-overview` to learn what tasks are and :ref:`creating-a-command-line-task` for 473 more information about writing command-line tasks. 475 Subclasses must specify the following class variables: 477 - ``ConfigClass``: configuration class for your task (a subclass of `lsst.pex.config.Config`, or if your 478 task needs no configuration, then `lsst.pex.config.Config` itself). 479 - ``_DefaultName``: default name used for this task (a str). 481 Subclasses may also specify the following class variables: 483 - ``RunnerClass``: a task runner class. The default is ``TaskRunner``, which works for any task 484 with a run method that takes exactly one argument: a data reference. If your task does 485 not meet this requirement then you must supply a variant of ``TaskRunner``; see ``TaskRunner`` 486 for more information. 487 - ``canMultiprocess``: the default is `True`; set `False` if your task does not support multiprocessing. 489 Subclasses must specify a method named ``run``: 491 - By default ``run`` accepts a single butler data reference, but you can specify an alternate task runner 492 (subclass of ``TaskRunner``) as the value of class variable ``RunnerClass`` if your run method needs 494 - ``run`` is expected to return its data in a `lsst.pipe.base.Struct`. This provides safety for evolution 495 of the task since new values may be added without harming existing code. 496 - The data returned by ``run`` must be picklable if your task is to support multiprocessing. 498 RunnerClass = TaskRunner
499 canMultiprocess =
True 503 """A hook to allow a task to change the values of its config *after* the camera-specific 504 overrides are loaded but before any command-line overrides are applied. 508 config : instance of task's ``ConfigClass`` 513 This is necessary in some cases because the camera-specific overrides may retarget subtasks, 514 wiping out changes made in ConfigClass.setDefaults. See LSST Trac ticket #2282 for more discussion. 518 This is called by CmdLineTask.parseAndRun; other ways of constructing a config will not apply 524 def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False):
525 """Parse an argument list and run the command. 529 args : `list`, optional 530 List of command-line arguments; if `None` use `sys.argv`. 531 config : `lsst.pex.config.Config`-type, optional 532 Config for task. If `None` use `Task.ConfigClass`. 533 log : `lsst.log.Log`-type, optional 534 Log. If `None` use the default log. 535 doReturnResults : `bool`, optional 536 If `True`, return the results of this task. Default is `False`. This is only intended for 537 unit tests and similar use. It can easily exhaust memory (if the task returns enough data and you 538 call it enough times) and it will fail when using multiprocessing if the returned data cannot be 543 struct : `lsst.pipe.base.Struct` 546 - ``argumentParser``: the argument parser. 547 - ``parsedCmd``: the parsed command returned by the argument parser's 548 `lsst.pipe.base.ArgumentParser.parse_args` method. 549 - ``taskRunner``: the task runner used to run the task (an instance of `Task.RunnerClass`). 550 - ``resultList``: results returned by the task runner's ``run`` method, one entry per invocation. 551 This will typically be a list of `None` unless ``doReturnResults`` is `True`; 552 see `Task.RunnerClass` (`TaskRunner` by default) for more information. 556 Calling this method with no arguments specified is the standard way to run a command-line task 557 from the command-line. For an example see ``pipe_tasks`` ``bin/makeSkyMap.py`` or almost any other 558 file in that directory. 560 If one or more of the dataIds fails then this routine will exit (with a status giving the 561 number of failed dataIds) rather than returning this struct; this behaviour can be 562 overridden by specifying the ``--noExit`` command-line option. 565 commandAsStr =
" ".join(sys.argv)
572 config = cls.ConfigClass()
573 parsedCmd = argumentParser.parse_args(config=config, args=args, log=log, override=cls.
applyOverrides)
575 parsedCmd.log.info(
"Running: %s", commandAsStr)
577 taskRunner = cls.
RunnerClass(TaskClass=cls, parsedCmd=parsedCmd, doReturnResults=doReturnResults)
578 resultList = taskRunner.run(parsedCmd)
581 nFailed = sum(((res.exitStatus != 0)
for res
in resultList))
582 except (TypeError, AttributeError)
as e:
584 parsedCmd.log.warn(
"Unable to retrieve exit status (%s); assuming success", e)
589 parsedCmd.log.error(
"%d dataRefs failed; not exiting as --noExit was set", nFailed)
594 argumentParser=argumentParser,
596 taskRunner=taskRunner,
597 resultList=resultList,
601 def _makeArgumentParser(cls):
602 """Create and return an argument parser. 606 parser : `lsst.pipe.base.ArgumentParser` 607 The argument parser for this task. 611 By default this returns an `~lsst.pipe.base.ArgumentParser` with one ID argument named `--id` of 612 dataset type ``raw``. 614 Your task subclass may need to override this method to change the dataset type or data ref level, 615 or to add additional data ID arguments. If you add additional data ID arguments or your task's 616 run method takes more than a single data reference then you will also have to provide a task-specific 617 task runner (see TaskRunner for more information). 620 parser.add_id_argument(name=
"--id", datasetType=
"raw",
621 help=
"data IDs, e.g. --id visit=12345 ccd=1,2^0,3")
625 """Write the configuration used for processing the data, or check that an existing 626 one is equal to the new one if present. 630 butler : `lsst.daf.persistence.Butler` 631 Data butler used to write the config. The config is written to dataset type 632 `CmdLineTask._getConfigName`. 633 clobber : `bool`, optional 634 A boolean flag that controls what happens if a config already has been saved: 635 - `True`: overwrite or rename the existing config, depending on ``doBackup``. 636 - `False`: raise `TaskError` if this config does not match the existing config. 637 doBackup : bool, optional 638 Set to `True` to backup the config files if clobbering. 641 if configName
is None:
644 butler.put(self.
config, configName, doBackup=doBackup)
645 elif butler.datasetExists(configName, write=
True):
648 oldConfig = butler.get(configName, immediate=
True)
649 except Exception
as exc:
650 raise type(exc)(
"Unable to read stored config file %s (%s); consider using --clobber-config" %
653 def logConfigMismatch(msg):
654 self.
log.fatal(
"Comparing configuration: %s", msg)
656 if not self.
config.compare(oldConfig, shortcut=
False, output=logConfigMismatch):
658 (
"Config does not match existing task config %r on disk; tasks configurations " +
659 "must be consistent within the same output repo (override with --clobber-config)") %
662 butler.put(self.
config, configName)
665 """Write the schemas returned by `lsst.pipe.base.Task.getAllSchemaCatalogs`. 669 butler : `lsst.daf.persistence.Butler` 670 Data butler used to write the schema. Each schema is written to the dataset type specified as the 671 key in the dict returned by `~lsst.pipe.base.Task.getAllSchemaCatalogs`. 672 clobber : `bool`, optional 673 A boolean flag that controls what happens if a schema already has been saved: 674 - `True`: overwrite or rename the existing schema, depending on ``doBackup``. 675 - `False`: raise `TaskError` if this schema does not match the existing schema. 676 doBackup : `bool`, optional 677 Set to `True` to backup the schema files if clobbering. 681 If ``clobber`` is `False` and an existing schema does not match a current schema, 682 then some schemas may have been saved successfully and others may not, and there is no easy way to 686 schemaDataset = dataset +
"_schema" 688 butler.put(catalog, schemaDataset, doBackup=doBackup)
689 elif butler.datasetExists(schemaDataset, write=
True):
690 oldSchema = butler.get(schemaDataset, immediate=
True).getSchema()
691 if not oldSchema.compare(catalog.getSchema(), afwTable.Schema.IDENTICAL):
693 (
"New schema does not match schema %r on disk; schemas must be " +
694 " consistent within the same output repo (override with --clobber-config)") %
697 butler.put(catalog, schemaDataset)
700 """Write the metadata produced from processing the data. 705 Butler data reference used to write the metadata. 706 The metadata is written to dataset type `CmdLineTask._getMetadataName`. 710 if metadataName
is not None:
712 except Exception
as e:
713 self.
log.warn(
"Could not persist metadata for dataId=%s: %s", dataRef.dataId, e)
716 """Compare and write package versions. 720 butler : `lsst.daf.persistence.Butler` 721 Data butler used to read/write the package versions. 722 clobber : `bool`, optional 723 A boolean flag that controls what happens if versions already have been saved: 724 - `True`: overwrite or rename the existing version info, depending on ``doBackup``. 725 - `False`: raise `TaskError` if this version info does not match the existing. 726 doBackup : `bool`, optional 727 If `True` and clobbering, old package version files are backed up. 728 dataset : `str`, optional 729 Name of dataset to read/write. 734 Raised if there is a version mismatch with current and persisted lists of package versions. 738 Note that this operation is subject to a race condition. 740 packages = Packages.fromSystem()
743 return butler.put(packages, dataset, doBackup=doBackup)
744 if not butler.datasetExists(dataset, write=
True):
745 return butler.put(packages, dataset)
748 old = butler.get(dataset, immediate=
True)
749 except Exception
as exc:
750 raise type(exc)(
"Unable to read stored version dataset %s (%s); " 751 "consider using --clobber-versions or --no-versions" %
756 diff = packages.difference(old)
759 "Version mismatch (" +
760 "; ".join(
"%s: %s vs %s" % (pkg, diff[pkg][1], diff[pkg][0])
for pkg
in diff) +
761 "); consider using --clobber-versions or --no-versions")
763 extra = packages.extra(old)
766 butler.put(old, dataset, doBackup=doBackup)
def _getConfigName(self):
    """Get the name of the config dataset type, or `None` if config is not
    to be persisted.

    Returns
    -------
    `str`
        Dataset type name for the persisted config.

    Notes
    -----
    The name may depend on the config; that is why this is not a class
    method.
    """
    return self._DefaultName + "_config"

def _getMetadataName(self):
    """Get the name of the metadata dataset type, or `None` if metadata is
    not to be persisted.

    Returns
    -------
    `str`
        Dataset type name for the persisted metadata.

    Notes
    -----
    The name may depend on the config; that is why this is not a class
    method.
    """
    return self._DefaultName + "_metadata"
def _makeArgumentParser(cls)
def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False)
def _precallImpl(self, task, parsedCmd)
def getFullMetadata(self)
def writePackageVersions(self, butler, clobber=False, doBackup=True, dataset="packages")
def getAllSchemaCatalogs(self)
def writeSchemas(self, butler, clobber=False, doBackup=True)
def prepareForMultiProcessing(self)
def _getMetadataName(self)
def makeTask(self, parsedCmd=None, args=None)
def writeMetadata(self, dataRef)
def precall(self, parsedCmd)
def __init__(self, TaskClass, parsedCmd, doReturnResults=False)
def profile(filename, log=None)
def makeTask(self, parsedCmd=None, args=None)
def getTargetList(parsedCmd, kwargs)
def writeConfig(self, butler, clobber=False, doBackup=True)
def applyOverrides(cls, config)