from __future__ import absolute_import, division
from builtins import str
from builtins import object

import sys
import traceback
import functools
import contextlib

from lsst.base import disableImplicitThreading, Packages
from lsst.log import Log
import lsst.afw.table as afwTable

from .task import Task, TaskError
from .struct import Struct
from .argumentParser import ArgumentParser

__all__ = ["CmdLineTask", "TaskRunner", "ButlerInitializedTaskRunner"]
def _poolFunctionWrapper(function, arg):
    """Wrapper around ``function`` to catch exceptions that don't inherit from `Exception`.

    Such exceptions aren't caught by multiprocessing, which causes the slave
    process to crash and you end up hitting the timeout.

    Parameters
    ----------
    function : callable
        The function to run in the slave process.
    arg : object
        The single argument to pass to ``function``.
    """
    try:
        return function(arg)
    except Exception:
        raise  # No worries: multiprocessing can propagate these
    except:  # noqa: E722 -- deliberately catch everything multiprocessing cannot handle
        # Wrap the exception in a plain Exception that multiprocessing will recognise
        cls, exc, tb = sys.exc_info()
        log = Log.getDefaultLogger()
        log.warn("Unhandled exception %s (%s):\n%s" % (cls.__name__, exc, traceback.format_exc()))
        raise Exception("Unhandled exception: %s (%s)" % (cls.__name__, exc))
def _runPool(pool, timeout, function, iterable):
    """Wrapper around ``pool.map_async``, to handle timeout.

    This is required so as to trigger an immediate interrupt on the
    KeyboardInterrupt (Ctrl-C); see
    http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool

    Further wraps ``function`` in `_poolFunctionWrapper` to catch exceptions
    that don't inherit from `Exception`.
    """
    # map_async(...).get(timeout) instead of map(...) so Ctrl-C is delivered promptly
    return pool.map_async(functools.partial(_poolFunctionWrapper, function), iterable).get(timeout)
@contextlib.contextmanager
def profile(filename, log=None):
    """Context manager for profiling with cProfile.

    Parameters
    ----------
    filename : `str`
        Filename to which to write profile (profiling disabled if `None` or empty).
    log : `lsst.log.Log`, optional
        Log object for logging the profile operations.

    Notes
    -----
    If profiling is enabled, the context manager returns the `cProfile.Profile`
    object (otherwise it returns `None`), which allows additional control over
    profiling.  You can obtain this using the "as" clause, e.g.::

        with profile(filename) as prof:
            runYourCodeHere()

    The output cumulative profile can be printed with a command-line like::

        python -c 'import pstats; pstats.Stats("<filename>").sort_stats("cumtime").print_stats(30)'
    """
    if not filename:
        # Nothing to do: act as a transparent pass-through context manager
        yield
        return
    from cProfile import Profile
    profile = Profile()
    if log is not None:
        log.info("Enabling cProfile profiling")
    profile.enable()
    yield profile
    profile.disable()
    profile.dump_stats(filename)
    if log is not None:
        log.info("cProfile stats written to %s" % filename)
# Default timeout (seconds) for multiprocessing.
# NOTE(review): the constant's value is not visible in the damaged source;
# 9600 matches upstream lsst.pipe.base -- confirm against version control.
TIMEOUT = 9600

def __init__(self, TaskClass, parsedCmd, doReturnResults=False):
    """Construct a TaskRunner.

    Parameters
    ----------
    TaskClass : `lsst.pipe.base.Task` subclass
        The class of the task to run.
    parsedCmd : `argparse.Namespace`
        The parsed command-line arguments, as returned by the task's argument
        parser's `~lsst.pipe.base.ArgumentParser.parse_args` method.
    doReturnResults : `bool`, optional
        Should run return the collected result from each invocation of the task?
        This is only intended for unit tests and similar use. It can easily
        exhaust memory (if the task returns enough data and you call it enough
        times) and it will fail when using multiprocessing if the returned data
        cannot be pickled.

    Notes
    -----
    Do not store ``parsedCmd``, as this instance is pickled (if multiprocessing)
    and parsedCmd may contain non-picklable elements. It certainly contains more
    data than we need to send to each instance of the task.
    """
    self.TaskClass = TaskClass
    self.doReturnResults = bool(doReturnResults)
    # NOTE(review): the attribute list below is reconstructed from usage in the
    # other (damaged) methods of this file -- doRaise is read by __call__,
    # clobberConfig/doBackup by _precallImpl, numProcesses/timeout by run.
    # Confirm the parsedCmd attribute names against the ArgumentParser.
    self.config = parsedCmd.config
    self.log = parsedCmd.log
    self.doRaise = bool(parsedCmd.doraise)
    self.clobberConfig = bool(parsedCmd.clobberConfig)
    self.doBackup = not bool(parsedCmd.noBackupConfig)
    self.numProcesses = int(getattr(parsedCmd, 'processes', 1))

    # Fall back to the class-level default when no (positive) timeout was given.
    self.timeout = getattr(parsedCmd, 'timeout', None)
    if self.timeout is None or self.timeout <= 0:
        self.timeout = self.TIMEOUT

    if self.numProcesses > 1:
        if not TaskClass.canMultiprocess:
            self.log.warn("This task does not support multiprocessing; using one process")
            self.numProcesses = 1
def prepareForMultiProcessing(self):
    """Prepare this instance for multiprocessing.

    Optional non-picklable elements are removed.

    This is only called if the task is run under multiprocessing.
    """
    # The log is not picklable; __call__ re-acquires a default logger in the
    # slave process when self.log is None.
    self.log = None

def run(self, parsedCmd):
    """Run the task on all targets.

    Parameters
    ----------
    parsedCmd : `argparse.Namespace`
        Parsed command `argparse.Namespace`.

    Returns
    -------
    resultList : `list`
        A list of results returned by `TaskRunner.__call__`, or an empty list
        if `TaskRunner.__call__` is not called (e.g. if `TaskRunner.precall`
        returns `False`).

    Notes
    -----
    The task is run under multiprocessing if `TaskRunner.numProcesses` is more
    than 1; otherwise processing is serial.
    """
    resultList = []
    # Disable "implicit" threading from numerical libraries (MKL/BLAS) to avoid
    # thread contention with our own processes; users can override with the
    # LSST_ALLOW_IMPLICIT_THREADS environment variable.
    disableImplicitThreading()
    if self.numProcesses > 1:
        import multiprocessing
        self.prepareForMultiProcessing()
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=1)
        mapFunc = functools.partial(_runPool, pool, self.timeout)
    else:
        pool = None
        mapFunc = map

    log = parsedCmd.log
    targetList = self.getTargetList(parsedCmd)
    profileName = parsedCmd.profile if hasattr(parsedCmd, "profile") else None
    if len(targetList) > 0:
        with profile(profileName, log):
            # Run the task using self.__call__
            resultList = list(mapFunc(self, targetList))
    else:
        log.warn("Not running the task because there is no data to process; "
                 "you may preview data using \"--show data\"")

    if pool is not None:
        pool.close()
        pool.join()

    return resultList
# NOTE(review): a @staticmethod of TaskRunner in the intact file (no ``self``);
# the decorator is omitted here because the class statement is lost in this
# damaged source -- restore it when re-assembling the class.
def getTargetList(parsedCmd, **kwargs):
    """Get a list of (dataRef, kwargs) for `TaskRunner.__call__`.

    Parameters
    ----------
    parsedCmd : `argparse.Namespace`
        The parsed command object returned by
        `lsst.pipe.base.argumentParser.ArgumentParser.parse_args`.
    kwargs
        Any additional keyword arguments. In the default `TaskRunner` this is an
        empty dict, but having it simplifies overriding `TaskRunner` for tasks
        whose run method takes additional arguments.

    Notes
    -----
    The default implementation works for any command-line task whose run method
    takes exactly one argument: a data reference.  If your task's ``run`` takes
    one data reference followed by additional arguments, override this method to
    supply them, e.g.::

        @staticmethod
        def getTargetList(parsedCmd):
            return TaskRunner.getTargetList(parsedCmd,
                                            calExpList=parsedCmd.calexp.idList)

    which is equivalent to::

        @staticmethod
        def getTargetList(parsedCmd):
            argDict = dict(calExpList=parsedCmd.calexp.idList)
            return [(dataId, argDict) for dataId in parsedCmd.id.idList]

    If your task needs more than that, override both this method and
    `TaskRunner.__call__`, ensuring this method returns a list each of whose
    elements is sent to `TaskRunner.__call__`.
    """
    return [(ref, kwargs) for ref in parsedCmd.id.refList]
def makeTask(self, parsedCmd=None, args=None):
    """Create a Task instance.

    Parameters
    ----------
    parsedCmd : `argparse.Namespace`, optional
        Parsed command-line options (used for extra task args by some task runners).
    args : `tuple`, optional
        Args tuple passed to `TaskRunner.__call__` (used for extra task
        arguments by some task runners).

    Notes
    -----
    ``makeTask`` can be called with either the ``parsedCmd`` argument or
    ``args`` argument set to `None`, but it must construct identical Task
    instances in either case.

    Subclasses may ignore this method entirely if they reimplement both
    `TaskRunner.precall` and `TaskRunner.__call__`.
    """
    # Neither parsedCmd nor args is needed here: config and log were captured
    # from parsedCmd at construction time.
    return self.TaskClass(config=self.config, log=self.log)
def _precallImpl(self, task, parsedCmd):
    """The main work of `precall`.

    We write package versions, schemas and configs, or compare these to
    existing files on disk if present.

    Parameters
    ----------
    task : `lsst.pipe.base.Task`
        The task instance created by `makeTask`.
    parsedCmd : `argparse.Namespace`
        Parsed command-line options.
    """
    if not parsedCmd.noVersions:
        task.writePackageVersions(parsedCmd.butler, clobber=parsedCmd.clobberVersions)
    # NOTE(review): the config/schema writes below are reconstructed from this
    # method's docstring ("we write package versions, schemas and configs");
    # the exact lines are lost in the damaged source -- confirm.
    task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)
    task.writeSchemas(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)
def precall(self, parsedCmd):
    """Hook for code that should run exactly once, before multiprocessing.

    Returns
    -------
    ok : `bool`
        Must return `True` if `TaskRunner.__call__` should subsequently be
        called; `False` otherwise.

    Notes
    -----
    Implementations must take care to ensure that no unpicklable attributes are
    added to the TaskRunner itself, for compatibility with multiprocessing.

    The default implementation writes package versions, schemas and configs, or
    compares them to existing files on disk if present.
    """
    task = self.makeTask(parsedCmd=parsedCmd)

    if self.doRaise:
        # Let any failure propagate to the caller
        self._precallImpl(task, parsedCmd)
    else:
        try:
            self._precallImpl(task, parsedCmd)
        except Exception as e:
            task.log.fatal("Failed in task initialization: %s", e)
            if not isinstance(e, TaskError):
                traceback.print_exc(file=sys.stderr)
            return False
    return True
def __call__(self, args):
    """Run the Task on a single target.

    Parameters
    ----------
    args : `tuple`
        Arguments for Task.run().

    Returns
    -------
    struct : `lsst.pipe.base.Struct`
        Contains these fields if ``doReturnResults`` is `True`:

        - ``dataRef``: the provided data reference.
        - ``metadata``: task metadata after execution of run.
        - ``result``: result returned by task run, or `None` if the task fails.
        - ``exitStatus``: 0 if the task completed successfully, 1 otherwise.

        If ``doReturnResults`` is `False` the struct contains:

        - ``exitStatus``: 0 if the task completed successfully, 1 otherwise.

    Notes
    -----
    This default implementation assumes that ``args`` is a tuple containing a
    data reference and a dict of keyword arguments.

    .. warning::

        If you override this method and wish to return something when
        ``doReturnResults`` is `False`, then it must be picklable to support
        multiprocessing and it should be small enough that pickling and
        unpickling do not add excessive overhead.
    """
    dataRef, kwargs = args
    if self.log is None:
        # self.log was jettisoned by prepareForMultiProcessing; re-acquire it
        self.log = Log.getDefaultLogger()
    # Tag the log (MDC) with the data being processed, for easier debugging
    if hasattr(dataRef, "dataId"):
        self.log.MDC("LABEL", str(dataRef.dataId))
    elif isinstance(dataRef, (list, tuple)):
        self.log.MDC("LABEL", str([ref.dataId for ref in dataRef if hasattr(ref, "dataId")]))
    task = self.makeTask(args=args)
    result = None  # in case the task fails
    exitStatus = 0  # exit status for the shell
    if self.doRaise:
        result = task.run(dataRef, **kwargs)
    else:
        try:
            result = task.run(dataRef, **kwargs)
        except Exception as e:
            # The shell exit value will be the number of dataRefs returning
            # non-zero, so we mark failure with 1 here.
            exitStatus = 1
            eName = type(e).__name__
            if hasattr(dataRef, "dataId"):
                task.log.fatal("Failed on dataId=%s: %s: %s", dataRef.dataId, eName, e)
            elif isinstance(dataRef, (list, tuple)):
                task.log.fatal("Failed on dataIds=[%s]: %s: %s",
                               ", ".join(str(ref.dataId) for ref in dataRef), eName, e)
            else:
                task.log.fatal("Failed on dataRef=%s: %s: %s", dataRef, eName, e)
            if not isinstance(e, TaskError):
                traceback.print_exc(file=sys.stderr)

    task.writeMetadata(dataRef)

    # Remove the log MDC so the label does not leak into subsequent targets
    self.log.MDCRemove("LABEL")

    if self.doReturnResults:
        return Struct(
            exitStatus=exitStatus,
            dataRef=dataRef,
            metadata=task.metadata,
            result=result,
        )
    else:
        return Struct(
            exitStatus=exitStatus,
        )
def makeTask(self, parsedCmd=None, args=None):
    """A variant of the base version that passes a butler argument to the task's constructor.

    Parameters
    ----------
    parsedCmd : `argparse.Namespace`
        Parsed command-line options, as returned by the
        `~lsst.pipe.base.ArgumentParser`; if specified then args is ignored.
    args
        Other arguments; if ``parsedCmd`` is `None` then this must be specified.

    Raises
    ------
    RuntimeError
        Raised if ``parsedCmd`` and ``args`` are both `None`.
    """
    if parsedCmd is not None:
        butler = parsedCmd.butler
    elif args is not None:
        # args is the (dataRef, kwargs) tuple passed to __call__
        dataRef, kwargs = args
        butler = dataRef.butlerSubset.butler
    else:
        raise RuntimeError("parsedCmd or args must be specified")
    return self.TaskClass(config=self.config, log=self.log, butler=butler)
# Task runner used by parseAndRun; override with a TaskRunner subclass if your
# task's run method takes anything other than a single data reference.
RunnerClass = TaskRunner
# Set to False if your task does not support multiprocessing.
canMultiprocess = True

@classmethod
def applyOverrides(cls, config):
    """A hook to allow a task to change the values of its config *after* the
    camera-specific overrides are loaded but before any command-line overrides
    are applied.

    Parameters
    ----------
    config : instance of task's ``ConfigClass``
        Task configuration.

    Notes
    -----
    This is necessary in some cases because the camera-specific overrides may
    retarget subtasks, wiping out changes made in ConfigClass.setDefaults.
    See LSST Trac ticket #2282 for more discussion.

    .. warning::

        This is called by CmdLineTask.parseAndRun; other ways of constructing a
        config will not apply these overrides.
    """
    # Intentionally a no-op in the base class.
    pass
@classmethod
def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False):
    """Parse an argument list and run the command.

    Parameters
    ----------
    args : `list`, optional
        List of command-line arguments; if `None` use `sys.argv`.
    config : `lsst.pex.config.Config`-type, optional
        Config for task. If `None` use `Task.ConfigClass`.
    log : `lsst.log.Log`-type, optional
        Log. If `None` use the default log.
    doReturnResults : `bool`, optional
        If `True`, return the results of this task. Default is `False`. This is
        only intended for unit tests and similar use. It can easily exhaust
        memory (if the task returns enough data and you call it enough times)
        and it will fail when using multiprocessing if the returned data cannot
        be pickled.

    Returns
    -------
    struct : `lsst.pipe.base.Struct`
        Contains these fields:

        - ``argumentParser``: the argument parser.
        - ``parsedCmd``: the parsed command returned by the argument parser's
          `lsst.pipe.base.ArgumentParser.parse_args` method.
        - ``taskRunner``: the task runner used to run the task (an instance of
          `Task.RunnerClass`).
        - ``resultList``: results returned by the task runner's ``run`` method,
          one entry per invocation.  This will typically be a list of `None`
          unless ``doReturnResults`` is `True`; see `Task.RunnerClass`
          (`TaskRunner` by default) for more information.

    Notes
    -----
    Calling this method with no arguments specified is the standard way to run
    a command-line task from the command-line.

    If one or more of the dataIds fails then this routine will exit (with a
    status giving the number of failed dataIds) rather than returning this
    struct; this behaviour can be overridden by specifying the ``--noExit``
    command-line option.
    """
    commandAsStr = " ".join(sys.argv)
    argumentParser = cls._makeArgumentParser()
    if config is None:
        config = cls.ConfigClass()
    parsedCmd = argumentParser.parse_args(config=config, args=args, log=log, override=cls.applyOverrides)
    # print this message after parsing the command so the log is fully configured
    parsedCmd.log.info("Running: %s", commandAsStr)

    taskRunner = cls.RunnerClass(TaskClass=cls, parsedCmd=parsedCmd, doReturnResults=doReturnResults)
    resultList = taskRunner.run(parsedCmd)

    try:
        nFailed = sum(((res.exitStatus != 0) for res in resultList))
    except (TypeError, AttributeError) as e:
        # TypeError if resultList is None; AttributeError if it lacks exitStatus
        parsedCmd.log.warn("Unable to retrieve exit status (%s); assuming success", e)
        nFailed = 0

    if nFailed > 0:
        if parsedCmd.noExit:
            parsedCmd.log.error("%d dataRefs failed; not exiting as --noExit was set", nFailed)
        else:
            sys.exit(nFailed)

    return Struct(
        argumentParser=argumentParser,
        parsedCmd=parsedCmd,
        taskRunner=taskRunner,
        resultList=resultList,
    )
@classmethod
def _makeArgumentParser(cls):
    """Create and return an argument parser.

    Returns
    -------
    parser : `lsst.pipe.base.ArgumentParser`
        The argument parser for this task.

    Notes
    -----
    By default this returns an `~lsst.pipe.base.ArgumentParser` with one ID
    argument named ``--id`` of dataset type ``raw``.

    Your task subclass may need to override this method to change the dataset
    type or data ref level, or to add additional data ID arguments. If you add
    additional data ID arguments or your task's run method takes more than a
    single data reference then you will also have to provide a task-specific
    task runner (see TaskRunner for more information).
    """
    parser = ArgumentParser(name=cls._DefaultName)
    parser.add_id_argument(name="--id", datasetType="raw",
                           help="data IDs, e.g. --id visit=12345 ccd=1,2^0,3")
    return parser
def writeConfig(self, butler, clobber=False, doBackup=True):
    """Write the configuration used for processing the data, or check that an
    existing one is equal to the new one if present.

    Parameters
    ----------
    butler : `lsst.daf.persistence.Butler`
        Data butler used to write the config. The config is written to dataset
        type `CmdLineTask._getConfigName`.
    clobber : `bool`, optional
        A boolean flag that controls what happens if a config already has been saved:
        - `True`: overwrite or rename the existing config, depending on ``doBackup``.
        - `False`: raise `TaskError` if this config does not match the existing config.
    doBackup : `bool`, optional
        Set to `True` to backup the config files if clobbering.
    """
    configName = self._getConfigName()
    if configName is None:
        return
    if clobber:
        butler.put(self.config, configName, doBackup=doBackup)
    elif butler.datasetExists(configName, write=True):
        # this may be subject to a race condition
        try:
            oldConfig = butler.get(configName, immediate=True)
        except Exception as exc:
            raise type(exc)("Unable to read stored config file %s (%s); consider using --clobber-config" %
                            (configName, exc))

        def logConfigMismatch(msg):
            self.log.fatal("Comparing configuration: %s", msg)

        if not self.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
            raise TaskError(
                ("Config does not match existing task config %r on disk; tasks configurations " +
                 "must be consistent within the same output repo (override with --clobber-config)") %
                (configName,))
    else:
        butler.put(self.config, configName)
def writeSchemas(self, butler, clobber=False, doBackup=True):
    """Write the schemas returned by `lsst.pipe.base.Task.getAllSchemaCatalogs`.

    Parameters
    ----------
    butler : `lsst.daf.persistence.Butler`
        Data butler used to write the schema. Each schema is written to the
        dataset type specified as the key in the dict returned by
        `~lsst.pipe.base.Task.getAllSchemaCatalogs`.
    clobber : `bool`, optional
        A boolean flag that controls what happens if a schema already has been saved:
        - `True`: overwrite or rename the existing schema, depending on ``doBackup``.
        - `False`: raise `TaskError` if this schema does not match the existing schema.
    doBackup : `bool`, optional
        Set to `True` to backup the schema files if clobbering.

    Notes
    -----
    If ``clobber`` is `False` and an existing schema does not match a current
    schema, then some schemas may have been saved successfully and others may
    not, and there is no easy way to tell which is which.
    """
    for dataset, catalog in self.getAllSchemaCatalogs().items():
        schemaDataset = dataset + "_schema"
        if clobber:
            butler.put(catalog, schemaDataset, doBackup=doBackup)
        elif butler.datasetExists(schemaDataset, write=True):
            oldSchema = butler.get(schemaDataset, immediate=True).getSchema()
            if not oldSchema.compare(catalog.getSchema(), afwTable.Schema.IDENTICAL):
                raise TaskError(
                    ("New schema does not match schema %r on disk; schemas must be " +
                     " consistent within the same output repo (override with --clobber-config)") %
                    (dataset,))
        else:
            butler.put(catalog, schemaDataset)
def writeMetadata(self, dataRef):
    """Write the metadata produced from processing the data.

    Parameters
    ----------
    dataRef
        Butler data reference used to write the metadata.
        The metadata is written to dataset type `CmdLineTask._getMetadataName`.
    """
    try:
        metadataName = self._getMetadataName()
        if metadataName is not None:
            dataRef.put(self.getFullMetadata(), metadataName)
    except Exception as e:
        # Persisting metadata is best-effort: warn rather than fail the task
        self.log.warn("Could not persist metadata for dataId=%s: %s", dataRef.dataId, e)
def writePackageVersions(self, butler, clobber=False, doBackup=True, dataset="packages"):
    """Compare and write package versions.

    Parameters
    ----------
    butler : `lsst.daf.persistence.Butler`
        Data butler used to read/write the package versions.
    clobber : `bool`, optional
        A boolean flag that controls what happens if versions already have been saved:
        - `True`: overwrite or rename the existing version info, depending on ``doBackup``.
        - `False`: raise `TaskError` if this version info does not match the existing.
    doBackup : `bool`, optional
        If `True` and clobbering, old package version files are backed up.
    dataset : `str`, optional
        Name of dataset to read/write.

    Raises
    ------
    TaskError
        Raised if there is a version mismatch with current and persisted lists
        of package versions.

    Notes
    -----
    Note that this operation is subject to a race condition.
    """
    packages = Packages.fromSystem()

    if clobber:
        return butler.put(packages, dataset, doBackup=doBackup)
    if not butler.datasetExists(dataset, write=True):
        return butler.put(packages, dataset)

    try:
        old = butler.get(dataset, immediate=True)
    except Exception as exc:
        raise type(exc)("Unable to read stored version dataset %s (%s); "
                        "consider using --clobber-versions or --no-versions" %
                        (dataset, exc))
    # Only the products in common need to match; the stored list may be more or
    # less complete than what we have now.
    diff = packages.difference(old)
    if diff:
        raise TaskError(
            "Version mismatch (" +
            "; ".join("%s: %s vs %s" % (pkg, diff[pkg][1], diff[pkg][0]) for pkg in diff) +
            "); consider using --clobber-versions or --no-versions")
    # Update the persisted versions with any products new since they were written
    extra = packages.extra(old)
    if extra:
        old.update(packages)
        butler.put(old, dataset, doBackup=doBackup)
def _getConfigName(self):
    """Get the name of the config dataset type, or `None` if config is not to be persisted.

    Notes
    -----
    The name may depend on the config; that is why this is not a class method.
    """
    return self._DefaultName + "_config"

def _getMetadataName(self):
    """Get the name of the metadata dataset type, or `None` if metadata is not to be persisted.

    Notes
    -----
    The name may depend on the config; that is why this is not a class method.
    """
    return self._DefaultName + "_metadata"
def _makeArgumentParser(cls)
def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False)
def _precallImpl(self, task, parsedCmd)
def getFullMetadata(self)
def writePackageVersions(self, butler, clobber=False, doBackup=True, dataset="packages")
def getAllSchemaCatalogs(self)
def writeSchemas(self, butler, clobber=False, doBackup=True)
def prepareForMultiProcessing(self)
def _getMetadataName(self)
def makeTask(self, parsedCmd=None, args=None)
def writeMetadata(self, dataRef)
def precall(self, parsedCmd)
def __init__(self, TaskClass, parsedCmd, doReturnResults=False)
def profile(filename, log=None)
def makeTask(self, parsedCmd=None, args=None)
def getTargetList(parsedCmd, kwargs)
def writeConfig(self, butler, clobber=False, doBackup=True)
def applyOverrides(cls, config)