from __future__ import absolute_import, division
from builtins import str
from builtins import object

import sys
import contextlib
import functools
import traceback

import lsst.utils
from lsst.base import disableImplicitThreading, Packages
from lsst.log import Log
import lsst.afw.table as afwTable

from .task import Task, TaskError
from .struct import Struct
from .argumentParser import ArgumentParser

__all__ = ["CmdLineTask", "TaskRunner", "ButlerInitializedTaskRunner", "LegacyTaskRunner"]


def _runPool(pool, timeout, function, iterable):
    """Wrapper around ``pool.map_async``, to handle timeout.

    This is required so as to trigger an immediate interrupt on the KeyboardInterrupt (Ctrl-C); see
    http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool
    """
    return pool.map_async(function, iterable).get(timeout)


@contextlib.contextmanager
def profile(filename, log=None):
    """Context manager for profiling with cProfile.

    Parameters
    ----------
    filename : `str`
        Filename to which to write profile (profiling disabled if `None` or empty).
    log : `lsst.log.Log`, optional
        Log object for logging the profile operations.

    Notes
    -----
    If profiling is enabled, the context manager returns the cProfile.Profile object (otherwise
    it returns None), which allows additional control over profiling. You can obtain this using
    the "as" clause, e.g.::

        with profile(filename) as prof:
            runYourCodeHere()

    The output cumulative profile can be printed with a command-line like::

        python -c 'import pstats; pstats.Stats("<filename>").sort_stats("cumtime").print_stats(30)'
    """
    if not filename:
        # Profiling disabled; do nothing
        yield
        return
    from cProfile import Profile
    profile = Profile()
    if log is not None:
        log.info("Enabling cProfile profiling")
    profile.enable()
    yield profile
    profile.disable()
    profile.dump_stats(filename)
    if log is not None:
        log.info("cProfile stats written to %s" % filename)
92 """Run a command-line task, using `multiprocessing` if requested. 96 TaskClass : `lsst.pipe.base.Task` subclass 97 The class of the task to run. 98 parsedCmd : `argparse.Namespace` 99 The parsed command-line arguments, as returned by the task's argument parser's 100 `~lsst.pipe.base.ArgumentParser.parse_args` method. 104 Do not store ``parsedCmd``, as this instance is pickled (if multiprocessing) and parsedCmd may 105 contain non-picklable elements. It certainly contains more data than we need to send to each 106 instance of the task. 107 doReturnResults : `bool`, optional 108 Should run return the collected result from each invocation of the task? This is only intended for 109 unit tests and similar use. It can easily exhaust memory (if the task returns enough data and you 110 call it enough times) and it will fail when using multiprocessing if the returned data cannot be 113 Note that even if ``doReturnResults`` is False a struct with a single member "exitStatus" is returned, 114 with value 0 or 1 to be returned to the unix shell. 119 If multiprocessing is requested (and the task supports it) but the multiprocessing library cannot be 124 Each command-line task (subclass of `lsst.pipe.base.CmdLineTask`) has a task runner. By default it is this 125 class, but some tasks require a subclass. See the manual :ref:`creating-a-command-line-task` for more 126 information. See `CmdLineTask.parseAndRun` to see how a task runner is used. 128 You may use this task runner for your command-line task if your task has a runDataRef method that takes 129 exactly one argument: a butler data reference. Otherwise you must provide a task-specific subclass of 130 this runner for your task's ``RunnerClass`` that overrides `TaskRunner.getTargetList` and possibly 131 `TaskRunner.__call__`. See `TaskRunner.getTargetList` for details. 133 This design matches the common pattern for command-line tasks: the runDataRef method takes a single data 134 reference, of some suitable name. Additional arguments are rare, and if present, require a subclass of 135 `TaskRunner` that calls these additional arguments by name. 137 Instances of this class must be picklable in order to be compatible with multiprocessing. If 138 multiprocessing is requested (``parsedCmd.numProcesses > 1``) then `runDataRef` calls 139 `prepareForMultiProcessing` to jettison optional non-picklable elements. If your task runner is not 140 compatible with multiprocessing then indicate this in your task by setting class variable 141 ``canMultiprocess=False``. 143 Due to a `python bug`__, handling a `KeyboardInterrupt` properly `requires specifying a timeout`__. This 144 timeout (in sec) can be specified as the ``timeout`` element in the output from 145 `~lsst.pipe.base.ArgumentParser` (the ``parsedCmd``), if available, otherwise we use `TaskRunner.TIMEOUT`. 147 By default, we disable "implicit" threading -- ie, as provided by underlying numerical libraries such as 148 MKL or BLAS. This is designed to avoid thread contention both when a single command line task spawns 149 multiple processes and when multiple users are running on a shared system. Users can override this 150 behaviour by setting the ``LSST_ALLOW_IMPLICIT_THREADS`` environment variable. 152 .. __: http://bugs.python.org/issue8296 153 .. __: http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool 157 """Default timeout (seconds) for multiprocessing.""" 159 def __init__(self, TaskClass, parsedCmd, doReturnResults=False):
169 self.
timeout = getattr(parsedCmd,
'timeout',
None)
174 if not TaskClass.canMultiprocess:
175 self.
log.warn(
"This task does not support multiprocessing; using one process")
179 """Prepare this instance for multiprocessing 181 Optional non-picklable elements are removed. 183 This is only called if the task is run under multiprocessing. 188 """Run the task on all targets. 192 parsedCmd : `argparse.Namespace` 193 Parsed command `argparse.Namespace`. 198 A list of results returned by `TaskRunner.__call__`, or an empty list if `TaskRunner.__call__` 199 is not called (e.g. if `TaskRunner.precall` returns `False`). See `TaskRunner.__call__` 204 The task is run under multiprocessing if `TaskRunner.numProcesses` is more than 1; otherwise 205 processing is serial. 208 disableImplicitThreading()
210 import multiprocessing
212 pool = multiprocessing.Pool(processes=self.
numProcesses, maxtasksperchild=1)
213 mapFunc = functools.partial(_runPool, pool, self.
timeout)
219 profileName = parsedCmd.profile
if hasattr(parsedCmd,
"profile")
else None 222 if len(targetList) > 0:
223 with
profile(profileName, log):
225 resultList = list(mapFunc(self, targetList))
227 log.warn(
"Not running the task because there is no data to process; " 228 "you may preview data using \"--show data\"")
238 """Get a list of (dataRef, kwargs) for `TaskRunner.__call__`. 242 parsedCmd : `argparse.Namespace` 243 The parsed command object returned by `lsst.pipe.base.argumentParser.ArgumentParser.parse_args`. 245 Any additional keyword arguments. In the default `TaskRunner` this is an empty dict, but having 246 it simplifies overriding `TaskRunner` for tasks whose runDataRef method takes additional arguments 247 (see case (1) below). 251 The default implementation of `TaskRunner.getTargetList` and `TaskRunner.__call__` works for any 252 command-line task whose runDataRef method takes exactly one argument: a data reference. Otherwise you 253 must provide a variant of TaskRunner that overrides `TaskRunner.getTargetList` and possibly 254 `TaskRunner.__call__`. There are two cases. 258 If your command-line task has a ``runDataRef`` method that takes one data reference followed by 259 additional arguments, then you need only override `TaskRunner.getTargetList` to return the additional 260 arguments as an argument dict. To make this easier, your overridden version of 261 `~TaskRunner.getTargetList` may call `TaskRunner.getTargetList` with the extra arguments as keyword 262 arguments. For example, the following adds an argument dict containing a single key: "calExpList", 263 whose value is the list of data IDs for the calexp ID argument:: 265 def getTargetList(parsedCmd): 266 return TaskRunner.getTargetList( 268 calExpList=parsedCmd.calexp.idList 271 It is equivalent to this slightly longer version:: 274 def getTargetList(parsedCmd): 275 argDict = dict(calExpList=parsedCmd.calexp.idList) 276 return [(dataId, argDict) for dataId in parsedCmd.id.idList] 280 If your task does not meet condition (1) then you must override both TaskRunner.getTargetList and 281 `TaskRunner.__call__`. You may do this however you see fit, so long as `TaskRunner.getTargetList` 282 returns a list, each of whose elements is sent to `TaskRunner.__call__`, which runs your task. 284 return [(ref, kwargs)
for ref
in parsedCmd.id.refList]
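
    # Illustrative sketch (not part of the original source): the "case (1)" pattern above wired into a
    # runner/task pair.  ``ExampleTaskRunner``, ``ExampleTask`` and the ``calexp`` ID argument are
    # hypothetical names.
    #
    #   class ExampleTaskRunner(TaskRunner):
    #       @staticmethod
    #       def getTargetList(parsedCmd, **kwargs):
    #           # pass one extra keyword argument, taken from a second ID argument named "calexp"
    #           return TaskRunner.getTargetList(parsedCmd, calExpList=parsedCmd.calexp.idList, **kwargs)
    #
    #   class ExampleTask(CmdLineTask):
    #       RunnerClass = ExampleTaskRunner
    #
    #       def runDataRef(self, dataRef, calExpList):
    #           ...  # process dataRef together with the calexp data IDs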
287 """Create a Task instance. 292 Parsed command-line options (used for extra task args by some task runners). 294 Args tuple passed to `TaskRunner.__call__` (used for extra task arguments by some task runners). 298 ``makeTask`` can be called with either the ``parsedCmd`` argument or ``args`` argument set to None, 299 but it must construct identical Task instances in either case. 301 Subclasses may ignore this method entirely if they reimplement both `TaskRunner.precall` and 302 `TaskRunner.__call__`. 306 def _precallImpl(self, task, parsedCmd):
307 """The main work of `precall`. 309 We write package versions, schemas and configs, or compare these to existing files on disk if present. 311 if not parsedCmd.noVersions:
312 task.writePackageVersions(parsedCmd.butler, clobber=parsedCmd.clobberVersions)
317 """Hook for code that should run exactly once, before multiprocessing. 321 Must return True if `TaskRunner.__call__` should subsequently be called. 325 Implementations must take care to ensure that no unpicklable attributes are added to the 326 TaskRunner itself, for compatibility with multiprocessing. 328 The default implementation writes package versions, schemas and configs, or compares them to existing 329 files on disk if present. 331 task = self.
makeTask(parsedCmd=parsedCmd)
338 except Exception
as e:
339 task.log.fatal(
"Failed in task initialization: %s", e)
340 if not isinstance(e, TaskError):
341 traceback.print_exc(file=sys.stderr)
346 """Run the Task on a single target. 351 Arguments for Task.runDataRef() 355 struct : `lsst.pipe.base.Struct` 356 Contains these fields if ``doReturnResults`` is `True`: 358 - ``dataRef``: the provided data reference. 359 - ``metadata``: task metadata after execution of run. 360 - ``result``: result returned by task run, or `None` if the task fails. 361 - ``exitStatus``: 0 if the task completed successfully, 1 otherwise. 363 If ``doReturnResults`` is `False` the struct contains: 365 - ``exitStatus``: 0 if the task completed successfully, 1 otherwise. 369 This default implementation assumes that the ``args`` is a tuple containing a data reference and a 370 dict of keyword arguments. 374 If you override this method and wish to return something when ``doReturnResults`` is `False`, 375 then it must be picklable to support multiprocessing and it should be small enough that pickling 376 and unpickling do not add excessive overhead. 378 dataRef, kwargs = args
380 self.
log = Log.getDefaultLogger()
381 if hasattr(dataRef,
"dataId"):
382 self.
log.MDC(
"LABEL", str(dataRef.dataId))
383 elif isinstance(dataRef, (list, tuple)):
384 self.
log.MDC(
"LABEL", str([ref.dataId
for ref
in dataRef
if hasattr(ref,
"dataId")]))
389 result = self.
runTask(task, dataRef, kwargs)
392 result = self.
runTask(task, dataRef, kwargs)
393 except Exception
as e:
399 eName = type(e).__name__
400 if hasattr(dataRef,
"dataId"):
401 task.log.fatal(
"Failed on dataId=%s: %s: %s", dataRef.dataId, eName, e)
402 elif isinstance(dataRef, (list, tuple)):
403 task.log.fatal(
"Failed on dataIds=[%s]: %s: %s",
404 ", ".join(str(ref.dataId)
for ref
in dataRef), eName, e)
406 task.log.fatal(
"Failed on dataRef=%s: %s: %s", dataRef, eName, e)
408 if not isinstance(e, TaskError):
409 traceback.print_exc(file=sys.stderr)
415 task.writeMetadata(dataRef)
418 self.
log.MDCRemove(
"LABEL")
422 exitStatus=exitStatus,
424 metadata=task.metadata,
429 exitStatus=exitStatus,
433 """Make the actual call to `runDataRef` for this task. 437 task : `lsst.pipe.base.CmdLineTask` class 438 The class of the task to run. 440 Butler data reference that contains the data the task will process. 442 Any additional keyword arguments. See `TaskRunner.getTargetList` above. 446 The default implementation of `TaskRunner.runTask` works for any command-line task which has a 447 runDataRef method that takes a data reference and an optional set of additional keyword arguments. 448 This method returns the results generated by the task's `runDataRef` method. 451 return task.runDataRef(dataRef, **kwargs)
455 """A `TaskRunner` for `CmdLineTask`\ s which calls the `Task`\ 's `run` method on a `dataRef` rather 456 than the `runDataRef` method. 460 """Call `run` for this task instead of `runDataRef`. See `TaskRunner.runTask` above for details. 462 return task.run(dataRef, **kwargs)
466 """A `TaskRunner` for `CmdLineTask`\ s that require a ``butler`` keyword argument to be passed to 471 """A variant of the base version that passes a butler argument to the task's constructor. 475 parsedCmd : `argparse.Namespace` 476 Parsed command-line options, as returned by the `~lsst.pipe.base.ArgumentParser`; if specified 477 then args is ignored. 479 Other arguments; if ``parsedCmd`` is `None` then this must be specified. 484 Raised if ``parsedCmd`` and ``args`` are both `None`. 486 if parsedCmd
is not None:
487 butler = parsedCmd.butler
488 elif args
is not None:
489 dataRef, kwargs = args
490 butler = dataRef.butlerSubset.butler
492 raise RuntimeError(
"parsedCmd or args must be specified")
497 """Base class for command-line tasks: tasks that may be executed from the command-line. 501 See :ref:`task-framework-overview` to learn what tasks are and :ref:`creating-a-command-line-task` for 502 more information about writing command-line tasks. 504 Subclasses must specify the following class variables: 506 - ``ConfigClass``: configuration class for your task (a subclass of `lsst.pex.config.Config`, or if your 507 task needs no configuration, then `lsst.pex.config.Config` itself). 508 - ``_DefaultName``: default name used for this task (a str). 510 Subclasses may also specify the following class variables: 512 - ``RunnerClass``: a task runner class. The default is ``TaskRunner``, which works for any task 513 with a runDataRef method that takes exactly one argument: a data reference. If your task does 514 not meet this requirement then you must supply a variant of ``TaskRunner``; see ``TaskRunner`` 515 for more information. 516 - ``canMultiprocess``: the default is `True`; set `False` if your task does not support multiprocessing. 518 Subclasses must specify a method named ``runDataRef``: 520 - By default ``runDataRef`` accepts a single butler data reference, but you can specify an alternate 521 task runner (subclass of ``TaskRunner``) as the value of class variable ``RunnerClass`` if your run 522 method needs something else. 523 - ``runDataRef`` is expected to return its data in a `lsst.pipe.base.Struct`. This provides safety for 524 evolution of the task since new values may be added without harming existing code. 525 - The data returned by ``runDataRef`` must be picklable if your task is to support multiprocessing. 527 RunnerClass = TaskRunner
528 canMultiprocess =
True 532 """A hook to allow a task to change the values of its config *after* the camera-specific 533 overrides are loaded but before any command-line overrides are applied. 537 config : instance of task's ``ConfigClass`` 542 This is necessary in some cases because the camera-specific overrides may retarget subtasks, 543 wiping out changes made in ConfigClass.setDefaults. See LSST Trac ticket #2282 for more discussion. 547 This is called by CmdLineTask.parseAndRun; other ways of constructing a config will not apply 553 def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False):
554 """Parse an argument list and run the command. 558 args : `list`, optional 559 List of command-line arguments; if `None` use `sys.argv`. 560 config : `lsst.pex.config.Config`-type, optional 561 Config for task. If `None` use `Task.ConfigClass`. 562 log : `lsst.log.Log`-type, optional 563 Log. If `None` use the default log. 564 doReturnResults : `bool`, optional 565 If `True`, return the results of this task. Default is `False`. This is only intended for 566 unit tests and similar use. It can easily exhaust memory (if the task returns enough data and you 567 call it enough times) and it will fail when using multiprocessing if the returned data cannot be 572 struct : `lsst.pipe.base.Struct` 575 - ``argumentParser``: the argument parser. 576 - ``parsedCmd``: the parsed command returned by the argument parser's 577 `lsst.pipe.base.ArgumentParser.parse_args` method. 578 - ``taskRunner``: the task runner used to run the task (an instance of `Task.RunnerClass`). 579 - ``resultList``: results returned by the task runner's ``run`` method, one entry per invocation. 580 This will typically be a list of `None` unless ``doReturnResults`` is `True`; 581 see `Task.RunnerClass` (`TaskRunner` by default) for more information. 585 Calling this method with no arguments specified is the standard way to run a command-line task 586 from the command-line. For an example see ``pipe_tasks`` ``bin/makeSkyMap.py`` or almost any other 587 file in that directory. 589 If one or more of the dataIds fails then this routine will exit (with a status giving the 590 number of failed dataIds) rather than returning this struct; this behaviour can be 591 overridden by specifying the ``--noExit`` command-line option. 594 commandAsStr =
" ".join(sys.argv)
601 config = cls.ConfigClass()
602 parsedCmd = argumentParser.parse_args(config=config, args=args, log=log, override=cls.
applyOverrides)
604 parsedCmd.log.info(
"Running: %s", commandAsStr)
606 taskRunner = cls.
RunnerClass(TaskClass=cls, parsedCmd=parsedCmd, doReturnResults=doReturnResults)
607 resultList = taskRunner.run(parsedCmd)
610 nFailed = sum(((res.exitStatus != 0)
for res
in resultList))
611 except (TypeError, AttributeError)
as e:
613 parsedCmd.log.warn(
"Unable to retrieve exit status (%s); assuming success", e)
618 parsedCmd.log.error(
"%d dataRefs failed; not exiting as --noExit was set", nFailed)
623 argumentParser=argumentParser,
625 taskRunner=taskRunner,
626 resultList=resultList,

    @classmethod
    def _makeArgumentParser(cls):
        """Create and return an argument parser.

        Returns
        -------
        parser : `lsst.pipe.base.ArgumentParser`
            The argument parser for this task.

        Notes
        -----
        By default this returns an `~lsst.pipe.base.ArgumentParser` with one ID argument named `--id` of
        dataset type ``raw``.

        Your task subclass may need to override this method to change the dataset type or data ref
        level, or to add additional data ID arguments. If you add additional data ID arguments or your
        task's runDataRef method takes more than a single data reference then you will also have to
        provide a task-specific task runner (see TaskRunner for more information).
        """
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument(name="--id", datasetType="raw",
                               help="data IDs, e.g. --id visit=12345 ccd=1,2^0,3")
        return parser
654 """Write the configuration used for processing the data, or check that an existing 655 one is equal to the new one if present. 659 butler : `lsst.daf.persistence.Butler` 660 Data butler used to write the config. The config is written to dataset type 661 `CmdLineTask._getConfigName`. 662 clobber : `bool`, optional 663 A boolean flag that controls what happens if a config already has been saved: 664 - `True`: overwrite or rename the existing config, depending on ``doBackup``. 665 - `False`: raise `TaskError` if this config does not match the existing config. 666 doBackup : bool, optional 667 Set to `True` to backup the config files if clobbering. 670 if configName
is None:
673 butler.put(self.
config, configName, doBackup=doBackup)
674 elif butler.datasetExists(configName, write=
True):
677 oldConfig = butler.get(configName, immediate=
True)
678 except Exception
as exc:
679 raise type(exc)(
"Unable to read stored config file %s (%s); consider using --clobber-config" %
682 def logConfigMismatch(msg):
683 self.
log.fatal(
"Comparing configuration: %s", msg)
685 if not self.
config.compare(oldConfig, shortcut=
False, output=logConfigMismatch):
687 (
"Config does not match existing task config %r on disk; tasks configurations " +
688 "must be consistent within the same output repo (override with --clobber-config)") %
691 butler.put(self.
config, configName)
694 """Write the schemas returned by `lsst.pipe.base.Task.getAllSchemaCatalogs`. 698 butler : `lsst.daf.persistence.Butler` 699 Data butler used to write the schema. Each schema is written to the dataset type specified as the 700 key in the dict returned by `~lsst.pipe.base.Task.getAllSchemaCatalogs`. 701 clobber : `bool`, optional 702 A boolean flag that controls what happens if a schema already has been saved: 703 - `True`: overwrite or rename the existing schema, depending on ``doBackup``. 704 - `False`: raise `TaskError` if this schema does not match the existing schema. 705 doBackup : `bool`, optional 706 Set to `True` to backup the schema files if clobbering. 710 If ``clobber`` is `False` and an existing schema does not match a current schema, 711 then some schemas may have been saved successfully and others may not, and there is no easy way to 715 schemaDataset = dataset +
"_schema" 717 butler.put(catalog, schemaDataset, doBackup=doBackup)
718 elif butler.datasetExists(schemaDataset, write=
True):
719 oldSchema = butler.get(schemaDataset, immediate=
True).getSchema()
720 if not oldSchema.compare(catalog.getSchema(), afwTable.Schema.IDENTICAL):
722 (
"New schema does not match schema %r on disk; schemas must be " +
723 " consistent within the same output repo (override with --clobber-config)") %
726 butler.put(catalog, schemaDataset)
729 """Write the metadata produced from processing the data. 734 Butler data reference used to write the metadata. 735 The metadata is written to dataset type `CmdLineTask._getMetadataName`. 739 if metadataName
is not None:
741 except Exception
as e:
742 self.
log.warn(
"Could not persist metadata for dataId=%s: %s", dataRef.dataId, e)
745 """Compare and write package versions. 749 butler : `lsst.daf.persistence.Butler` 750 Data butler used to read/write the package versions. 751 clobber : `bool`, optional 752 A boolean flag that controls what happens if versions already have been saved: 753 - `True`: overwrite or rename the existing version info, depending on ``doBackup``. 754 - `False`: raise `TaskError` if this version info does not match the existing. 755 doBackup : `bool`, optional 756 If `True` and clobbering, old package version files are backed up. 757 dataset : `str`, optional 758 Name of dataset to read/write. 763 Raised if there is a version mismatch with current and persisted lists of package versions. 767 Note that this operation is subject to a race condition. 769 packages = Packages.fromSystem()
772 return butler.put(packages, dataset, doBackup=doBackup)
773 if not butler.datasetExists(dataset, write=
True):
774 return butler.put(packages, dataset)
777 old = butler.get(dataset, immediate=
True)
778 except Exception
as exc:
779 raise type(exc)(
"Unable to read stored version dataset %s (%s); " 780 "consider using --clobber-versions or --no-versions" %
785 diff = packages.difference(old)
788 "Version mismatch (" +
789 "; ".join(
"%s: %s vs %s" % (pkg, diff[pkg][1], diff[pkg][0])
for pkg
in diff) +
790 "); consider using --clobber-versions or --no-versions")
792 extra = packages.extra(old)
795 butler.put(old, dataset, doBackup=doBackup)

    def _getConfigName(self):
        """Get the name of the config dataset type, or `None` if config is not to be persisted.

        Notes
        -----
        The name may depend on the config; that is why this is not a class method.
        """
        return self._DefaultName + "_config"

    def _getMetadataName(self):
        """Get the name of the metadata dataset type, or `None` if metadata is not to be persisted.

        Notes
        -----
        The name may depend on the config; that is why this is not a class method.
        """
        return self._DefaultName + "_metadata"