22 from __future__
import absolute_import, division
28 from builtins
import str
29 from builtins
import object
32 from lsst.base
import disableImplicitThreading
33 import lsst.afw.table
as afwTable
34 from .task
import Task, TaskError
35 from .struct
import Struct
36 from .argumentParser
import ArgumentParser
37 from lsst.base
import Packages
38 from lsst.log
import Log
40 __all__ = [
"CmdLineTask",
"TaskRunner",
"ButlerInitializedTaskRunner"]
43 def _poolFunctionWrapper(function, arg):
44 """Wrapper around function to catch exceptions that don't inherit from Exception
46 Such exceptions aren't caught by multiprocessing, which causes the slave
47 process to crash and you end up hitting the timeout.
55 cls, exc, tb = sys.exc_info()
56 log = Log.getDefaultLogger()
57 log.warn(
"Unhandled exception %s (%s):\n%s" % (cls.__name__, exc, traceback.format_exc()))
58 raise Exception(
"Unhandled exception: %s (%s)" % (cls.__name__, exc))
def _runPool(pool, timeout, function, iterable):
    """Map ``function`` over ``iterable`` with ``pool.map_async``, bounded by a timeout

    Collecting the asynchronous result with an explicit timeout is what allows
    a KeyboardInterrupt (Ctrl-C) to interrupt the run immediately; see
    http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool

    ``function`` is bound into _poolFunctionWrapper before being mapped, so
    that exceptions which do not inherit from Exception are caught as well.

    @param pool      pool object providing ``map_async``
    @param timeout   value passed to ``get`` on the asynchronous result
    @param function  callable applied to each element of ``iterable``
    @param iterable  the arguments to map over
    @return whatever ``get(timeout)`` returns for the mapped call
    """
    guarded = functools.partial(_poolFunctionWrapper, function)
    pending = pool.map_async(guarded, iterable)
    return pending.get(timeout)
73 @contextlib.contextmanager
75 """!Context manager for profiling with cProfile
77 @param filename filename to which to write profile (profiling disabled if None or empty)
78 @param log log object for logging the profile operations
80 If profiling is enabled, the context manager returns the cProfile.Profile object (otherwise
81 it returns None), which allows additional control over profiling. You can obtain this using
82 the "as" clause, e.g.:
84 with profile(filename) as prof:
87 The output cumulative profile can be printed with a command-line like:
89 python -c 'import pstats; pstats.Stats("<filename>").sort_stats("cumtime").print_stats(30)'
95 from cProfile
import Profile
98 log.info(
"Enabling cProfile profiling")
102 profile.dump_stats(filename)
104 log.info(
"cProfile stats written to %s" % filename)
108 """Run a command-line task, using multiprocessing if requested.
110 Each command-line task (subclass of CmdLineTask) has a task runner. By
111 default it is this class, but some tasks require a subclass. See the
112 manual "how to write a command-line task" in the pipe_tasks documentation
113 for more information. See CmdLineTask.parseAndRun to see how a task runner
116 You may use this task runner for your command-line task if your task has
117 a run method that takes exactly one argument: a butler data reference.
118 Otherwise you must provide a task-specific subclass of this runner for
119 your task's `RunnerClass` that overrides TaskRunner.getTargetList and
120 possibly TaskRunner.\_\_call\_\_. See TaskRunner.getTargetList for
123 This design matches the common pattern for command-line tasks: the run
124 method takes a single data reference, of some suitable name. Additional
125 arguments are rare, and if present, require a subclass of TaskRunner that
126 calls these additional arguments by name.
128 Instances of this class must be picklable in order to be compatible with
129 multiprocessing. If multiprocessing is requested
130 (parsedCmd.numProcesses > 1) then run() calls prepareForMultiProcessing
131 to jettison optional non-picklable elements. If your task runner is not
132 compatible with multiprocessing then indicate this in your task by setting
133 class variable canMultiprocess=False.
135 Due to a Python bug [1], handling a KeyboardInterrupt properly requires
136 specifying a timeout [2]. This timeout (in sec) can be specified as the
137 "timeout" element in the output from ArgumentParser (the "parsedCmd"), if
138 available, otherwise we use TaskRunner.TIMEOUT.
140 [1] http://bugs.python.org/issue8296
141 [2] http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool
145 def __init__(self, TaskClass, parsedCmd, doReturnResults=False):
146 """!Construct a TaskRunner
148 @warning Do not store parsedCmd, as this instance is pickled (if
149 multiprocessing) and parsedCmd may contain non-picklable elements.
150 It certainly contains more data than we need to send to each
151 instance of the task.
153 @param TaskClass The class of the task to run
154 @param parsedCmd The parsed command-line arguments, as returned by
155 the task's argument parser's parse_args method.
156 @param doReturnResults Should run return the collected result from
157 each invocation of the task? This is only intended for unit tests
158 and similar use. It can easily exhaust memory (if the task
159 returns enough data and you call it enough times) and it will
160 fail when using multiprocessing if the returned data cannot be
163 @throws ImportError if multiprocessing is requested (and the task
164 supports it) but the multiprocessing library cannot be
176 self.
timeout = getattr(parsedCmd,
'timeout',
None)
181 if not TaskClass.canMultiprocess:
182 self.log.warn(
"This task does not support multiprocessing; using one process")
186 """Prepare this instance for multiprocessing
188 Optional non-picklable elements are removed.
190 This is only called if the task is run under multiprocessing.
195 """!Run the task on all targets.
197 The task is run under multiprocessing if numProcesses > 1; otherwise
198 processing is serial.
200 @return a list of results returned by TaskRunner.\_\_call\_\_, or an
201 empty list if TaskRunner.\_\_call\_\_ is not called (e.g. if
202 TaskRunner.precall returns `False`). See TaskRunner.\_\_call\_\_
207 disableImplicitThreading()
208 import multiprocessing
210 pool = multiprocessing.Pool(processes=self.
numProcesses, maxtasksperchild=1)
211 mapFunc = functools.partial(_runPool, pool, self.
timeout)
217 profileName = parsedCmd.profile
if hasattr(parsedCmd,
"profile")
else None
220 if len(targetList) > 0:
221 with
profile(profileName, log):
223 resultList = list(mapFunc(self, targetList))
225 log.warn(
"Not running the task because there is no data to process; "
226 "you may preview data using \"--show data\"")
236 """!Return a list of (dataRef, kwargs) for TaskRunner.\_\_call\_\_.
238 @param parsedCmd the parsed command object (an argparse.Namespace)
239 returned by \ref argumentParser.ArgumentParser.parse_args
240 "ArgumentParser.parse_args".
241 @param **kwargs any additional keyword arguments. In the default
242 TaskRunner this is an empty dict, but having it simplifies
243 overriding TaskRunner for tasks whose run method takes additional
244 arguments (see case (1) below).
246 The default implementation of TaskRunner.getTargetList and
247 TaskRunner.\_\_call\_\_ works for any command-line task whose run
248 method takes exactly one argument: a data reference. Otherwise you
249 must provide a variant of TaskRunner that overrides
250 TaskRunner.getTargetList and possibly TaskRunner.\_\_call\_\_.
253 (1) If your command-line task has a `run` method that takes one data
254 reference followed by additional arguments, then you need only
255 override TaskRunner.getTargetList to return the additional arguments
256 as an argument dict. To make this easier, your overridden version of
257 getTargetList may call TaskRunner.getTargetList with the extra
258 arguments as keyword arguments. For example, the following adds an
259 argument dict containing a single key: "calExpList", whose value is
260 the list of data IDs for the calexp ID argument:
264 def getTargetList(parsedCmd):
265 return TaskRunner.getTargetList(
267 calExpList=parsedCmd.calexp.idList
271 It is equivalent to this slightly longer version:
275 def getTargetList(parsedCmd):
276 argDict = dict(calExpList=parsedCmd.calexp.idList)
277 return [(dataId, argDict) for dataId in parsedCmd.id.idList]
280 (2) If your task does not meet condition (1) then you must override
281 both TaskRunner.getTargetList and TaskRunner.\_\_call\_\_. You may do
282 this however you see fit, so long as TaskRunner.getTargetList
283 returns a list, each of whose elements is sent to
284 TaskRunner.\_\_call\_\_, which runs your task.
286 return [(ref, kwargs)
for ref
in parsedCmd.id.refList]
289 """!Create a Task instance
291 @param[in] parsedCmd parsed command-line options (used for extra
292 task args by some task runners)
293 @param[in] args args tuple passed to TaskRunner.\_\_call\_\_
294 (used for extra task arguments by some task runners)
296 makeTask() can be called with either the 'parsedCmd' argument or
297 'args' argument set to None, but it must construct identical Task
298 instances in either case.
300 Subclasses may ignore this method entirely if they reimplement
301 both TaskRunner.precall and TaskRunner.\_\_call\_\_
305 def _precallImpl(self, task, parsedCmd):
306 """The main work of 'precall'
308 We write package versions, schemas and configs, or compare these to
309 existing files on disk if present.
311 if not parsedCmd.noVersions:
312 task.writePackageVersions(parsedCmd.butler, clobber=parsedCmd.clobberVersions)
317 """Hook for code that should run exactly once, before multiprocessing
319 Must return True if TaskRunner.\_\_call\_\_ should subsequently be
322 @warning Implementations must take care to ensure that no unpicklable
323 attributes are added to the TaskRunner itself, for compatibility
324 with multiprocessing.
326 The default implementation writes package versions, schemas and
327 configs, or compares them to existing files on disk if present.
329 task = self.
makeTask(parsedCmd=parsedCmd)
336 except Exception
as e:
337 task.log.fatal(
"Failed in task initialization: %s", e)
338 if not isinstance(e, TaskError):
339 traceback.print_exc(file=sys.stderr)
344 """!Run the Task on a single target.
346 This default implementation assumes that the 'args' is a tuple
347 containing a data reference and a dict of keyword arguments.
349 @warning if you override this method and wish to return something
350 when doReturnResults is false, then it must be picklable to support
351 multiprocessing and it should be small enough that pickling and
352 unpickling do not add excessive overhead.
354 @param args Arguments for Task.run()
357 - None if doReturnResults is false
358 - A pipe_base Struct containing these fields if doReturnResults is true:
359 - dataRef: the provided data reference
360 - metadata: task metadata after execution of run
361 - result: result returned by task run, or None if the task fails
363 dataRef, kwargs = args
365 self.
log = Log.getDefaultLogger()
366 if hasattr(dataRef,
"dataId"):
367 self.log.MDC(
"LABEL", str(dataRef.dataId))
368 elif isinstance(dataRef, (list, tuple)):
369 self.log.MDC(
"LABEL", str([ref.dataId
for ref
in dataRef
if hasattr(ref,
"dataId")]))
373 result = task.run(dataRef, **kwargs)
376 result = task.run(dataRef, **kwargs)
377 except Exception
as e:
379 if hasattr(dataRef,
"dataId"):
380 task.log.fatal(
"Failed on dataId=%s: %s", dataRef.dataId, e)
381 elif isinstance(dataRef, (list, tuple)):
382 task.log.fatal(
"Failed on dataId=[%s]: %s",
383 ", ".join(str(ref.dataId)
for ref
in dataRef), e)
385 task.log.fatal(
"Failed on dataRef=%s: %s", dataRef, e)
387 if not isinstance(e, TaskError):
388 traceback.print_exc(file=sys.stderr)
389 task.writeMetadata(dataRef)
392 self.log.MDCRemove(
"LABEL")
397 metadata=task.metadata,
403 """!A TaskRunner for CmdLineTasks that require a 'butler' keyword argument to be passed to
408 """!A variant of the base version that passes a butler argument to the task's constructor
410 @param[in] parsedCmd parsed command-line options, as returned by the argument parser;
411 if specified then args is ignored
412 @param[in] args other arguments; if parsedCmd is None then this must be specified
414 @throw RuntimeError if parsedCmd and args are both None
416 if parsedCmd
is not None:
417 butler = parsedCmd.butler
418 elif args
is not None:
419 dataRef, kwargs = args
420 butler = dataRef.butlerSubset.butler
422 raise RuntimeError(
"parsedCmd or args must be specified")
427 """!Base class for command-line tasks: tasks that may be executed from the command line
429 See \ref pipeBase_introduction "pipe_base introduction" to learn what tasks are,
430 and \ref pipeTasks_writeCmdLineTask "how to write a command-line task" for more information
431 about writing command-line tasks.
432 If the second link is broken (as it will be before the documentation is cross-linked)
433 then look at the main page of pipe_tasks documentation for a link.
435 Subclasses must specify the following class variables:
436 * ConfigClass: configuration class for your task (a subclass of \ref lsst.pex.config.config.Config
437 "lsst.pex.config.Config", or if your task needs no configuration, then
438 \ref lsst.pex.config.config.Config "lsst.pex.config.Config" itself)
439 * _DefaultName: default name used for this task (a str)
441 Subclasses may also specify the following class variables:
442 * RunnerClass: a task runner class. The default is TaskRunner, which works for any task
443 with a run method that takes exactly one argument: a data reference. If your task does
444 not meet this requirement then you must supply a variant of TaskRunner; see TaskRunner
445 for more information.
446 * canMultiprocess: the default is True; set False if your task does not support multiprocessing.
448 Subclasses must specify a method named "run":
449 - By default `run` accepts a single butler data reference, but you can specify an alternate task runner
450 (subclass of TaskRunner) as the value of class variable `RunnerClass` if your run method needs
452 - `run` is expected to return its data in a Struct. This provides safety for evolution of the task
453 since new values may be added without harming existing code.
454 - The data returned by `run` must be picklable if your task is to support multiprocessing.
456 RunnerClass = TaskRunner
457 canMultiprocess =
True
461 """!A hook to allow a task to change the values of its config *after* the camera-specific
462 overrides are loaded but before any command-line overrides are applied.
464 This is necessary in some cases because the camera-specific overrides may retarget subtasks,
465 wiping out changes made in ConfigClass.setDefaults. See LSST Trac ticket #2282 for more discussion.
467 @warning This is called by CmdLineTask.parseAndRun; other ways of constructing a config
468 will not apply these overrides.
470 @param[in] cls the class object
471 @param[in] config task configuration (an instance of cls.ConfigClass)
476 def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False):
477 """!Parse an argument list and run the command
479 Calling this method with no arguments specified is the standard way to run a command-line task
480 from the command line. For an example see pipe_tasks `bin/makeSkyMap.py` or almost any other
481 file in that directory.
483 @param cls the class object
484 @param args list of command-line arguments; if `None` use sys.argv
485 @param config config for task (instance of pex_config Config); if `None` use cls.ConfigClass()
486 @param log log (instance of lsst.log.Log); if `None` use the default log
487 @param doReturnResults Return the collected results from each invocation of the task?
488 This is only intended for unit tests and similar use.
489 It can easily exhaust memory (if the task returns enough data and you call it enough times)
490 and it will fail when using multiprocessing if the returned data cannot be pickled.
492 @return a Struct containing:
493 - argumentParser: the argument parser
494 - parsedCmd: the parsed command returned by the argument parser's parse_args method
495 - taskRunner: the task runner used to run the task (an instance of cls.RunnerClass)
496 - resultList: results returned by the task runner's run method, one entry per invocation.
497 This will typically be a list of `None` unless doReturnResults is `True`;
498 see cls.RunnerClass (TaskRunner by default) for more information.
501 commandAsStr =
" ".join(sys.argv)
504 commandAsStr =
"{}{}".format(lsst.utils.get_caller_name(skip=1), tuple(args))
506 argumentParser = cls._makeArgumentParser()
508 config = cls.ConfigClass()
509 parsedCmd = argumentParser.parse_args(config=config, args=args, log=log, override=cls.applyOverrides)
511 parsedCmd.log.info(
"Running: %s", commandAsStr)
513 taskRunner = cls.RunnerClass(TaskClass=cls, parsedCmd=parsedCmd, doReturnResults=doReturnResults)
514 resultList = taskRunner.run(parsedCmd)
516 argumentParser=argumentParser,
518 taskRunner=taskRunner,
519 resultList=resultList,
523 def _makeArgumentParser(cls):
524 """!Create and return an argument parser
526 @param[in] cls the class object
527 @return the argument parser for this task.
529 By default this returns an ArgumentParser with one ID argument named `--id` of dataset type "raw".
531 Your task subclass may need to override this method to change the dataset type or data ref level,
532 or to add additional data ID arguments. If you add additional data ID arguments or your task's
533 run method takes more than a single data reference then you will also have to provide a task-specific
534 task runner (see TaskRunner for more information).
536 parser = ArgumentParser(name=cls._DefaultName)
537 parser.add_id_argument(name=
"--id", datasetType=
"raw",
538 help=
"data IDs, e.g. --id visit=12345 ccd=1,2^0,3")
542 """!Write the configuration used for processing the data, or check that an existing
543 one is equal to the new one if present.
545 @param[in] butler data butler used to write the config.
546 The config is written to dataset type self._getConfigName()
547 @param[in] clobber a boolean flag that controls what happens if a config already has been saved:
548 - True: overwrite or rename the existing config, depending on `doBackup`
549 - False: raise TaskError if this config does not match the existing config
550 @param[in] doBackup if clobbering, should we back up the old files?
553 if configName
is None:
556 butler.put(self.config, configName, doBackup=doBackup)
557 elif butler.datasetExists(configName):
560 oldConfig = butler.get(configName, immediate=
True)
561 except Exception
as exc:
562 raise type(exc)(
"Unable to read stored config file %s (%s); consider using --clobber-config" %
565 def logConfigMismatch(msg):
566 self.log.fatal(
"Comparing configuration: %s", msg)
568 if not self.config.compare(oldConfig, shortcut=
False, output=logConfigMismatch):
570 (
"Config does not match existing task config %r on disk; tasks configurations " +
571 "must be consistent within the same output repo (override with --clobber-config)") %
574 butler.put(self.config, configName)
577 """!Write the schemas returned by \ref task.Task.getAllSchemaCatalogs "getAllSchemaCatalogs"
579 @param[in] butler data butler used to write the schema.
580 Each schema is written to the dataset type specified as the key in the dict returned by
581 \ref task.Task.getAllSchemaCatalogs "getAllSchemaCatalogs".
582 @param[in] clobber a boolean flag that controls what happens if a schema already has been saved:
583 - True: overwrite or rename the existing schema, depending on `doBackup`
584 - False: raise TaskError if this schema does not match the existing schema
585 @param[in] doBackup if clobbering, should we back up the old files?
587 @warning if clobber is False and an existing schema does not match a current schema,
588 then some schemas may have been saved successfully and others may not, and there is no easy way to
591 for dataset, catalog
in self.getAllSchemaCatalogs().items():
592 schemaDataset = dataset +
"_schema"
594 butler.put(catalog, schemaDataset, doBackup=doBackup)
595 elif butler.datasetExists(schemaDataset):
596 oldSchema = butler.get(schemaDataset, immediate=
True).getSchema()
597 if not oldSchema.compare(catalog.getSchema(), afwTable.Schema.IDENTICAL):
599 (
"New schema does not match schema %r on disk; schemas must be " +
600 " consistent within the same output repo (override with --clobber-config)") %
603 butler.put(catalog, schemaDataset)
606 """!Write the metadata produced from processing the data
608 @param[in] dataRef butler data reference used to write the metadata.
609 The metadata is written to dataset type self._getMetadataName()
613 if metadataName
is not None:
614 dataRef.put(self.getFullMetadata(), metadataName)
615 except Exception
as e:
616 self.log.warn(
"Could not persist metadata for dataId=%s: %s", dataRef.dataId, e)
619 """!Compare and write package versions
621 We retrieve the persisted list of packages and compare with what we're currently using.
622 We raise TaskError if there's a version mismatch.
624 Note that this operation is subject to a race condition.
626 @param[in] butler data butler used to read/write the package versions
627 @param[in] clobber a boolean flag that controls what happens if versions already have been saved:
628 - True: overwrite or rename the existing version info, depending on `doBackup`
629 - False: raise TaskError if this version info does not match the existing version info
630 @param[in] doBackup if clobbering, should we back up the old files?
631 @param[in] dataset name of dataset to read/write
633 packages = Packages.fromSystem()
636 return butler.put(packages, dataset, doBackup=doBackup)
637 if not butler.datasetExists(dataset):
638 return butler.put(packages, dataset)
641 old = butler.get(dataset, immediate=
True)
642 except Exception
as exc:
643 raise type(exc)(
"Unable to read stored version dataset %s (%s); "
644 "consider using --clobber-versions or --no-versions" %
649 diff = packages.difference(old)
652 "Version mismatch (" +
653 "; ".join(
"%s: %s vs %s" % (pkg, diff[pkg][1], diff[pkg][0])
for pkg
in diff) +
654 "); consider using --clobber-versions or --no-versions")
656 extra = packages.extra(old)
659 butler.put(old, dataset, doBackup=doBackup)
661 def _getConfigName(self):
662 """!Return the name of the config dataset type, or None if config is not to be persisted
664 @note The name may depend on the config; that is why this is not a class method.
666 return self._DefaultName +
"_config"
668 def _getMetadataName(self):
669 """!Return the name of the metadata dataset type, or None if metadata is not to be persisted
671 @note The name may depend on the config; that is why this is not a class method.
673 return self._DefaultName +
"_metadata"
def __init__
Construct a TaskRunner.
def makeTask
Create a Task instance.
def writePackageVersions
Compare and write package versions.
def makeTask
A variant of the base version that passes a butler argument to the task's constructor.
def run
Run the task on all targets.
def __call__
Run the Task on a single target.
def writeConfig
Write the configuration used for processing the data, or check that an existing one is equal to the new one if present.
def _getConfigName
Return the name of the config dataset type, or None if config is not to be persisted.
def applyOverrides
A hook to allow a task to change the values of its config after the camera-specific overrides are loaded but before any command-line overrides are applied.
def parseAndRun
Parse an argument list and run the command.
def _getMetadataName
Return the name of the metadata dataset type, or None if metadata is not to be persisted.
def writeMetadata
Write the metadata produced from processing the data.
def prepareForMultiProcessing
def getTargetList
Return a list of (dataRef, kwargs) for TaskRunner.__call__.
A TaskRunner for CmdLineTasks that require a 'butler' keyword argument to be passed to their constructor.
def profile
Context manager for profiling with cProfile.
def writeSchemas
Write the schemas returned by getAllSchemaCatalogs.
Base class for command-line tasks: tasks that may be executed from the command line.