from __future__ import absolute_import, division
from builtins import str
from builtins import object

import sys
import functools
import contextlib
import traceback

import lsst.utils
from lsst.base import disableImplicitThreading
import lsst.afw.table as afwTable
from .task import Task, TaskError
from .struct import Struct
from .argumentParser import ArgumentParser
from lsst.base import Packages
from lsst.log import Log

__all__ = ["CmdLineTask", "TaskRunner", "ButlerInitializedTaskRunner"]


def _poolFunctionWrapper(function, arg):
    """Wrapper around function to catch exceptions that don't inherit from Exception.

    Such exceptions aren't caught by multiprocessing, which causes the slave
    process to crash and you end up hitting the timeout.
    """
    try:
        return function(arg)
    except Exception:
        raise  # No worries: multiprocessing handles ordinary exceptions
    except:
        # Wrap the exception in something multiprocessing will recognise
        cls, exc, tb = sys.exc_info()
        log = Log.getDefaultLogger()
        log.warn("Unhandled exception %s (%s):\n%s" % (cls.__name__, exc, traceback.format_exc()))
        raise Exception("Unhandled exception: %s (%s)" % (cls.__name__, exc))


def _runPool(pool, timeout, function, iterable):
    """Wrapper around pool.map_async, to handle timeout.

    This is required so as to trigger an immediate interrupt on the KeyboardInterrupt (Ctrl-C); see
    http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool

    Further wraps the function in _poolFunctionWrapper to catch exceptions
    that don't inherit from Exception.
    """
    return pool.map_async(functools.partial(_poolFunctionWrapper, function), iterable).get(timeout)
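

# Example sketch (illustration only): how _runPool is meant to be driven,
# mirroring what TaskRunner.run does below. The worker function, process count
# and one-hour timeout are hypothetical.
def _exampleSquare(x):
    return x * x


def _exampleParallelSquares(values, processes=2, timeout=3600):
    import multiprocessing
    pool = multiprocessing.Pool(processes=processes, maxtasksperchild=1)
    try:
        # KeyboardInterrupt is delivered promptly because of the .get(timeout)
        # inside _runPool, and non-Exception failures in workers are converted
        # by _poolFunctionWrapper.
        return _runPool(pool, timeout, _exampleSquare, values)
    finally:
        pool.close()
        pool.join()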


@contextlib.contextmanager
def profile(filename, log=None):
    """!Context manager for profiling with cProfile

    @param filename     filename to which to write profile (profiling disabled if None or empty)
    @param log          log object for logging the profile operations

    If profiling is enabled, the context manager returns the cProfile.Profile object (otherwise
    it returns None), which allows additional control over profiling. You can obtain this using
    the "as" clause, e.g.:

        with profile(filename) as prof:
            runYourCodeHere()

    The output cumulative profile can be printed with a command-line like:

        python -c 'import pstats; pstats.Stats("<filename>").sort_stats("cumtime").print_stats(30)'
    """
    if not filename:
        # Profiling is disabled; nothing to do
        yield
        return
    from cProfile import Profile
    profile = Profile()
    if log is not None:
        log.info("Enabling cProfile profiling")
    profile.enable()
    yield profile
    profile.disable()
    profile.dump_stats(filename)
    if log is not None:
        log.info("cProfile stats written to %s" % filename)
108 """Run a command-line task, using multiprocessing if requested.
110 Each command-line task (subclass of CmdLineTask) has a task runner. By
111 default it is this class, but some tasks require a subclass. See the
112 manual "how to write a command-line task" in the pipe_tasks documentation
113 for more information. See CmdLineTask.parseAndRun to see how a task runner
116 You may use this task runner for your command-line task if your task has
117 a run method that takes exactly one argument: a butler data reference.
118 Otherwise you must provide a task-specific subclass of this runner for
119 your task's `RunnerClass` that overrides TaskRunner.getTargetList and
120 possibly TaskRunner.\_\_call\_\_. See TaskRunner.getTargetList for
123 This design matches the common pattern for command-line tasks: the run
124 method takes a single data reference, of some suitable name. Additional
125 arguments are rare, and if present, require a subclass of TaskRunner that
126 calls these additional arguments by name.
128 Instances of this class must be picklable in order to be compatible with
129 multiprocessing. If multiprocessing is requested
130 (parsedCmd.numProcesses > 1) then run() calls prepareForMultiProcessing
131 to jettison optional non-picklable elements. If your task runner is not
132 compatible with multiprocessing then indicate this in your task by setting
133 class variable canMultiprocess=False.
135 Due to a python bug [1], handling a KeyboardInterrupt properly requires
136 specifying a timeout [2]. This timeout (in sec) can be specified as the
137 "timeout" element in the output from ArgumentParser (the "parsedCmd"), if
138 available, otherwise we use TaskRunner.TIMEOUT.
140 [1] http://bugs.python.org/issue8296
141 [2] http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool)

    def __init__(self, TaskClass, parsedCmd, doReturnResults=False):
        """!Construct a TaskRunner

        @warning Do not store parsedCmd, as this instance is pickled (if
        multiprocessing) and parsedCmd may contain non-picklable elements.
        It certainly contains more data than we need to send to each
        instance of the task.

        @param TaskClass    the class of the task to run
        @param parsedCmd    the parsed command-line arguments, as returned by
            the task's argument parser's parse_args method
        @param doReturnResults    should run return the collected result from
            each invocation of the task? This is only intended for unit tests
            and similar use. It can easily exhaust memory (if the task
            returns enough data and you call it enough times) and it will
            fail when using multiprocessing if the returned data cannot be
            pickled.

        Note that even if doReturnResults is False a struct with a single
        member "exitStatus" is returned, with value 0 or 1 to be returned
        to the unix shell.

        @throws ImportError if multiprocessing is requested (and the task
        supports it) but the multiprocessing library cannot be imported.
        """
        self.TaskClass = TaskClass
        self.doReturnResults = bool(doReturnResults)
        self.config = parsedCmd.config
        self.log = parsedCmd.log
        self.doRaise = bool(getattr(parsedCmd, "doraise", False))
        self.numProcesses = int(getattr(parsedCmd, "numProcesses", 1))
        self.timeout = getattr(parsedCmd, "timeout", None)

        if self.numProcesses > 1 and not TaskClass.canMultiprocess:
            self.log.warn("This task does not support multiprocessing; using one process")
            self.numProcesses = 1
190 """Prepare this instance for multiprocessing
192 Optional non-picklable elements are removed.
194 This is only called if the task is run under multiprocessing.
199 """!Run the task on all targets.
201 The task is run under multiprocessing if numProcesses > 1; otherwise
202 processing is serial.
204 @return a list of results returned by TaskRunner.\_\_call\_\_, or an
205 empty list if TaskRunner.\_\_call\_\_ is not called (e.g. if
206 TaskRunner.precall returns `False`). See TaskRunner.\_\_call\_\_
211 disableImplicitThreading()
212 import multiprocessing
214 pool = multiprocessing.Pool(processes=self.
numProcesses, maxtasksperchild=1)
215 mapFunc = functools.partial(_runPool, pool, self.
timeout)
221 profileName = parsedCmd.profile
if hasattr(parsedCmd,
"profile")
else None
224 if len(targetList) > 0:
225 with
profile(profileName, log):
227 resultList = list(mapFunc(self, targetList))
229 log.warn(
"Not running the task because there is no data to process; "
230 "you may preview data using \"--show data\"")
240 """!Return a list of (dataRef, kwargs) for TaskRunner.\_\_call\_\_.
242 @param parsedCmd the parsed command object (an argparse.Namespace)
243 returned by \ref argumentParser.ArgumentParser.parse_args
244 "ArgumentParser.parse_args".
245 @param **kwargs any additional keyword arguments. In the default
246 TaskRunner this is an empty dict, but having it simplifies
247 overriding TaskRunner for tasks whose run method takes additional
248 arguments (see case (1) below).
250 The default implementation of TaskRunner.getTargetList and
251 TaskRunner.\_\_call\_\_ works for any command-line task whose run
252 method takes exactly one argument: a data reference. Otherwise you
253 must provide a variant of TaskRunner that overrides
254 TaskRunner.getTargetList and possibly TaskRunner.\_\_call\_\_.
257 (1) If your command-line task has a `run` method that takes one data
258 reference followed by additional arguments, then you need only
259 override TaskRunner.getTargetList to return the additional arguments
260 as an argument dict. To make this easier, your overridden version of
261 getTargetList may call TaskRunner.getTargetList with the extra
262 arguments as keyword arguments. For example, the following adds an
263 argument dict containing a single key: "calExpList", whose value is
264 the list of data IDs for the calexp ID argument:
268 def getTargetList(parsedCmd):
269 return TaskRunner.getTargetList(
271 calExpList=parsedCmd.calexp.idList
275 It is equivalent to this slightly longer version:
279 def getTargetList(parsedCmd):
280 argDict = dict(calExpList=parsedCmd.calexp.idList)
281 return [(dataId, argDict) for dataId in parsedCmd.id.idList]
284 (2) If your task does not meet condition (1) then you must override
285 both TaskRunner.getTargetList and TaskRunner.\_\_call\_\_. You may do
286 this however you see fit, so long as TaskRunner.getTargetList
287 returns a list, each of whose elements is sent to
288 TaskRunner.\_\_call\_\_, which runs your task.
290 return [(ref, kwargs)
for ref
in parsedCmd.id.refList]
293 """!Create a Task instance
295 @param[in] parsedCmd parsed command-line options (used for extra
296 task args by some task runners)
297 @param[in] args args tuple passed to TaskRunner.\_\_call\_\_
298 (used for extra task arguments by some task runners)
300 makeTask() can be called with either the 'parsedCmd' argument or
301 'args' argument set to None, but it must construct identical Task
302 instances in either case.
304 Subclasses may ignore this method entirely if they reimplement
305 both TaskRunner.precall and TaskRunner.\_\_call\_\_

    def _precallImpl(self, task, parsedCmd):
        """The main work of 'precall'

        We write package versions, schemas and configs, or compare these to
        existing files on disk if present.
        """
        if not parsedCmd.noVersions:
            task.writePackageVersions(parsedCmd.butler, clobber=parsedCmd.clobberVersions)
321 """Hook for code that should run exactly once, before multiprocessing
323 Must return True if TaskRunner.\_\_call\_\_ should subsequently be
326 @warning Implementations must take care to ensure that no unpicklable
327 attributes are added to the TaskRunner itself, for compatibility
328 with multiprocessing.
330 The default implementation writes package versions, schemas and
331 configs, or compares them to existing files on disk if present.
333 task = self.
makeTask(parsedCmd=parsedCmd)
340 except Exception
as e:
341 task.log.fatal(
"Failed in task initialization: %s", e)
342 if not isinstance(e, TaskError):
343 traceback.print_exc(file=sys.stderr)
348 """!Run the Task on a single target.
350 This default implementation assumes that the 'args' is a tuple
351 containing a data reference and a dict of keyword arguments.
353 @warning if you override this method and wish to return something
354 when doReturnResults is false, then it must be picklable to support
355 multiprocessing and it should be small enough that pickling and
356 unpickling do not add excessive overhead.
358 @param args Arguments for Task.run()
361 - None if doReturnResults false
362 - A pipe_base Struct containing these fields if doReturnResults true:
363 - dataRef: the provided data reference
364 - metadata: task metadata after execution of run
365 - result: result returned by task run, or None if the task fails
367 dataRef, kwargs = args
369 self.
log = Log.getDefaultLogger()
370 if hasattr(dataRef,
"dataId"):
371 self.log.MDC(
"LABEL", str(dataRef.dataId))
372 elif isinstance(dataRef, (list, tuple)):
373 self.log.MDC(
"LABEL", str([ref.dataId
for ref
in dataRef
if hasattr(ref,
"dataId")]))
378 result = task.run(dataRef, **kwargs)
381 result = task.run(dataRef, **kwargs)
382 except Exception
as e:
387 if hasattr(dataRef,
"dataId"):
388 task.log.fatal(
"Failed on dataId=%s: %s", dataRef.dataId, e)
389 elif isinstance(dataRef, (list, tuple)):
390 task.log.fatal(
"Failed on dataId=[%s]: %s",
391 ", ".join(str(ref.dataId)
for ref
in dataRef), e)
393 task.log.fatal(
"Failed on dataRef=%s: %s", dataRef, e)
395 if not isinstance(e, TaskError):
396 traceback.print_exc(file=sys.stderr)
397 task.writeMetadata(dataRef)
400 self.log.MDCRemove(
"LABEL")
404 exitStatus=exitStatus,
406 metadata=task.metadata,
411 exitStatus=exitStatus,
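

# Example sketch (illustration only): case (2) of TaskRunner.getTargetList, for
# a hypothetical task whose run() method takes a pair of data references rather
# than a single one. Both getTargetList and __call__ are overridden; the "id"
# and "template" ID arguments are assumed to have been added by the task's
# argument parser, and error handling is omitted for brevity.
class ExamplePairTaskRunner(TaskRunner):

    @staticmethod
    def getTargetList(parsedCmd, **kwargs):
        # Each target is a (scienceRef, templateRef, kwargs) tuple
        return [(ref1, ref2, kwargs)
                for ref1, ref2 in zip(parsedCmd.id.refList, parsedCmd.template.refList)]

    def __call__(self, args):
        scienceRef, templateRef, kwargs = args
        task = self.makeTask(args=args)
        result = task.run(scienceRef, templateRef, **kwargs)
        return Struct(exitStatus=0, result=result if self.doReturnResults else None)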
416 """!A TaskRunner for CmdLineTasks that require a 'butler' keyword argument to be passed to
421 """!A variant of the base version that passes a butler argument to the task's constructor
423 @param[in] parsedCmd parsed command-line options, as returned by the argument parser;
424 if specified then args is ignored
425 @param[in] args other arguments; if parsedCmd is None then this must be specified
427 @throw RuntimeError if parsedCmd and args are both None
429 if parsedCmd
is not None:
430 butler = parsedCmd.butler
431 elif args
is not None:
432 dataRef, kwargs = args
433 butler = dataRef.butlerSubset.butler
435 raise RuntimeError(
"parsedCmd or args must be specified")
440 """!Base class for command-line tasks: tasks that may be executed from the command line
442 See \ref pipeBase_introduction "pipe_base introduction" to learn what tasks are,
443 and \ref pipeTasks_writeCmdLineTask "how to write a command-line task" for more information
444 about writing command-line tasks.
445 If the second link is broken (as it will be before the documentation is cross-linked)
446 then look at the main page of pipe_tasks documentation for a link.
448 Subclasses must specify the following class variables:
449 * ConfigClass: configuration class for your task (a subclass of \ref lsst.pex.config.config.Config
450 "lsst.pex.config.Config", or if your task needs no configuration, then
451 \ref lsst.pex.config.config.Config "lsst.pex.config.Config" itself)
452 * _DefaultName: default name used for this task (a str)
454 Subclasses may also specify the following class variables:
455 * RunnerClass: a task runner class. The default is TaskRunner, which works for any task
456 with a run method that takes exactly one argument: a data reference. If your task does
457 not meet this requirement then you must supply a variant of TaskRunner; see TaskRunner
458 for more information.
459 * canMultiprocess: the default is True; set False if your task does not support multiprocessing.
461 Subclasses must specify a method named "run":
462 - By default `run` accepts a single butler data reference, but you can specify an alternate task runner
463 (subclass of TaskRunner) as the value of class variable `RunnerClass` if your run method needs
465 - `run` is expected to return its data in a Struct. This provides safety for evolution of the task
466 since new values may be added without harming existing code.
467 - The data returned by `run` must be picklable if your task is to support multiprocessing.
469 RunnerClass = TaskRunner
470 canMultiprocess =
True
474 """!A hook to allow a task to change the values of its config *after* the camera-specific
475 overrides are loaded but before any command-line overrides are applied.
477 This is necessary in some cases because the camera-specific overrides may retarget subtasks,
478 wiping out changes made in ConfigClass.setDefaults. See LSST Trac ticket #2282 for more discussion.
480 @warning This is called by CmdLineTask.parseAndRun; other ways of constructing a config
481 will not apply these overrides.
483 @param[in] cls the class object
484 @param[in] config task configuration (an instance of cls.ConfigClass)

    @classmethod
    def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False):
        """!Parse an argument list and run the command

        Calling this method with no arguments specified is the standard way to run a command-line task
        from the command line. For an example see pipe_tasks `bin/makeSkyMap.py` or almost any other
        file in that directory.

        @param cls      the class object
        @param args     list of command-line arguments; if `None` use sys.argv
        @param config   config for task (instance of pex_config Config); if `None` use cls.ConfigClass()
        @param log      log (instance of lsst.log.Log); if `None` use the default log
        @param doReturnResults  return the collected results from each invocation of the task?
            This is only intended for unit tests and similar use.
            It can easily exhaust memory (if the task returns enough data and you call it enough times)
            and it will fail when using multiprocessing if the returned data cannot be pickled.

        @return a Struct containing:
        - argumentParser: the argument parser
        - parsedCmd: the parsed command returned by the argument parser's parse_args method
        - taskRunner: the task runner used to run the task (an instance of cls.RunnerClass)
        - resultList: results returned by the task runner's run method, one entry per invocation.
            This will typically be a list of `None` unless doReturnResults is `True`;
            see cls.RunnerClass (TaskRunner by default) for more information.

        If one or more of the dataIds fails then this routine will exit (with a status giving the
        number of failed dataIds) rather than returning this struct; this behaviour can be
        overridden by specifying the --noExit option.
        """
        if args is None:
            commandAsStr = " ".join(sys.argv)
            args = sys.argv[1:]
        else:
            commandAsStr = "{}{}".format(lsst.utils.get_caller_name(skip=1), tuple(args))

        argumentParser = cls._makeArgumentParser()
        if config is None:
            config = cls.ConfigClass()
        parsedCmd = argumentParser.parse_args(config=config, args=args, log=log, override=cls.applyOverrides)
        # print this message after parsing the command so the log is fully configured
        parsedCmd.log.info("Running: %s", commandAsStr)

        taskRunner = cls.RunnerClass(TaskClass=cls, parsedCmd=parsedCmd, doReturnResults=doReturnResults)
        resultList = taskRunner.run(parsedCmd)

        try:
            nFailed = sum((res.exitStatus != 0) for res in resultList)
        except Exception as e:
            parsedCmd.log.warn("Unable to retrieve exit status (%s); assuming success", e)
            nFailed = 0

        if nFailed > 0:
            if parsedCmd.noExit:
                parsedCmd.log.warn("%d dataRefs failed; not exiting as --noExit was set", nFailed)
            else:
                sys.exit(nFailed)

        return Struct(
            argumentParser=argumentParser,
            parsedCmd=parsedCmd,
            taskRunner=taskRunner,
            resultList=resultList,
        )

    @classmethod
    def _makeArgumentParser(cls):
        """!Create and return an argument parser

        @param[in] cls      the class object
        @return the argument parser for this task.

        By default this returns an ArgumentParser with one ID argument named `--id` of dataset type "raw".

        Your task subclass may need to override this method to change the dataset type or data ref level,
        or to add additional data ID arguments. If you add additional data ID arguments or your task's
        run method takes more than a single data reference then you will also have to provide a task-specific
        task runner (see TaskRunner for more information).
        """
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument(name="--id", datasetType="raw",
                               help="data IDs, e.g. --id visit=12345 ccd=1,2^0,3")
        return parser
572 """!Write the configuration used for processing the data, or check that an existing
573 one is equal to the new one if present.
575 @param[in] butler data butler used to write the config.
576 The config is written to dataset type self._getConfigName()
577 @param[in] clobber a boolean flag that controls what happens if a config already has been saved:
578 - True: overwrite or rename the existing config, depending on `doBackup`
579 - False: raise TaskError if this config does not match the existing config
580 @param[in] doBackup if clobbering, should we backup the old files?
583 if configName
is None:
586 butler.put(self.config, configName, doBackup=doBackup)
587 elif butler.datasetExists(configName):
590 oldConfig = butler.get(configName, immediate=
True)
591 except Exception
as exc:
592 raise type(exc)(
"Unable to read stored config file %s (%s); consider using --clobber-config" %
595 def logConfigMismatch(msg):
596 self.log.fatal(
"Comparing configuration: %s", msg)
598 if not self.config.compare(oldConfig, shortcut=
False, output=logConfigMismatch):
600 (
"Config does not match existing task config %r on disk; tasks configurations " +
601 "must be consistent within the same output repo (override with --clobber-config)") %
604 butler.put(self.config, configName)
607 """!Write the schemas returned by \ref task.Task.getAllSchemaCatalogs "getAllSchemaCatalogs"
609 @param[in] butler data butler used to write the schema.
610 Each schema is written to the dataset type specified as the key in the dict returned by
611 \ref task.Task.getAllSchemaCatalogs "getAllSchemaCatalogs".
612 @param[in] clobber a boolean flag that controls what happens if a schema already has been saved:
613 - True: overwrite or rename the existing schema, depending on `doBackup`
614 - False: raise TaskError if this schema does not match the existing schema
615 @param[in] doBackup if clobbering, should we backup the old files?
617 @warning if clobber is False and an existing schema does not match a current schema,
618 then some schemas may have been saved successfully and others may not, and there is no easy way to
621 for dataset, catalog
in self.getAllSchemaCatalogs().items():
622 schemaDataset = dataset +
"_schema"
624 butler.put(catalog, schemaDataset, doBackup=doBackup)
625 elif butler.datasetExists(schemaDataset):
626 oldSchema = butler.get(schemaDataset, immediate=
True).getSchema()
627 if not oldSchema.compare(catalog.getSchema(), afwTable.Schema.IDENTICAL):
629 (
"New schema does not match schema %r on disk; schemas must be " +
630 " consistent within the same output repo (override with --clobber-config)") %
633 butler.put(catalog, schemaDataset)
636 """!Write the metadata produced from processing the data
638 @param[in] dataRef butler data reference used to write the metadata.
639 The metadata is written to dataset type self._getMetadataName()
643 if metadataName
is not None:
644 dataRef.put(self.getFullMetadata(), metadataName)
645 except Exception
as e:
646 self.log.warn(
"Could not persist metadata for dataId=%s: %s", dataRef.dataId, e)
649 """!Compare and write package versions
651 We retrieve the persisted list of packages and compare with what we're currently using.
652 We raise TaskError if there's a version mismatch.
654 Note that this operation is subject to a race condition.
656 @param[in] butler data butler used to read/write the package versions
657 @param[in] clobber a boolean flag that controls what happens if versions already have been saved:
658 - True: overwrite or rename the existing version info, depending on `doBackup`
659 - False: raise TaskError if this version info does not match the existing
660 @param[in] doBackup if clobbering, should we backup the old files?
661 @param[in] dataset name of dataset to read/write
663 packages = Packages.fromSystem()
666 return butler.put(packages, dataset, doBackup=doBackup)
667 if not butler.datasetExists(dataset):
668 return butler.put(packages, dataset)
671 old = butler.get(dataset, immediate=
True)
672 except Exception
as exc:
673 raise type(exc)(
"Unable to read stored version dataset %s (%s); "
674 "consider using --clobber-versions or --no-versions" %
679 diff = packages.difference(old)
682 "Version mismatch (" +
683 "; ".join(
"%s: %s vs %s" % (pkg, diff[pkg][1], diff[pkg][0])
for pkg
in diff) +
684 "); consider using --clobber-versions or --no-versions")
686 extra = packages.extra(old)
689 butler.put(old, dataset, doBackup=doBackup)

    def _getConfigName(self):
        """!Return the name of the config dataset type, or None if config is not to be persisted

        @note The name may depend on the config; that is why this is not a class method.
        """
        return self._DefaultName + "_config"

    def _getMetadataName(self):
        """!Return the name of the metadata dataset type, or None if metadata is not to be persisted

        @note The name may depend on the config; that is why this is not a class method.
        """
        return self._DefaultName + "_metadata"
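

# Example sketch (illustration only): a hypothetical task whose constructor
# needs a butler (for example to read a schema while the task is being built),
# and which therefore selects ButlerInitializedTaskRunner as its runner.
# _ExampleConfig is defined in the sketch above.
class ExampleButlerTask(CmdLineTask):
    ConfigClass = _ExampleConfig
    RunnerClass = ButlerInitializedTaskRunner
    _DefaultName = "exampleButler"

    def __init__(self, butler=None, **kwargs):
        CmdLineTask.__init__(self, **kwargs)
        # The butler is supplied by ButlerInitializedTaskRunner.makeTask, both
        # when the task is first constructed (from parsedCmd) and for each
        # multiprocessing worker (from the data reference).
        self.butler = butler

    def run(self, dataRef):
        self.log.info("Processing %s with butler %s", dataRef.dataId, self.butler)
        return Struct()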