from __future__ import absolute_import, division
import sys
import traceback
import functools
import contextlib

from builtins import str
from builtins import object

import lsst.utils
from lsst.base import disableImplicitThreading
import lsst.afw.table as afwTable
from .task import Task, TaskError
from .struct import Struct
from .argumentParser import ArgumentParser
from lsst.base import Packages
from lsst.log import Log
__all__ = ["CmdLineTask", "TaskRunner", "ButlerInitializedTaskRunner"]


def _poolFunctionWrapper(function, arg):
    """Wrapper around function to catch exceptions that don't inherit from Exception

    Such exceptions aren't caught by multiprocessing, which causes the slave
    process to crash and you end up hitting the timeout.
    """
    try:
        return function(arg)
    except Exception:
        raise  # exceptions deriving from Exception are handled by multiprocessing itself
    except:
        # wrap anything else in an Exception so that multiprocessing can propagate it
        cls, exc, tb = sys.exc_info()
        log = Log.getDefaultLogger()
        log.warn("Unhandled exception %s (%s):\n%s" % (cls.__name__, exc, traceback.format_exc()))
        raise Exception("Unhandled exception: %s (%s)" % (cls.__name__, exc))


def _runPool(pool, timeout, function, iterable):
    """Wrapper around pool.map_async, to handle timeout

    This is required so as to trigger an immediate interrupt on the KeyboardInterrupt (Ctrl-C); see
    http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool

    Further wraps the function in _poolFunctionWrapper to catch exceptions
    that don't inherit from Exception.
    """
    return pool.map_async(functools.partial(_poolFunctionWrapper, function), iterable).get(timeout)
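

# Illustrative sketch of how _runPool is typically driven (TaskRunner.run below does
# essentially this); the pool size and timeout are arbitrary values for the demo.
def _exampleRunPoolUsage():
    import multiprocessing
    pool = multiprocessing.Pool(processes=2, maxtasksperchild=1)
    mapFunc = functools.partial(_runPool, pool, 60)  # 60 second timeout
    results = mapFunc(abs, [-1, -2, -3])  # apply abs to each element in parallel
    pool.close()
    pool.join()
    return results  # [1, 2, 3]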


@contextlib.contextmanager
def profile(filename, log=None):
    """!Context manager for profiling with cProfile

    @param filename     filename to which to write profile (profiling disabled if None or empty)
    @param log          log object for logging the profile operations

    If profiling is enabled, the context manager returns the cProfile.Profile object (otherwise
    it returns None), which allows additional control over profiling. You can obtain this using
    the "as" clause, e.g.:

        with profile(filename) as prof:
            # ... code to profile ...

    The output cumulative profile can be printed with a command-line like:

        python -c 'import pstats; pstats.Stats("<filename>").sort_stats("cumtime").print_stats(30)'
    """
    if not filename:
        # profiling disabled: yield None and do nothing else
        yield
        return
    from cProfile import Profile
    profile = Profile()
    if log is not None:
        log.info("Enabling cProfile profiling")
    profile.enable()
    yield profile
    profile.disable()
    profile.dump_stats(filename)
    if log is not None:
        log.info("cProfile stats written to %s" % filename)


class TaskRunner(object):
    """Run a command-line task, using multiprocessing if requested.

    Each command-line task (subclass of CmdLineTask) has a task runner. By
    default it is this class, but some tasks require a subclass. See the
    manual "how to write a command-line task" in the pipe_tasks documentation
    for more information. See CmdLineTask.parseAndRun to see how a task runner
    is used.

    You may use this task runner for your command-line task if your task has
    a run method that takes exactly one argument: a butler data reference.
    Otherwise you must provide a task-specific subclass of this runner for
    your task's `RunnerClass` that overrides TaskRunner.getTargetList and
    possibly TaskRunner.\_\_call\_\_. See TaskRunner.getTargetList for
    details.

    This design matches the common pattern for command-line tasks: the run
    method takes a single data reference, of some suitable name. Additional
    arguments are rare, and if present, require a subclass of TaskRunner that
    calls these additional arguments by name.

    Instances of this class must be picklable in order to be compatible with
    multiprocessing. If multiprocessing is requested
    (parsedCmd.numProcesses > 1) then run() calls prepareForMultiProcessing
    to jettison optional non-picklable elements. If your task runner is not
    compatible with multiprocessing then indicate this in your task by setting
    class variable canMultiprocess=False.

    Due to a python bug [1], handling a KeyboardInterrupt properly requires
    specifying a timeout [2]. This timeout (in sec) can be specified as the
    "timeout" element in the output from ArgumentParser (the "parsedCmd"), if
    available, otherwise we use TaskRunner.TIMEOUT.

    [1] http://bugs.python.org/issue8296
    [2] http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool
    """

    def __init__(self, TaskClass, parsedCmd, doReturnResults=False):
        """!Construct a TaskRunner

        @warning Do not store parsedCmd, as this instance is pickled (if
        multiprocessing) and parsedCmd may contain non-picklable elements.
        It certainly contains more data than we need to send to each
        instance of the task.

        @param TaskClass    The class of the task to run
        @param parsedCmd    The parsed command-line arguments, as returned by
            the task's argument parser's parse_args method.
        @param doReturnResults    Should run return the collected result from
            each invocation of the task? This is only intended for unit tests
            and similar use. It can easily exhaust memory (if the task
            returns enough data and you call it enough times) and it will
            fail when using multiprocessing if the returned data cannot be
            pickled.

        Note that even if doReturnResults is False a struct with a single
        member "exitStatus" is returned, with value 0 or 1 to be returned
        to the unix shell.

        @throws ImportError if multiprocessing requested (and the task
            supports it) but the multiprocessing library cannot be imported.
        """
        self.TaskClass = TaskClass
        self.doReturnResults = bool(doReturnResults)
        self.config = parsedCmd.config
        self.log = parsedCmd.log
        self.doRaise = bool(parsedCmd.doraise)
        self.clobberConfig = bool(parsedCmd.clobberConfig)
        self.doBackup = not bool(parsedCmd.noBackupConfig)
        self.numProcesses = int(getattr(parsedCmd, 'processes', 1))
        self.timeout = getattr(parsedCmd, 'timeout', None)
        if self.numProcesses > 1:
            if not TaskClass.canMultiprocess:
                self.log.warn("This task does not support multiprocessing; using one process")
                self.numProcesses = 1

    def prepareForMultiProcessing(self):
        """Prepare this instance for multiprocessing

        Optional non-picklable elements are removed.

        This is only called if the task is run under multiprocessing.
        """
        self.log = None

    def run(self, parsedCmd):
        """!Run the task on all targets.

        The task is run under multiprocessing if numProcesses > 1; otherwise
        processing is serial.

        @return a list of results returned by TaskRunner.\_\_call\_\_, or an
        empty list if TaskRunner.\_\_call\_\_ is not called (e.g. if
        TaskRunner.precall returns `False`). See TaskRunner.\_\_call\_\_
        for details.
        """
        resultList = []
        disableImplicitThreading()  # To prevent thread contention
        if self.numProcesses > 1:
            import multiprocessing
            self.prepareForMultiProcessing()
            pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=1)
            mapFunc = functools.partial(_runPool, pool, self.timeout)
        else:
            pool = None
            mapFunc = map

        if self.precall(parsedCmd):
            profileName = parsedCmd.profile if hasattr(parsedCmd, "profile") else None
            log = parsedCmd.log
            targetList = self.getTargetList(parsedCmd)
            if len(targetList) > 0:
                with profile(profileName, log):
                    # Run the task using self.__call__
                    resultList = list(mapFunc(self, targetList))
            else:
                log.warn("Not running the task because there is no data to process; "
                         "you may preview data using \"--show data\"")

        if pool is not None:
            pool.close()
            pool.join()

        return resultList

    @staticmethod
    def getTargetList(parsedCmd, **kwargs):
        """!Return a list of (dataRef, kwargs) for TaskRunner.\_\_call\_\_.

        @param parsedCmd    the parsed command object (an argparse.Namespace)
            returned by \ref argumentParser.ArgumentParser.parse_args
            "ArgumentParser.parse_args".
        @param **kwargs     any additional keyword arguments. In the default
            TaskRunner this is an empty dict, but having it simplifies
            overriding TaskRunner for tasks whose run method takes additional
            arguments (see case (1) below).

        The default implementation of TaskRunner.getTargetList and
        TaskRunner.\_\_call\_\_ works for any command-line task whose run
        method takes exactly one argument: a data reference. Otherwise you
        must provide a variant of TaskRunner that overrides
        TaskRunner.getTargetList and possibly TaskRunner.\_\_call\_\_.
        There are two cases:

        (1) If your command-line task has a `run` method that takes one data
        reference followed by additional arguments, then you need only
        override TaskRunner.getTargetList to return the additional arguments
        as an argument dict. To make this easier, your overridden version of
        getTargetList may call TaskRunner.getTargetList with the extra
        arguments as keyword arguments. For example, the following adds an
        argument dict containing a single key: "calExpList", whose value is
        the list of data IDs for the calexp ID argument:

            @staticmethod
            def getTargetList(parsedCmd):
                return TaskRunner.getTargetList(
                    parsedCmd,
                    calExpList=parsedCmd.calexp.idList
                )

        It is equivalent to this slightly longer version:

            @staticmethod
            def getTargetList(parsedCmd):
                argDict = dict(calExpList=parsedCmd.calexp.idList)
                return [(dataId, argDict) for dataId in parsedCmd.id.idList]

        (2) If your task does not meet condition (1) then you must override
        both TaskRunner.getTargetList and TaskRunner.\_\_call\_\_. You may do
        this however you see fit, so long as TaskRunner.getTargetList
        returns a list, each of whose elements is sent to
        TaskRunner.\_\_call\_\_, which runs your task.
        """
        return [(ref, kwargs) for ref in parsedCmd.id.refList]

    def makeTask(self, parsedCmd=None, args=None):
        """!Create a Task instance

        @param[in] parsedCmd    parsed command-line options (used for extra
            task args by some task runners)
        @param[in] args         args tuple passed to TaskRunner.\_\_call\_\_
            (used for extra task arguments by some task runners)

        makeTask() can be called with either the 'parsedCmd' argument or
        'args' argument set to None, but it must construct identical Task
        instances in either case.

        Subclasses may ignore this method entirely if they reimplement
        both TaskRunner.precall and TaskRunner.\_\_call\_\_
        """
        return self.TaskClass(config=self.config, log=self.log)

    def _precallImpl(self, task, parsedCmd):
        """The main work of 'precall'

        We write package versions, schemas and configs, or compare these to
        existing files on disk if present.
        """
        if not parsedCmd.noVersions:
            task.writePackageVersions(parsedCmd.butler, clobber=parsedCmd.clobberVersions)
        task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)
        task.writeSchemas(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)

    def precall(self, parsedCmd):
        """Hook for code that should run exactly once, before multiprocessing
        is invoked.

        Must return True if TaskRunner.\_\_call\_\_ should subsequently be
        called.

        @warning Implementations must take care to ensure that no unpicklable
        attributes are added to the TaskRunner itself, for compatibility
        with multiprocessing.

        The default implementation writes package versions, schemas and
        configs, or compares them to existing files on disk if present.
        """
        task = self.makeTask(parsedCmd=parsedCmd)
        try:
            self._precallImpl(task, parsedCmd)
        except Exception as e:
            task.log.fatal("Failed in task initialization: %s", e)
            if not isinstance(e, TaskError):
                traceback.print_exc(file=sys.stderr)
            return False
        return True

    def __call__(self, args):
        """!Run the Task on a single target.

        This default implementation assumes that the 'args' is a tuple
        containing a data reference and a dict of keyword arguments.

        @warning if you override this method and wish to return something
        when doReturnResults is false, then it must be picklable to support
        multiprocessing and it should be small enough that pickling and
        unpickling do not add excessive overhead.

        @param args     Arguments for Task.run()

        @return:
        - None if doReturnResults false
        - A pipe_base Struct containing these fields if doReturnResults true:
            - dataRef: the provided data reference
            - metadata: task metadata after execution of run
            - result: result returned by task run, or None if the task fails
        """
        dataRef, kwargs = args
        self.log = Log.getDefaultLogger()
        if hasattr(dataRef, "dataId"):
            self.log.MDC("LABEL", str(dataRef.dataId))
        elif isinstance(dataRef, (list, tuple)):
            self.log.MDC("LABEL", str([ref.dataId for ref in dataRef if hasattr(ref, "dataId")]))
        task = self.makeTask(args=args)
        result = None  # in case the task fails
        exitStatus = 0  # exit status for the shell
        if self.doRaise:
            result = task.run(dataRef, **kwargs)
        else:
            try:
                result = task.run(dataRef, **kwargs)
            except Exception as e:
                exitStatus = 1
                if hasattr(dataRef, "dataId"):
                    task.log.fatal("Failed on dataId=%s: %s", dataRef.dataId, e)
                elif isinstance(dataRef, (list, tuple)):
                    task.log.fatal("Failed on dataId=[%s]: %s",
                                   ", ".join(str(ref.dataId) for ref in dataRef), e)
                else:
                    task.log.fatal("Failed on dataRef=%s: %s", dataRef, e)
                if not isinstance(e, TaskError):
                    traceback.print_exc(file=sys.stderr)
        task.writeMetadata(dataRef)

        # remove the dataId label so it does not leak into subsequent log messages
        self.log.MDCRemove("LABEL")

        if self.doReturnResults:
            return Struct(
                exitStatus=exitStatus,
                dataRef=dataRef,
                metadata=task.metadata,
                result=result,
            )
        else:
            return Struct(
                exitStatus=exitStatus,
            )
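

# Illustrative sketch of case (1) from TaskRunner.getTargetList: a runner for a
# hypothetical task whose run() method takes a data reference plus a calExpList
# argument (a "calexp" ID argument is assumed to exist on the parsed command).
class _ExampleCoaddTaskRunner(TaskRunner):

    @staticmethod
    def getTargetList(parsedCmd, **kwargs):
        # Forward the extra argument as a keyword; the base class pairs it with
        # every data reference in parsedCmd.id.refList
        return TaskRunner.getTargetList(parsedCmd, calExpList=parsedCmd.calexp.idList)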


class ButlerInitializedTaskRunner(TaskRunner):
    """!A TaskRunner for CmdLineTasks that require a 'butler' keyword argument to be passed to
    their constructor.
    """

    def makeTask(self, parsedCmd=None, args=None):
        """!A variant of the base version that passes a butler argument to the task's constructor

        @param[in] parsedCmd    parsed command-line options, as returned by the argument parser;
            if specified then args is ignored
        @param[in] args         other arguments; if parsedCmd is None then this must be specified

        @throw RuntimeError if parsedCmd and args are both None
        """
        if parsedCmd is not None:
            butler = parsedCmd.butler
        elif args is not None:
            dataRef, kwargs = args
            butler = dataRef.butlerSubset.butler
        else:
            raise RuntimeError("parsedCmd or args must be specified")
        return self.TaskClass(config=self.config, log=self.log, butler=butler)


class CmdLineTask(Task):
    """!Base class for command-line tasks: tasks that may be executed from the command line

    See \ref pipeBase_introduction "pipe_base introduction" to learn what tasks are,
    and \ref pipeTasks_writeCmdLineTask "how to write a command-line task" for more information
    about writing command-line tasks.
    If the second link is broken (as it will be before the documentation is cross-linked)
    then look at the main page of pipe_tasks documentation for a link.

    Subclasses must specify the following class variables:
    * ConfigClass: configuration class for your task (a subclass of \ref lsst.pex.config.config.Config
      "lsst.pex.config.Config", or if your task needs no configuration, then
      \ref lsst.pex.config.config.Config "lsst.pex.config.Config" itself)
    * _DefaultName: default name used for this task (a str)

    Subclasses may also specify the following class variables:
    * RunnerClass: a task runner class. The default is TaskRunner, which works for any task
      with a run method that takes exactly one argument: a data reference. If your task does
      not meet this requirement then you must supply a variant of TaskRunner; see TaskRunner
      for more information.
    * canMultiprocess: the default is True; set False if your task does not support multiprocessing.

    Subclasses must specify a method named "run":
    - By default `run` accepts a single butler data reference, but you can specify an alternate task runner
      (subclass of TaskRunner) as the value of class variable `RunnerClass` if your run method needs
      something else.
    - `run` is expected to return its data in a Struct. This provides safety for evolution of the task
      since new values may be added without harming existing code.
    - The data returned by `run` must be picklable if your task is to support multiprocessing.
    """
    RunnerClass = TaskRunner
    canMultiprocess = True

    @classmethod
    def applyOverrides(cls, config):
        """!A hook to allow a task to change the values of its config *after* the camera-specific
        overrides are loaded but before any command-line overrides are applied.

        This is necessary in some cases because the camera-specific overrides may retarget subtasks,
        wiping out changes made in ConfigClass.setDefaults. See LSST Trac ticket #2282 for more discussion.

        @warning This is called by CmdLineTask.parseAndRun; other ways of constructing a config
        will not apply these overrides.

        @param[in] cls      the class object
        @param[in] config   task configuration (an instance of cls.ConfigClass)
        """
        pass

    @classmethod
    def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False):
        """!Parse an argument list and run the command

        Calling this method with no arguments specified is the standard way to run a command-line task
        from the command line. For an example see pipe_tasks `bin/makeSkyMap.py` or almost any other
        file in that directory.

        @param cls      the class object
        @param args     list of command-line arguments; if `None` use sys.argv
        @param config   config for task (instance of pex_config Config); if `None` use cls.ConfigClass()
        @param log      log (instance of lsst.log.Log); if `None` use the default log
        @param doReturnResults  Return the collected results from each invocation of the task?
            This is only intended for unit tests and similar use.
            It can easily exhaust memory (if the task returns enough data and you call it enough times)
            and it will fail when using multiprocessing if the returned data cannot be pickled.

        @return a Struct containing:
        - argumentParser: the argument parser
        - parsedCmd: the parsed command returned by the argument parser's parse_args method
        - taskRunner: the task runner used to run the task (an instance of cls.RunnerClass)
        - resultList: results returned by the task runner's run method, one entry per invocation.
            This will typically be a list of `None` unless doReturnResults is `True`;
            see cls.RunnerClass (TaskRunner by default) for more information.

        If one or more of the dataIds fails then this routine will exit (with a status giving the
        number of failed dataIds) rather than returning this struct; this behaviour can be
        overridden by specifying the --noExit option.
        """
        if args is None:
            commandAsStr = " ".join(sys.argv)
            args = sys.argv[1:]
        else:
            commandAsStr = "{}{}".format(lsst.utils.get_caller_name(skip=1), tuple(args))

        argumentParser = cls._makeArgumentParser()
        if config is None:
            config = cls.ConfigClass()
        parsedCmd = argumentParser.parse_args(config=config, args=args, log=log, override=cls.applyOverrides)
        # print this message after parsing the command so the log is fully configured
        parsedCmd.log.info("Running: %s", commandAsStr)

        taskRunner = cls.RunnerClass(TaskClass=cls, parsedCmd=parsedCmd, doReturnResults=doReturnResults)
        resultList = taskRunner.run(parsedCmd)

        try:
            nFailed = sum(((res.exitStatus != 0) for res in resultList))
        except Exception as e:
            parsedCmd.log.warn("Unable to retrieve exit status (%s); assuming success", e)
            nFailed = 0

        if nFailed > 0:
            if parsedCmd.noExit:
                parsedCmd.log.warn("%d dataRefs failed; not exiting as --noExit was set", nFailed)
            else:
                sys.exit(nFailed)

        return Struct(
            argumentParser=argumentParser,
            parsedCmd=parsedCmd,
            taskRunner=taskRunner,
            resultList=resultList,
        )

    @classmethod
    def _makeArgumentParser(cls):
        """!Create and return an argument parser

        @param[in] cls      the class object
        @return the argument parser for this task.

        By default this returns an ArgumentParser with one ID argument named `--id` of dataset type "raw".

        Your task subclass may need to override this method to change the dataset type or data ref level,
        or to add additional data ID arguments. If you add additional data ID arguments or your task's
        run method takes more than a single data reference then you will also have to provide a task-specific
        task runner (see TaskRunner for more information).
        """
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument(name="--id", datasetType="raw",
                               help="data IDs, e.g. --id visit=12345 ccd=1,2^0,3")
        return parser

    def writeConfig(self, butler, clobber=False, doBackup=True):
        """!Write the configuration used for processing the data, or check that an existing
        one is equal to the new one if present.

        @param[in] butler   data butler used to write the config.
            The config is written to dataset type self._getConfigName()
        @param[in] clobber  a boolean flag that controls what happens if a config already has been saved:
            - True: overwrite or rename the existing config, depending on `doBackup`
            - False: raise TaskError if this config does not match the existing config
        @param[in] doBackup if clobbering, should we backup the old files?
        """
        configName = self._getConfigName()
        if configName is None:
            return
        if clobber:
            butler.put(self.config, configName, doBackup=doBackup)
        elif butler.datasetExists(configName):
            try:
                oldConfig = butler.get(configName, immediate=True)
            except Exception as exc:
                raise type(exc)("Unable to read stored config file %s (%s); consider using --clobber-config" %
                                (configName, exc))

            def logConfigMismatch(msg):
                self.log.fatal("Comparing configuration: %s", msg)

            if not self.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
                raise TaskError(
                    ("Config does not match existing task config %r on disk; tasks configurations " +
                     "must be consistent within the same output repo (override with --clobber-config)") %
                    (configName,))
        else:
            butler.put(self.config, configName)

    def writeSchemas(self, butler, clobber=False, doBackup=True):
        """!Write the schemas returned by \ref task.Task.getAllSchemaCatalogs "getAllSchemaCatalogs"

        @param[in] butler   data butler used to write the schema.
            Each schema is written to the dataset type specified as the key in the dict returned by
            \ref task.Task.getAllSchemaCatalogs "getAllSchemaCatalogs".
        @param[in] clobber  a boolean flag that controls what happens if a schema already has been saved:
            - True: overwrite or rename the existing schema, depending on `doBackup`
            - False: raise TaskError if this schema does not match the existing schema
        @param[in] doBackup if clobbering, should we backup the old files?

        @warning if clobber is False and an existing schema does not match a current schema,
        then some schemas may have been saved successfully and others may not, and there is no easy way to
        tell which is which.
        """
        for dataset, catalog in self.getAllSchemaCatalogs().items():
            schemaDataset = dataset + "_schema"
            if clobber:
                butler.put(catalog, schemaDataset, doBackup=doBackup)
            elif butler.datasetExists(schemaDataset):
                oldSchema = butler.get(schemaDataset, immediate=True).getSchema()
                if not oldSchema.compare(catalog.getSchema(), afwTable.Schema.IDENTICAL):
                    raise TaskError(
                        ("New schema does not match schema %r on disk; schemas must be " +
                         "consistent within the same output repo (override with --clobber-config)") %
                        (schemaDataset,))
            else:
                butler.put(catalog, schemaDataset)

    def writeMetadata(self, dataRef):
        """!Write the metadata produced from processing the data

        @param[in] dataRef  butler data reference used to write the metadata.
            The metadata is written to dataset type self._getMetadataName()
        """
        try:
            metadataName = self._getMetadataName()
            if metadataName is not None:
                dataRef.put(self.getFullMetadata(), metadataName)
        except Exception as e:
            self.log.warn("Could not persist metadata for dataId=%s: %s", dataRef.dataId, e)

    def writePackageVersions(self, butler, clobber=False, doBackup=True, dataset="packages"):
        """!Compare and write package versions

        We retrieve the persisted list of packages and compare with what we're currently using.
        We raise TaskError if there's a version mismatch.

        Note that this operation is subject to a race condition.

        @param[in] butler   data butler used to read/write the package versions
        @param[in] clobber  a boolean flag that controls what happens if versions already have been saved:
            - True: overwrite or rename the existing version info, depending on `doBackup`
            - False: raise TaskError if this version info does not match the existing
        @param[in] doBackup if clobbering, should we backup the old files?
        @param[in] dataset  name of dataset to read/write
        """
        packages = Packages.fromSystem()

        if clobber:
            return butler.put(packages, dataset, doBackup=doBackup)
        if not butler.datasetExists(dataset):
            return butler.put(packages, dataset)

        try:
            old = butler.get(dataset, immediate=True)
        except Exception as exc:
            raise type(exc)("Unable to read stored version dataset %s (%s); "
                            "consider using --clobber-versions or --no-versions" %
                            (dataset, exc))
        diff = packages.difference(old)
        if diff:
            raise TaskError(
                "Version mismatch (" +
                "; ".join("%s: %s vs %s" % (pkg, diff[pkg][1], diff[pkg][0]) for pkg in diff) +
                "); consider using --clobber-versions or --no-versions")
        # Update the old set of packages in case we have new packages that haven't been persisted
        extra = packages.extra(old)
        if extra:
            old.update(packages)
            butler.put(old, dataset, doBackup=doBackup)

    def _getConfigName(self):
        """!Return the name of the config dataset type, or None if config is not to be persisted

        @note The name may depend on the config; that is why this is not a class method.
        """
        return self._DefaultName + "_config"

    def _getMetadataName(self):
        """!Return the name of the metadata dataset type, or None if metadata is not to be persisted

        @note The name may depend on the config; that is why this is not a class method.
        """
        return self._DefaultName + "_metadata"