from __future__ import absolute_import, division

import sys
import traceback
import functools
import contextlib

from builtins import str
from builtins import object

import lsst.utils
from lsst.base import disableImplicitThreading
import lsst.afw.table as afwTable
from .task import Task, TaskError
from .struct import Struct
from .argumentParser import ArgumentParser
from lsst.base import Packages
from lsst.log import Log
__all__ = ["CmdLineTask", "TaskRunner", "ButlerInitializedTaskRunner"]
def _poolFunctionWrapper(function, arg):
    """Wrapper around function to catch exceptions that don't inherit from Exception.

    Such exceptions aren't caught by multiprocessing, which causes the slave
    process to crash and you end up hitting the timeout.
    """
    try:
        return function(arg)
    except Exception:
        raise  # No worries
    except:
        # Need to wrap the exception with something multiprocessing will recognise
        cls, exc, tb = sys.exc_info()
        log = Log.getDefaultLogger()
        log.warn("Unhandled exception %s (%s):\n%s" % (cls.__name__, exc, traceback.format_exc()))
        raise Exception("Unhandled exception: %s (%s)" % (cls.__name__, exc))
def _runPool(pool, timeout, function, iterable):
    """Wrapper around pool.map_async, to handle timeout.

    This is required so as to trigger an immediate interrupt on the KeyboardInterrupt (Ctrl-C); see
    http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool

    Further wraps the function in _poolFunctionWrapper to catch exceptions
    that don't inherit from Exception.
    """
    return pool.map_async(functools.partial(_poolFunctionWrapper, function), iterable).get(timeout)
@contextlib.contextmanager
def profile(filename, log=None):
    """!Context manager for profiling with cProfile

    @param filename     filename to which to write profile (profiling disabled if None or empty)
    @param log          log object for logging the profile operations

    If profiling is enabled, the context manager returns the cProfile.Profile object (otherwise
    it returns None), which allows additional control over profiling. You can obtain this using
    the "as" clause, e.g.:

        with profile(filename) as prof:
            runYourCodeHere()

    The output cumulative profile can be printed with a command-line like:

        python -c 'import pstats; pstats.Stats("<filename>").sort_stats("cumtime").print_stats(30)'
    """
    if not filename:
        # Nothing to do
        yield
        return
    from cProfile import Profile
    profile = Profile()
    if log is not None:
        log.info("Enabling cProfile profiling")
    profile.enable()
    yield profile
    profile.disable()
    profile.dump_stats(filename)
    if log is not None:
        log.info("cProfile stats written to %s" % filename)
108 """!Run a command-line task, using multiprocessing if requested.
110 Each command-line task (subclass of CmdLineTask) has a task runner. By default it is
111 this class, but some tasks require a subclass. See the manual "how to write a command-line task"
112 in the pipe_tasks documentation for more information.
113 See CmdLineTask.parseAndRun to see how a task runner is used.
115 You may use this task runner for your command-line task if your task has a run method
116 that takes exactly one argument: a butler data reference. Otherwise you must
117 provide a task-specific subclass of this runner for your task's `RunnerClass`
118 that overrides TaskRunner.getTargetList and possibly TaskRunner.\_\_call\_\_.
119 See TaskRunner.getTargetList for details.
121 This design matches the common pattern for command-line tasks: the run method takes a single
122 data reference, of some suitable name. Additional arguments are rare, and if present, require
123 a subclass of TaskRunner that calls these additional arguments by name.
125 Instances of this class must be picklable in order to be compatible with multiprocessing.
126 If multiprocessing is requested (parsedCmd.numProcesses > 1) then run() calls prepareForMultiProcessing
127 to jettison optional non-picklable elements. If your task runner is not compatible with multiprocessing
128 then indicate this in your task by setting class variable canMultiprocess=False.
130 Due to a python bug [1], handling a KeyboardInterrupt properly requires specifying a timeout [2]. This
131 timeout (in sec) can be specified as the "timeout" element in the output from ArgumentParser
132 (the "parsedCmd"), if available, otherwise we use TaskRunner.TIMEOUT_DEFAULT.
134 [1] http://bugs.python.org/issue8296
135 [2] http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool)
    def __init__(self, TaskClass, parsedCmd, doReturnResults=False):
        """!Construct a TaskRunner

        @warning Do not store parsedCmd, as this instance is pickled (if multiprocessing) and parsedCmd may
        contain non-picklable elements. It certainly contains more data than we need to send to each
        instance of the task.

        @param TaskClass    The class of the task to run
        @param parsedCmd    The parsed command-line arguments, as returned by the task's argument parser's
            parse_args method
        @param doReturnResults    Should run return the collected result from each invocation of the task?
            This is only intended for unit tests and similar use.
            It can easily exhaust memory (if the task returns enough data and you call it enough times)
            and it will fail when using multiprocessing if the returned data cannot be pickled.

        @throws ImportError if multiprocessing requested (and the task supports it)
        but the multiprocessing library cannot be imported.
        """
        self.TaskClass = TaskClass
        self.doReturnResults = bool(doReturnResults)
        self.config = parsedCmd.config
        self.log = parsedCmd.log
        self.doRaise = bool(parsedCmd.doraise)
        self.clobberConfig = bool(parsedCmd.clobberConfig)
        self.doBackup = not bool(parsedCmd.noBackupConfig)
        self.numProcesses = int(getattr(parsedCmd, 'processes', 1))
        self.timeout = getattr(parsedCmd, 'timeout', None)
        if self.numProcesses > 1:
            if not TaskClass.canMultiprocess:
                self.log.warn("This task does not support multiprocessing; using one process")
                self.numProcesses = 1

    def prepareForMultiProcessing(self):
        """!Prepare this instance for multiprocessing by removing optional non-picklable elements.

        This is only called if the task is run under multiprocessing.
        """
        self.log = None
183 """!Run the task on all targets.
185 The task is run under multiprocessing if numProcesses > 1; otherwise processing is serial.
187 @return a list of results returned by TaskRunner.\_\_call\_\_, or an empty list if
188 TaskRunner.\_\_call\_\_ is not called (e.g. if TaskRunner.precall returns `False`).
189 See TaskRunner.\_\_call\_\_ for details.
193 disableImplicitThreading()
194 import multiprocessing
196 pool = multiprocessing.Pool(processes=self.
numProcesses, maxtasksperchild=1)
197 mapFunc = functools.partial(_runPool, pool, self.
timeout)
203 profileName = parsedCmd.profile
if hasattr(parsedCmd,
"profile")
else None
206 if len(targetList) > 0:
207 with
profile(profileName, log):
209 resultList = list(mapFunc(self, targetList))
211 log.warn(
"Not running the task because there is no data to process; "
212 "you may preview data using \"--show data\"")
222 """!Return a list of (dataRef, kwargs) to be used as arguments for TaskRunner.\_\_call\_\_.
224 @param parsedCmd the parsed command object (an argparse.Namespace) returned by
225 \ref argumentParser.ArgumentParser.parse_args "ArgumentParser.parse_args".
226 @param **kwargs any additional keyword arguments. In the default TaskRunner
227 this is an empty dict, but having it simplifies overriding TaskRunner for tasks
228 whose run method takes additional arguments (see case (1) below).
230 The default implementation of TaskRunner.getTargetList and TaskRunner.\_\_call\_\_ works for any
231 command-line task whose run method takes exactly one argument: a data reference.
232 Otherwise you must provide a variant of TaskRunner that overrides TaskRunner.getTargetList
233 and possibly TaskRunner.\_\_call\_\_. There are two cases:
235 (1) If your command-line task has a `run` method that takes one data reference followed by additional
236 arguments, then you need only override TaskRunner.getTargetList to return the additional arguments as
237 an argument dict. To make this easier, your overridden version of getTargetList may call
238 TaskRunner.getTargetList with the extra arguments as keyword arguments. For example,
239 the following adds an argument dict containing a single key: "calExpList", whose value is the list
240 of data IDs for the calexp ID argument:
244 def getTargetList(parsedCmd):
245 return TaskRunner.getTargetList(parsedCmd, calExpList=parsedCmd.calexp.idList)
248 It is equivalent to this slightly longer version:
252 def getTargetList(parsedCmd):
253 argDict = dict(calExpList=parsedCmd.calexp.idList)
254 return [(dataId, argDict) for dataId in parsedCmd.id.idList]
257 (2) If your task does not meet condition (1) then you must override both TaskRunner.getTargetList
258 and TaskRunner.\_\_call\_\_. You may do this however you see fit, so long as TaskRunner.getTargetList
259 returns a list, each of whose elements is sent to TaskRunner.\_\_call\_\_, which runs your task.
261 return [(ref, kwargs)
for ref
in parsedCmd.id.refList]
264 """!Create a Task instance
266 @param[in] parsedCmd parsed command-line options (used for extra task args by some task runners)
267 @param[in] args args tuple passed to TaskRunner.\_\_call\_\_ (used for extra task arguments
268 by some task runners)
270 makeTask() can be called with either the 'parsedCmd' argument or 'args' argument set to None,
271 but it must construct identical Task instances in either case.
273 Subclasses may ignore this method entirely if they reimplement both TaskRunner.precall and
274 TaskRunner.\_\_call\_\_
    def _precallImpl(self, task, parsedCmd):
        """The main work of 'precall'

        We write package versions, schemas and configs, or compare these to existing
        files on disk if present.
        """
        if not parsedCmd.noVersions:
            task.writePackageVersions(parsedCmd.butler, clobber=parsedCmd.clobberVersions)
        task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)
        task.writeSchemas(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)
290 """!Hook for code that should run exactly once, before multiprocessing is invoked.
292 Must return True if TaskRunner.\_\_call\_\_ should subsequently be called.
294 @warning Implementations must take care to ensure that no unpicklable attributes are added to
295 the TaskRunner itself, for compatibility with multiprocessing.
297 The default implementation writes package versions, schemas and configs, or compares
298 them to existing files on disk if present.
300 task = self.
makeTask(parsedCmd=parsedCmd)
307 except Exception
as e:
308 task.log.fatal(
"Failed in task initialization: %s", e)
309 if not isinstance(e, TaskError):
310 traceback.print_exc(file=sys.stderr)
315 """!Run the Task on a single target.
317 This default implementation assumes that the 'args' is a tuple
318 containing a data reference and a dict of keyword arguments.
320 @warning if you override this method and wish to return something when
321 doReturnResults is false, then it must be picklable to support
322 multiprocessing and it should be small enough that pickling and
323 unpickling do not add excessive overhead.
325 @param args Arguments for Task.run()
328 - None if doReturnResults false
329 - A pipe_base Struct containing these fields if doReturnResults true:
330 - dataRef: the provided data reference
331 - metadata: task metadata after execution of run
332 - result: result returned by task run, or None if the task fails
334 dataRef, kwargs = args
336 self.
log = Log.getDefaultLogger()
337 if hasattr(dataRef,
"dataId"):
338 self.log.MDC(
"LABEL", str(dataRef.dataId))
339 elif isinstance(dataRef, (list, tuple)):
340 self.log.MDC(
"LABEL", str([ref.dataId
for ref
in dataRef
if hasattr(ref,
"dataId")]))
344 result = task.run(dataRef, **kwargs)
347 result = task.run(dataRef, **kwargs)
348 except Exception
as e:
350 if hasattr(dataRef,
"dataId"):
351 task.log.fatal(
"Failed on dataId=%s: %s", dataRef.dataId, e)
352 elif isinstance(dataRef, (list, tuple)):
353 task.log.fatal(
"Failed on dataId=[%s]: %s",
354 ", ".join(str(ref.dataId)
for ref
in dataRef), e)
356 task.log.fatal(
"Failed on dataRef=%s: %s", dataRef, e)
358 if not isinstance(e, TaskError):
359 traceback.print_exc(file=sys.stderr)
360 task.writeMetadata(dataRef)
365 metadata=task.metadata,
371 """!A TaskRunner for CmdLineTasks that require a 'butler' keyword argument to be passed to
376 """!A variant of the base version that passes a butler argument to the task's constructor
378 @param[in] parsedCmd parsed command-line options, as returned by the argument parser;
379 if specified then args is ignored
380 @param[in] args other arguments; if parsedCmd is None then this must be specified
382 @throw RuntimeError if parsedCmd and args are both None
384 if parsedCmd
is not None:
385 butler = parsedCmd.butler
386 elif args
is not None:
387 dataRef, kwargs = args
388 butler = dataRef.butlerSubset.butler
390 raise RuntimeError(
"parsedCmd or args must be specified")
395 """!Base class for command-line tasks: tasks that may be executed from the command line
397 See \ref pipeBase_introduction "pipe_base introduction" to learn what tasks are,
398 and \ref pipeTasks_writeCmdLineTask "how to write a command-line task" for more information
399 about writing command-line tasks.
400 If the second link is broken (as it will be before the documentation is cross-linked)
401 then look at the main page of pipe_tasks documentation for a link.
403 Subclasses must specify the following class variables:
404 * ConfigClass: configuration class for your task (a subclass of \ref lsst.pex.config.config.Config
405 "lsst.pex.config.Config", or if your task needs no configuration, then
406 \ref lsst.pex.config.config.Config "lsst.pex.config.Config" itself)
407 * _DefaultName: default name used for this task (a str)
409 Subclasses may also specify the following class variables:
410 * RunnerClass: a task runner class. The default is TaskRunner, which works for any task
411 with a run method that takes exactly one argument: a data reference. If your task does
412 not meet this requirement then you must supply a variant of TaskRunner; see TaskRunner
413 for more information.
414 * canMultiprocess: the default is True; set False if your task does not support multiprocessing.
416 Subclasses must specify a method named "run":
417 - By default `run` accepts a single butler data reference, but you can specify an alternate task runner
418 (subclass of TaskRunner) as the value of class variable `RunnerClass` if your run method needs
420 - `run` is expected to return its data in a Struct. This provides safety for evolution of the task
421 since new values may be added without harming existing code.
422 - The data returned by `run` must be picklable if your task is to support multiprocessing.
424 RunnerClass = TaskRunner
425 canMultiprocess =
True
429 """!A hook to allow a task to change the values of its config *after* the camera-specific
430 overrides are loaded but before any command-line overrides are applied.
432 This is necessary in some cases because the camera-specific overrides may retarget subtasks,
433 wiping out changes made in ConfigClass.setDefaults. See LSST Trac ticket #2282 for more discussion.
435 @warning This is called by CmdLineTask.parseAndRun; other ways of constructing a config
436 will not apply these overrides.
438 @param[in] cls the class object
439 @param[in] config task configuration (an instance of cls.ConfigClass)
    @classmethod
    def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False):
        """!Parse an argument list and run the command

        Calling this method with no arguments specified is the standard way to run a command-line task
        from the command line. For an example see pipe_tasks `bin/makeSkyMap.py` or almost any other
        file in that directory.

        @param cls      the class object
        @param args     list of command-line arguments; if `None` use sys.argv
        @param config   config for task (instance of pex_config Config); if `None` use cls.ConfigClass()
        @param log      log (instance of lsst.log.Log); if `None` use the default log
        @param doReturnResults    Return the collected results from each invocation of the task?
            This is only intended for unit tests and similar use.
            It can easily exhaust memory (if the task returns enough data and you call it enough times)
            and it will fail when using multiprocessing if the returned data cannot be pickled.

        @return a Struct containing:
        - argumentParser: the argument parser
        - parsedCmd: the parsed command returned by the argument parser's parse_args method
        - taskRunner: the task runner used to run the task (an instance of cls.RunnerClass)
        - resultList: results returned by the task runner's run method, one entry per invocation.
            This will typically be a list of `None` unless doReturnResults is `True`;
            see cls.RunnerClass (TaskRunner by default) for more information.
        """
        if args is None:
            commandAsStr = " ".join(sys.argv)
            args = sys.argv[1:]
        else:
            commandAsStr = "{}{}".format(lsst.utils.get_caller_name(skip=1), tuple(args))

        argumentParser = cls._makeArgumentParser()
        if config is None:
            config = cls.ConfigClass()
        parsedCmd = argumentParser.parse_args(config=config, args=args, log=log, override=cls.applyOverrides)
        # print this message after parsing the command so the log is fully configured
        parsedCmd.log.info("Running: %s", commandAsStr)

        taskRunner = cls.RunnerClass(TaskClass=cls, parsedCmd=parsedCmd, doReturnResults=doReturnResults)
        resultList = taskRunner.run(parsedCmd)
        return Struct(
            argumentParser=argumentParser,
            parsedCmd=parsedCmd,
            taskRunner=taskRunner,
            resultList=resultList,
        )
    @classmethod
    def _makeArgumentParser(cls):
        """!Create and return an argument parser

        @param[in] cls      the class object
        @return the argument parser for this task.

        By default this returns an ArgumentParser with one ID argument named `--id` of dataset type "raw".

        Your task subclass may need to override this method to change the dataset type or data ref level,
        or to add additional data ID arguments. If you add additional data ID arguments or your task's
        run method takes more than a single data reference then you will also have to provide a task-specific
        task runner (see TaskRunner for more information).
        """
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument(name="--id", datasetType="raw",
                               help="data IDs, e.g. --id visit=12345 ccd=1,2^0,3")
        return parser
510 """!Write the configuration used for processing the data, or check that an existing
511 one is equal to the new one if present.
513 @param[in] butler data butler used to write the config.
514 The config is written to dataset type self._getConfigName()
515 @param[in] clobber a boolean flag that controls what happens if a config already has been saved:
516 - True: overwrite or rename the existing config, depending on `doBackup`
517 - False: raise TaskError if this config does not match the existing config
518 @param[in] doBackup if clobbering, should we backup the old files?
521 if configName
is None:
524 butler.put(self.config, configName, doBackup=doBackup)
525 elif butler.datasetExists(configName):
528 oldConfig = butler.get(configName, immediate=
True)
529 except Exception
as exc:
530 raise type(exc)(
"Unable to read stored config file %s (%s); consider using --clobber-config" %
533 def logConfigMismatch(msg):
534 self.log.fatal(
"Comparing configuration: %s", msg)
536 if not self.config.compare(oldConfig, shortcut=
False, output=logConfigMismatch):
538 (
"Config does not match existing task config %r on disk; tasks configurations " +
539 "must be consistent within the same output repo (override with --clobber-config)") %
542 butler.put(self.config, configName)
545 """!Write the schemas returned by \ref task.Task.getAllSchemaCatalogs "getAllSchemaCatalogs"
547 @param[in] butler data butler used to write the schema.
548 Each schema is written to the dataset type specified as the key in the dict returned by
549 \ref task.Task.getAllSchemaCatalogs "getAllSchemaCatalogs".
550 @param[in] clobber a boolean flag that controls what happens if a schema already has been saved:
551 - True: overwrite or rename the existing schema, depending on `doBackup`
552 - False: raise TaskError if this schema does not match the existing schema
553 @param[in] doBackup if clobbering, should we backup the old files?
555 @warning if clobber is False and an existing schema does not match a current schema,
556 then some schemas may have been saved successfully and others may not, and there is no easy way to
559 for dataset, catalog
in self.getAllSchemaCatalogs().items():
560 schemaDataset = dataset +
"_schema"
562 butler.put(catalog, schemaDataset, doBackup=doBackup)
563 elif butler.datasetExists(schemaDataset):
564 oldSchema = butler.get(schemaDataset, immediate=
True).getSchema()
565 if not oldSchema.compare(catalog.getSchema(), afwTable.Schema.IDENTICAL):
567 (
"New schema does not match schema %r on disk; schemas must be " +
568 " consistent within the same output repo (override with --clobber-config)") %
571 butler.put(catalog, schemaDataset)
574 """!Write the metadata produced from processing the data
576 @param[in] dataRef butler data reference used to write the metadata.
577 The metadata is written to dataset type self._getMetadataName()
581 if metadataName
is not None:
582 dataRef.put(self.getFullMetadata(), metadataName)
583 except Exception
as e:
584 self.log.warn(
"Could not persist metadata for dataId=%s: %s", dataRef.dataId, e)
587 """!Compare and write package versions
589 We retrieve the persisted list of packages and compare with what we're currently using.
590 We raise TaskError if there's a version mismatch.
592 Note that this operation is subject to a race condition.
594 @param[in] butler data butler used to read/write the package versions
595 @param[in] clobber a boolean flag that controls what happens if versions already have been saved:
596 - True: overwrite or rename the existing version info, depending on `doBackup`
597 - False: raise TaskError if this version info does not match the existing
598 @param[in] doBackup if clobbering, should we backup the old files?
599 @param[in] dataset name of dataset to read/write
601 packages = Packages.fromSystem()
604 return butler.put(packages, dataset, doBackup=doBackup)
605 if not butler.datasetExists(dataset):
606 return butler.put(packages, dataset)
609 old = butler.get(dataset, immediate=
True)
610 except Exception
as exc:
611 raise type(exc)(
"Unable to read stored version dataset %s (%s); "
612 "consider using --clobber-versions or --no-versions" %
617 diff = packages.difference(old)
620 "Version mismatch (" +
621 "; ".join(
"%s: %s vs %s" % (pkg, diff[pkg][1], diff[pkg][0])
for pkg
in diff) +
622 "); consider using --clobber-versions or --no-versions")
624 extra = packages.extra(old)
627 butler.put(old, dataset, doBackup=doBackup)
    def _getConfigName(self):
        """!Return the name of the config dataset type, or None if config is not to be persisted

        @note The name may depend on the config; that is why this is not a class method.
        """
        return self._DefaultName + "_config"

    def _getMetadataName(self):
        """!Return the name of the metadata dataset type, or None if metadata is not to be persisted

        @note The name may depend on the config; that is why this is not a class method.
        """
        return self._DefaultName + "_metadata"