lsst.pipe.base  16.0-9-g41f434e+2
cmdLineTask.py
Go to the documentation of this file.
1 #
2 # LSST Data Management System
3 # Copyright 2008-2015 AURA/LSST.
4 #
5 # This product includes software developed by the
6 # LSST Project (http://www.lsst.org/).
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the LSST License Statement and
19 # the GNU General Public License along with this program. If not,
20 # see <https://www.lsstcorp.org/LegalNotices/>.
21 #
22 from __future__ import absolute_import, division
23 import sys
24 import traceback
25 import functools
26 import contextlib
27 
28 from builtins import str
29 from builtins import object
30 
31 import lsst.utils
32 from lsst.base import disableImplicitThreading
33 import lsst.afw.table as afwTable
34 from .task import Task, TaskError
35 from .struct import Struct
36 from .argumentParser import ArgumentParser
37 from lsst.base import Packages
38 from lsst.log import Log
39 
40 __all__ = ["CmdLineTask", "TaskRunner", "ButlerInitializedTaskRunner", "LegacyTaskRunner"]
41 
42 
43 def _runPool(pool, timeout, function, iterable):
44  """Wrapper around ``pool.map_async``, to handle timeout
45 
46  This is required so as to trigger an immediate interrupt on the KeyboardInterrupt (Ctrl-C); see
47  http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool
48  """
49  return pool.map_async(function, iterable).get(timeout)
50 
51 
@contextlib.contextmanager
def profile(filename, log=None):
    """Context manager for profiling with cProfile.

    Parameters
    ----------
    filename : `str`
        Filename to which to write profile (profiling disabled if `None` or empty).
    log : `lsst.log.Log`, optional
        Log object for logging the profile operations.

    Notes
    -----
    If profiling is enabled, the context manager returns the `cProfile.Profile`
    object (otherwise it returns `None`), which allows additional control over
    profiling. You can obtain this using the "as" clause, e.g.::

        with profile(filename) as prof:
            runYourCodeHere()

    The output cumulative profile can be printed with a command-line like::

        python -c 'import pstats; pstats.Stats("<filename>").sort_stats("cumtime").print_stats(30)'
    """
    if not filename:
        # Nothing to do
        yield
        return
    from cProfile import Profile
    profile = Profile()
    if log is not None:
        log.info("Enabling cProfile profiling")
    profile.enable()
    try:
        yield profile
    finally:
        # Always stop profiling and persist the stats, even if the managed
        # block raises; otherwise the profiler is left enabled and the
        # partial profile is silently lost.
        profile.disable()
        profile.dump_stats(filename)
        if log is not None:
            log.info("cProfile stats written to %s", filename)
89 
90 
91 class TaskRunner(object):
92  """Run a command-line task, using `multiprocessing` if requested.
93 
94  Parameters
95  ----------
96  TaskClass : `lsst.pipe.base.Task` subclass
97  The class of the task to run.
98  parsedCmd : `argparse.Namespace`
99  The parsed command-line arguments, as returned by the task's argument parser's
100  `~lsst.pipe.base.ArgumentParser.parse_args` method.
101 
102  .. warning::
103 
104  Do not store ``parsedCmd``, as this instance is pickled (if multiprocessing) and parsedCmd may
105  contain non-picklable elements. It certainly contains more data than we need to send to each
106  instance of the task.
107  doReturnResults : `bool`, optional
108  Should run return the collected result from each invocation of the task? This is only intended for
109  unit tests and similar use. It can easily exhaust memory (if the task returns enough data and you
110  call it enough times) and it will fail when using multiprocessing if the returned data cannot be
111  pickled.
112 
113  Note that even if ``doReturnResults`` is False a struct with a single member "exitStatus" is returned,
114  with value 0 or 1 to be returned to the unix shell.
115 
116  Raises
117  ------
118  ImportError
119  If multiprocessing is requested (and the task supports it) but the multiprocessing library cannot be
120  imported.
121 
122  Notes
123  -----
124  Each command-line task (subclass of `lsst.pipe.base.CmdLineTask`) has a task runner. By default it is this
125  class, but some tasks require a subclass. See the manual :ref:`creating-a-command-line-task` for more
126  information. See `CmdLineTask.parseAndRun` to see how a task runner is used.
127 
128  You may use this task runner for your command-line task if your task has a runDataRef method that takes
129  exactly one argument: a butler data reference. Otherwise you must provide a task-specific subclass of
130  this runner for your task's ``RunnerClass`` that overrides `TaskRunner.getTargetList` and possibly
131  `TaskRunner.__call__`. See `TaskRunner.getTargetList` for details.
132 
133  This design matches the common pattern for command-line tasks: the runDataRef method takes a single data
134  reference, of some suitable name. Additional arguments are rare, and if present, require a subclass of
135  `TaskRunner` that calls these additional arguments by name.
136 
137  Instances of this class must be picklable in order to be compatible with multiprocessing. If
138  multiprocessing is requested (``parsedCmd.numProcesses > 1``) then `runDataRef` calls
139  `prepareForMultiProcessing` to jettison optional non-picklable elements. If your task runner is not
140  compatible with multiprocessing then indicate this in your task by setting class variable
141  ``canMultiprocess=False``.
142 
143  Due to a `python bug`__, handling a `KeyboardInterrupt` properly `requires specifying a timeout`__. This
144  timeout (in sec) can be specified as the ``timeout`` element in the output from
145  `~lsst.pipe.base.ArgumentParser` (the ``parsedCmd``), if available, otherwise we use `TaskRunner.TIMEOUT`.
146 
147  By default, we disable "implicit" threading -- ie, as provided by underlying numerical libraries such as
148  MKL or BLAS. This is designed to avoid thread contention both when a single command line task spawns
149  multiple processes and when multiple users are running on a shared system. Users can override this
150  behaviour by setting the ``LSST_ALLOW_IMPLICIT_THREADS`` environment variable.
151 
152  .. __: http://bugs.python.org/issue8296
153  .. __: http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool
154  """
155 
156  TIMEOUT = 3600*24*30
157  """Default timeout (seconds) for multiprocessing."""
158 
159  def __init__(self, TaskClass, parsedCmd, doReturnResults=False):
160  self.TaskClass = TaskClass
161  self.doReturnResults = bool(doReturnResults)
162  self.config = parsedCmd.config
163  self.log = parsedCmd.log
164  self.doRaise = bool(parsedCmd.doraise)
165  self.clobberConfig = bool(parsedCmd.clobberConfig)
166  self.doBackup = not bool(parsedCmd.noBackupConfig)
167  self.numProcesses = int(getattr(parsedCmd, 'processes', 1))
168 
169  self.timeout = getattr(parsedCmd, 'timeout', None)
170  if self.timeout is None or self.timeout <= 0:
171  self.timeout = self.TIMEOUT
172 
173  if self.numProcesses > 1:
174  if not TaskClass.canMultiprocess:
175  self.log.warn("This task does not support multiprocessing; using one process")
176  self.numProcesses = 1
177 
179  """Prepare this instance for multiprocessing
180 
181  Optional non-picklable elements are removed.
182 
183  This is only called if the task is run under multiprocessing.
184  """
185  self.log = None
186 
187  def run(self, parsedCmd):
188  """Run the task on all targets.
189 
190  Parameters
191  ----------
192  parsedCmd : `argparse.Namespace`
193  Parsed command `argparse.Namespace`.
194 
195  Returns
196  -------
197  resultList : `list`
198  A list of results returned by `TaskRunner.__call__`, or an empty list if `TaskRunner.__call__`
199  is not called (e.g. if `TaskRunner.precall` returns `False`). See `TaskRunner.__call__`
200  for details.
201 
202  Notes
203  -----
204  The task is run under multiprocessing if `TaskRunner.numProcesses` is more than 1; otherwise
205  processing is serial.
206  """
207  resultList = []
208  disableImplicitThreading() # To prevent thread contention
209  if self.numProcesses > 1:
210  import multiprocessing
212  pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=1)
213  mapFunc = functools.partial(_runPool, pool, self.timeout)
214  else:
215  pool = None
216  mapFunc = map
217 
218  if self.precall(parsedCmd):
219  profileName = parsedCmd.profile if hasattr(parsedCmd, "profile") else None
220  log = parsedCmd.log
221  targetList = self.getTargetList(parsedCmd)
222  if len(targetList) > 0:
223  with profile(profileName, log):
224  # Run the task using self.__call__
225  resultList = list(mapFunc(self, targetList))
226  else:
227  log.warn("Not running the task because there is no data to process; "
228  "you may preview data using \"--show data\"")
229 
230  if pool is not None:
231  pool.close()
232  pool.join()
233 
234  return resultList
235 
236  @staticmethod
237  def getTargetList(parsedCmd, **kwargs):
238  """Get a list of (dataRef, kwargs) for `TaskRunner.__call__`.
239 
240  Parameters
241  ----------
242  parsedCmd : `argparse.Namespace`
243  The parsed command object returned by `lsst.pipe.base.argumentParser.ArgumentParser.parse_args`.
244  kwargs
245  Any additional keyword arguments. In the default `TaskRunner` this is an empty dict, but having
246  it simplifies overriding `TaskRunner` for tasks whose runDataRef method takes additional arguments
247  (see case (1) below).
248 
249  Notes
250  -----
251  The default implementation of `TaskRunner.getTargetList` and `TaskRunner.__call__` works for any
252  command-line task whose runDataRef method takes exactly one argument: a data reference. Otherwise you
253  must provide a variant of TaskRunner that overrides `TaskRunner.getTargetList` and possibly
254  `TaskRunner.__call__`. There are two cases.
255 
256  **Case 1**
257 
258  If your command-line task has a ``runDataRef`` method that takes one data reference followed by
259  additional arguments, then you need only override `TaskRunner.getTargetList` to return the additional
260  arguments as an argument dict. To make this easier, your overridden version of
261  `~TaskRunner.getTargetList` may call `TaskRunner.getTargetList` with the extra arguments as keyword
262  arguments. For example, the following adds an argument dict containing a single key: "calExpList",
263  whose value is the list of data IDs for the calexp ID argument::
264 
265  def getTargetList(parsedCmd):
266  return TaskRunner.getTargetList(
267  parsedCmd,
268  calExpList=parsedCmd.calexp.idList
269  )
270 
271  It is equivalent to this slightly longer version::
272 
273  @staticmethod
274  def getTargetList(parsedCmd):
275  argDict = dict(calExpList=parsedCmd.calexp.idList)
276  return [(dataId, argDict) for dataId in parsedCmd.id.idList]
277 
278  **Case 2**
279 
280  If your task does not meet condition (1) then you must override both TaskRunner.getTargetList and
281  `TaskRunner.__call__`. You may do this however you see fit, so long as `TaskRunner.getTargetList`
282  returns a list, each of whose elements is sent to `TaskRunner.__call__`, which runs your task.
283  """
284  return [(ref, kwargs) for ref in parsedCmd.id.refList]
285 
286  def makeTask(self, parsedCmd=None, args=None):
287  """Create a Task instance.
288 
289  Parameters
290  ----------
291  parsedCmd
292  Parsed command-line options (used for extra task args by some task runners).
293  args
294  Args tuple passed to `TaskRunner.__call__` (used for extra task arguments by some task runners).
295 
296  Notes
297  -----
298  ``makeTask`` can be called with either the ``parsedCmd`` argument or ``args`` argument set to None,
299  but it must construct identical Task instances in either case.
300 
301  Subclasses may ignore this method entirely if they reimplement both `TaskRunner.precall` and
302  `TaskRunner.__call__`.
303  """
304  return self.TaskClass(config=self.config, log=self.log)
305 
306  def _precallImpl(self, task, parsedCmd):
307  """The main work of `precall`.
308 
309  We write package versions, schemas and configs, or compare these to existing files on disk if present.
310  """
311  if not parsedCmd.noVersions:
312  task.writePackageVersions(parsedCmd.butler, clobber=parsedCmd.clobberVersions)
313  task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)
314  task.writeSchemas(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)
315 
316  def precall(self, parsedCmd):
317  """Hook for code that should run exactly once, before multiprocessing.
318 
319  Notes
320  -----
321  Must return True if `TaskRunner.__call__` should subsequently be called.
322 
323  .. warning::
324 
325  Implementations must take care to ensure that no unpicklable attributes are added to the
326  TaskRunner itself, for compatibility with multiprocessing.
327 
328  The default implementation writes package versions, schemas and configs, or compares them to existing
329  files on disk if present.
330  """
331  task = self.makeTask(parsedCmd=parsedCmd)
332 
333  if self.doRaise:
334  self._precallImpl(task, parsedCmd)
335  else:
336  try:
337  self._precallImpl(task, parsedCmd)
338  except Exception as e:
339  task.log.fatal("Failed in task initialization: %s", e)
340  if not isinstance(e, TaskError):
341  traceback.print_exc(file=sys.stderr)
342  return False
343  return True
344 
345  def __call__(self, args):
346  """Run the Task on a single target.
347 
348  Parameters
349  ----------
350  args
351  Arguments for Task.runDataRef()
352 
353  Returns
354  -------
355  struct : `lsst.pipe.base.Struct`
356  Contains these fields if ``doReturnResults`` is `True`:
357 
358  - ``dataRef``: the provided data reference.
359  - ``metadata``: task metadata after execution of run.
360  - ``result``: result returned by task run, or `None` if the task fails.
361  - ``exitStatus``: 0 if the task completed successfully, 1 otherwise.
362 
363  If ``doReturnResults`` is `False` the struct contains:
364 
365  - ``exitStatus``: 0 if the task completed successfully, 1 otherwise.
366 
367  Notes
368  -----
369  This default implementation assumes that the ``args`` is a tuple containing a data reference and a
370  dict of keyword arguments.
371 
372  .. warning::
373 
374  If you override this method and wish to return something when ``doReturnResults`` is `False`,
375  then it must be picklable to support multiprocessing and it should be small enough that pickling
376  and unpickling do not add excessive overhead.
377  """
378  dataRef, kwargs = args
379  if self.log is None:
380  self.log = Log.getDefaultLogger()
381  if hasattr(dataRef, "dataId"):
382  self.log.MDC("LABEL", str(dataRef.dataId))
383  elif isinstance(dataRef, (list, tuple)):
384  self.log.MDC("LABEL", str([ref.dataId for ref in dataRef if hasattr(ref, "dataId")]))
385  task = self.makeTask(args=args)
386  result = None # in case the task fails
387  exitStatus = 0 # exit status for the shell
388  if self.doRaise:
389  result = self.runTask(task, dataRef, kwargs)
390  else:
391  try:
392  result = self.runTask(task, dataRef, kwargs)
393  except Exception as e:
394  # The shell exit value will be the number of dataRefs returning
395  # non-zero, so the actual value used here is lost.
396  exitStatus = 1
397 
398  # don't use a try block as we need to preserve the original exception
399  eName = type(e).__name__
400  if hasattr(dataRef, "dataId"):
401  task.log.fatal("Failed on dataId=%s: %s: %s", dataRef.dataId, eName, e)
402  elif isinstance(dataRef, (list, tuple)):
403  task.log.fatal("Failed on dataIds=[%s]: %s: %s",
404  ", ".join(str(ref.dataId) for ref in dataRef), eName, e)
405  else:
406  task.log.fatal("Failed on dataRef=%s: %s: %s", dataRef, eName, e)
407 
408  if not isinstance(e, TaskError):
409  traceback.print_exc(file=sys.stderr)
410 
411  # Ensure all errors have been logged and aren't hanging around in a buffer
412  sys.stdout.flush()
413  sys.stderr.flush()
414 
415  task.writeMetadata(dataRef)
416 
417  # remove MDC so it does not show up outside of task context
418  self.log.MDCRemove("LABEL")
419 
420  if self.doReturnResults:
421  return Struct(
422  exitStatus=exitStatus,
423  dataRef=dataRef,
424  metadata=task.metadata,
425  result=result,
426  )
427  else:
428  return Struct(
429  exitStatus=exitStatus,
430  )
431 
432  def runTask(self, task, dataRef, kwargs):
433  """Make the actual call to `runDataRef` for this task.
434 
435  Parameters
436  ----------
437  task : `lsst.pipe.base.CmdLineTask` class
438  The class of the task to run.
439  dataRef
440  Butler data reference that contains the data the task will process.
441  kwargs
442  Any additional keyword arguments. See `TaskRunner.getTargetList` above.
443 
444  Notes
445  -----
446  The default implementation of `TaskRunner.runTask` works for any command-line task which has a
447  runDataRef method that takes a data reference and an optional set of additional keyword arguments.
448  This method returns the results generated by the task's `runDataRef` method.
449 
450  """
451  return task.runDataRef(dataRef, **kwargs)
452 
453 
455  """A `TaskRunner` for `CmdLineTask`\ s which calls the `Task`\ 's `run` method on a `dataRef` rather
456  than the `runDataRef` method.
457  """
458 
459  def runTask(self, task, dataRef, kwargs):
460  """Call `run` for this task instead of `runDataRef`. See `TaskRunner.runTask` above for details.
461  """
462  return task.run(dataRef, **kwargs)
463 
464 
466  """A `TaskRunner` for `CmdLineTask`\ s that require a ``butler`` keyword argument to be passed to
467  their constructor.
468  """
469 
470  def makeTask(self, parsedCmd=None, args=None):
471  """A variant of the base version that passes a butler argument to the task's constructor.
472 
473  Parameters
474  ----------
475  parsedCmd : `argparse.Namespace`
476  Parsed command-line options, as returned by the `~lsst.pipe.base.ArgumentParser`; if specified
477  then args is ignored.
478  args
479  Other arguments; if ``parsedCmd`` is `None` then this must be specified.
480 
481  Raises
482  ------
483  RuntimeError
484  Raised if ``parsedCmd`` and ``args`` are both `None`.
485  """
486  if parsedCmd is not None:
487  butler = parsedCmd.butler
488  elif args is not None:
489  dataRef, kwargs = args
490  butler = dataRef.butlerSubset.butler
491  else:
492  raise RuntimeError("parsedCmd or args must be specified")
493  return self.TaskClass(config=self.config, log=self.log, butler=butler)
494 
495 
497  """Base class for command-line tasks: tasks that may be executed from the command-line.
498 
499  Notes
500  -----
501  See :ref:`task-framework-overview` to learn what tasks are and :ref:`creating-a-command-line-task` for
502  more information about writing command-line tasks.
503 
504  Subclasses must specify the following class variables:
505 
506  - ``ConfigClass``: configuration class for your task (a subclass of `lsst.pex.config.Config`, or if your
507  task needs no configuration, then `lsst.pex.config.Config` itself).
508  - ``_DefaultName``: default name used for this task (a str).
509 
510  Subclasses may also specify the following class variables:
511 
512  - ``RunnerClass``: a task runner class. The default is ``TaskRunner``, which works for any task
513  with a runDataRef method that takes exactly one argument: a data reference. If your task does
514  not meet this requirement then you must supply a variant of ``TaskRunner``; see ``TaskRunner``
515  for more information.
516  - ``canMultiprocess``: the default is `True`; set `False` if your task does not support multiprocessing.
517 
518  Subclasses must specify a method named ``runDataRef``:
519 
520  - By default ``runDataRef`` accepts a single butler data reference, but you can specify an alternate
521  task runner (subclass of ``TaskRunner``) as the value of class variable ``RunnerClass`` if your run
522  method needs something else.
523  - ``runDataRef`` is expected to return its data in a `lsst.pipe.base.Struct`. This provides safety for
524  evolution of the task since new values may be added without harming existing code.
525  - The data returned by ``runDataRef`` must be picklable if your task is to support multiprocessing.
526  """
527  RunnerClass = TaskRunner
528  canMultiprocess = True
529 
530  @classmethod
531  def applyOverrides(cls, config):
532  """A hook to allow a task to change the values of its config *after* the camera-specific
533  overrides are loaded but before any command-line overrides are applied.
534 
535  Parameters
536  ----------
537  config : instance of task's ``ConfigClass``
538  Task configuration.
539 
540  Notes
541  -----
542  This is necessary in some cases because the camera-specific overrides may retarget subtasks,
543  wiping out changes made in ConfigClass.setDefaults. See LSST Trac ticket #2282 for more discussion.
544 
545  .. warning::
546 
547  This is called by CmdLineTask.parseAndRun; other ways of constructing a config will not apply
548  these overrides.
549  """
550  pass
551 
552  @classmethod
553  def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False):
554  """Parse an argument list and run the command.
555 
556  Parameters
557  ----------
558  args : `list`, optional
559  List of command-line arguments; if `None` use `sys.argv`.
560  config : `lsst.pex.config.Config`-type, optional
561  Config for task. If `None` use `Task.ConfigClass`.
562  log : `lsst.log.Log`-type, optional
563  Log. If `None` use the default log.
564  doReturnResults : `bool`, optional
565  If `True`, return the results of this task. Default is `False`. This is only intended for
566  unit tests and similar use. It can easily exhaust memory (if the task returns enough data and you
567  call it enough times) and it will fail when using multiprocessing if the returned data cannot be
568  pickled.
569 
570  Returns
571  -------
572  struct : `lsst.pipe.base.Struct`
573  Fields are:
574 
575  - ``argumentParser``: the argument parser.
576  - ``parsedCmd``: the parsed command returned by the argument parser's
577  `lsst.pipe.base.ArgumentParser.parse_args` method.
578  - ``taskRunner``: the task runner used to run the task (an instance of `Task.RunnerClass`).
579  - ``resultList``: results returned by the task runner's ``run`` method, one entry per invocation.
580  This will typically be a list of `None` unless ``doReturnResults`` is `True`;
581  see `Task.RunnerClass` (`TaskRunner` by default) for more information.
582 
583  Notes
584  -----
585  Calling this method with no arguments specified is the standard way to run a command-line task
586  from the command-line. For an example see ``pipe_tasks`` ``bin/makeSkyMap.py`` or almost any other
587  file in that directory.
588 
589  If one or more of the dataIds fails then this routine will exit (with a status giving the
590  number of failed dataIds) rather than returning this struct; this behaviour can be
591  overridden by specifying the ``--noExit`` command-line option.
592  """
593  if args is None:
594  commandAsStr = " ".join(sys.argv)
595  args = sys.argv[1:]
596  else:
597  commandAsStr = "{}{}".format(lsst.utils.get_caller_name(skip=1), tuple(args))
598 
599  argumentParser = cls._makeArgumentParser()
600  if config is None:
601  config = cls.ConfigClass()
602  parsedCmd = argumentParser.parse_args(config=config, args=args, log=log, override=cls.applyOverrides)
603  # print this message after parsing the command so the log is fully configured
604  parsedCmd.log.info("Running: %s", commandAsStr)
605 
606  taskRunner = cls.RunnerClass(TaskClass=cls, parsedCmd=parsedCmd, doReturnResults=doReturnResults)
607  resultList = taskRunner.run(parsedCmd)
608 
609  try:
610  nFailed = sum(((res.exitStatus != 0) for res in resultList))
611  except (TypeError, AttributeError) as e:
612  # NOTE: TypeError if resultList is None, AttributeError if it doesn't have exitStatus.
613  parsedCmd.log.warn("Unable to retrieve exit status (%s); assuming success", e)
614  nFailed = 0
615 
616  if nFailed > 0:
617  if parsedCmd.noExit:
618  parsedCmd.log.error("%d dataRefs failed; not exiting as --noExit was set", nFailed)
619  else:
620  sys.exit(nFailed)
621 
622  return Struct(
623  argumentParser=argumentParser,
624  parsedCmd=parsedCmd,
625  taskRunner=taskRunner,
626  resultList=resultList,
627  )
628 
629  @classmethod
630  def _makeArgumentParser(cls):
631  """Create and return an argument parser.
632 
633  Returns
634  -------
635  parser : `lsst.pipe.base.ArgumentParser`
636  The argument parser for this task.
637 
638  Notes
639  -----
640  By default this returns an `~lsst.pipe.base.ArgumentParser` with one ID argument named `--id` of
641  dataset type ``raw``.
642 
643  Your task subclass may need to override this method to change the dataset type or data ref level,
644  or to add additional data ID arguments. If you add additional data ID arguments or your task's
645  runDataRef method takes more than a single data reference then you will also have to provide a
646  task-specific task runner (see TaskRunner for more information).
647  """
648  parser = ArgumentParser(name=cls._DefaultName)
649  parser.add_id_argument(name="--id", datasetType="raw",
650  help="data IDs, e.g. --id visit=12345 ccd=1,2^0,3")
651  return parser
652 
653  def writeConfig(self, butler, clobber=False, doBackup=True):
654  """Write the configuration used for processing the data, or check that an existing
655  one is equal to the new one if present.
656 
657  Parameters
658  ----------
659  butler : `lsst.daf.persistence.Butler`
660  Data butler used to write the config. The config is written to dataset type
661  `CmdLineTask._getConfigName`.
662  clobber : `bool`, optional
663  A boolean flag that controls what happens if a config already has been saved:
664  - `True`: overwrite or rename the existing config, depending on ``doBackup``.
665  - `False`: raise `TaskError` if this config does not match the existing config.
666  doBackup : bool, optional
667  Set to `True` to backup the config files if clobbering.
668  """
669  configName = self._getConfigName()
670  if configName is None:
671  return
672  if clobber:
673  butler.put(self.config, configName, doBackup=doBackup)
674  elif butler.datasetExists(configName, write=True):
675  # this may be subject to a race condition; see #2789
676  try:
677  oldConfig = butler.get(configName, immediate=True)
678  except Exception as exc:
679  raise type(exc)("Unable to read stored config file %s (%s); consider using --clobber-config" %
680  (configName, exc))
681 
682  def logConfigMismatch(msg):
683  self.log.fatal("Comparing configuration: %s", msg)
684 
685  if not self.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
686  raise TaskError(
687  ("Config does not match existing task config %r on disk; tasks configurations " +
688  "must be consistent within the same output repo (override with --clobber-config)") %
689  (configName,))
690  else:
691  butler.put(self.config, configName)
692 
693  def writeSchemas(self, butler, clobber=False, doBackup=True):
694  """Write the schemas returned by `lsst.pipe.base.Task.getAllSchemaCatalogs`.
695 
696  Parameters
697  ----------
698  butler : `lsst.daf.persistence.Butler`
699  Data butler used to write the schema. Each schema is written to the dataset type specified as the
700  key in the dict returned by `~lsst.pipe.base.Task.getAllSchemaCatalogs`.
701  clobber : `bool`, optional
702  A boolean flag that controls what happens if a schema already has been saved:
703  - `True`: overwrite or rename the existing schema, depending on ``doBackup``.
704  - `False`: raise `TaskError` if this schema does not match the existing schema.
705  doBackup : `bool`, optional
706  Set to `True` to backup the schema files if clobbering.
707 
708  Notes
709  -----
710  If ``clobber`` is `False` and an existing schema does not match a current schema,
711  then some schemas may have been saved successfully and others may not, and there is no easy way to
712  tell which is which.
713  """
714  for dataset, catalog in self.getAllSchemaCatalogs().items():
715  schemaDataset = dataset + "_schema"
716  if clobber:
717  butler.put(catalog, schemaDataset, doBackup=doBackup)
718  elif butler.datasetExists(schemaDataset, write=True):
719  oldSchema = butler.get(schemaDataset, immediate=True).getSchema()
720  if not oldSchema.compare(catalog.getSchema(), afwTable.Schema.IDENTICAL):
721  raise TaskError(
722  ("New schema does not match schema %r on disk; schemas must be " +
723  " consistent within the same output repo (override with --clobber-config)") %
724  (dataset,))
725  else:
726  butler.put(catalog, schemaDataset)
727 
728  def writeMetadata(self, dataRef):
729  """Write the metadata produced from processing the data.
730 
731  Parameters
732  ----------
733  dataRef
734  Butler data reference used to write the metadata.
735  The metadata is written to dataset type `CmdLineTask._getMetadataName`.
736  """
737  try:
738  metadataName = self._getMetadataName()
739  if metadataName is not None:
740  dataRef.put(self.getFullMetadata(), metadataName)
741  except Exception as e:
742  self.log.warn("Could not persist metadata for dataId=%s: %s", dataRef.dataId, e)
743 
744  def writePackageVersions(self, butler, clobber=False, doBackup=True, dataset="packages"):
745  """Compare and write package versions.
746 
747  Parameters
748  ----------
749  butler : `lsst.daf.persistence.Butler`
750  Data butler used to read/write the package versions.
751  clobber : `bool`, optional
752  A boolean flag that controls what happens if versions already have been saved:
753  - `True`: overwrite or rename the existing version info, depending on ``doBackup``.
754  - `False`: raise `TaskError` if this version info does not match the existing.
755  doBackup : `bool`, optional
756  If `True` and clobbering, old package version files are backed up.
757  dataset : `str`, optional
758  Name of dataset to read/write.
759 
760  Raises
761  ------
762  TaskError
763  Raised if there is a version mismatch with current and persisted lists of package versions.
764 
765  Notes
766  -----
767  Note that this operation is subject to a race condition.
768  """
769  packages = Packages.fromSystem()
770 
771  if clobber:
772  return butler.put(packages, dataset, doBackup=doBackup)
773  if not butler.datasetExists(dataset, write=True):
774  return butler.put(packages, dataset)
775 
776  try:
777  old = butler.get(dataset, immediate=True)
778  except Exception as exc:
779  raise type(exc)("Unable to read stored version dataset %s (%s); "
780  "consider using --clobber-versions or --no-versions" %
781  (dataset, exc))
782  # Note that because we can only detect python modules that have been imported, the stored
783  # list of products may be more or less complete than what we have now. What's important is
784  # that the products that are in common have the same version.
785  diff = packages.difference(old)
786  if diff:
787  raise TaskError(
788  "Version mismatch (" +
789  "; ".join("%s: %s vs %s" % (pkg, diff[pkg][1], diff[pkg][0]) for pkg in diff) +
790  "); consider using --clobber-versions or --no-versions")
791  # Update the old set of packages in case we have more packages that haven't been persisted.
792  extra = packages.extra(old)
793  if extra:
794  old.update(packages)
795  butler.put(old, dataset, doBackup=doBackup)
796 
797  def _getConfigName(self):
798  """Get the name of the config dataset type, or `None` if config is not to be persisted.
799 
800  Notes
801  -----
802  The name may depend on the config; that is why this is not a class method.
803  """
804  return self._DefaultName + "_config"
805 
806  def _getMetadataName(self):
807  """Get the name of the metadata dataset type, or `None` if metadata is not to be persisted.
808 
809  Notes
810  -----
811  The name may depend on the config; that is why this is not a class method.
812  """
813  return self._DefaultName + "_metadata"
def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False)
Definition: cmdLineTask.py:553
def _precallImpl(self, task, parsedCmd)
Definition: cmdLineTask.py:306
def runTask(self, task, dataRef, kwargs)
Definition: cmdLineTask.py:432
def getFullMetadata(self)
Definition: task.py:213
def writePackageVersions(self, butler, clobber=False, doBackup=True, dataset="packages")
Definition: cmdLineTask.py:744
def getAllSchemaCatalogs(self)
Definition: task.py:191
def writeSchemas(self, butler, clobber=False, doBackup=True)
Definition: cmdLineTask.py:693
def makeTask(self, parsedCmd=None, args=None)
Definition: cmdLineTask.py:286
def __init__(self, TaskClass, parsedCmd, doReturnResults=False)
Definition: cmdLineTask.py:159
def profile(filename, log=None)
Definition: cmdLineTask.py:53
def makeTask(self, parsedCmd=None, args=None)
Definition: cmdLineTask.py:470
def getTargetList(parsedCmd, kwargs)
Definition: cmdLineTask.py:237
def writeConfig(self, butler, clobber=False, doBackup=True)
Definition: cmdLineTask.py:653
def runTask(self, task, dataRef, kwargs)
Definition: cmdLineTask.py:459