lsst.pipe.base  14.0-6-ge2c9487+54
cmdLineTask.py
Go to the documentation of this file.
1 #
2 # LSST Data Management System
3 # Copyright 2008-2015 AURA/LSST.
4 #
5 # This product includes software developed by the
6 # LSST Project (http://www.lsst.org/).
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the LSST License Statement and
19 # the GNU General Public License along with this program. If not,
20 # see <https://www.lsstcorp.org/LegalNotices/>.
21 #
22 from __future__ import absolute_import, division
23 import sys
24 import traceback
25 import functools
26 import contextlib
27 
28 from builtins import str
29 from builtins import object
30 
31 import lsst.utils
32 from lsst.base import disableImplicitThreading
33 import lsst.afw.table as afwTable
34 from .task import Task, TaskError
35 from .struct import Struct
36 from .argumentParser import ArgumentParser
37 from lsst.base import Packages
38 from lsst.log import Log
39 
40 __all__ = ["CmdLineTask", "TaskRunner", "ButlerInitializedTaskRunner"]
41 
42 
43 def _poolFunctionWrapper(function, arg):
44  """Wrapper around function to catch exceptions that don't inherit from `Exception`.
45 
46  Such exceptions aren't caught by multiprocessing, which causes the slave process to crash and you end up
47  hitting the timeout.
48  """
49  try:
50  return function(arg)
51  except Exception:
52  raise # No worries
53  except:
54  # Need to wrap the exception with something multiprocessing will recognise
55  cls, exc, tb = sys.exc_info()
56  log = Log.getDefaultLogger()
57  log.warn("Unhandled exception %s (%s):\n%s" % (cls.__name__, exc, traceback.format_exc()))
58  raise Exception("Unhandled exception: %s (%s)" % (cls.__name__, exc))
59 
60 
def _runPool(pool, timeout, function, iterable):
    """Apply ``function`` over ``iterable`` via ``pool.map_async``, with a timeout.

    Using ``map_async(...).get(timeout)`` rather than a plain ``map`` is required
    so that a KeyboardInterrupt (Ctrl-C) triggers an immediate interrupt; see
    http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool

    The function is additionally wrapped in `_poolFunctionWrapper` so that
    exceptions not derived from `Exception` are converted before crossing
    process boundaries.
    """
    wrapped = functools.partial(_poolFunctionWrapper, function)
    return pool.map_async(wrapped, iterable).get(timeout)
71 
72 
@contextlib.contextmanager
def profile(filename, log=None):
    """Context manager for profiling with cProfile.

    Parameters
    ----------
    filename : `str`
        Filename to which to write profile (profiling disabled if `None` or empty).
    log : `lsst.log.Log`, optional
        Log object for logging the profile operations.

    If profiling is enabled, the context manager returns the cProfile.Profile object (otherwise
    it returns None), which allows additional control over profiling. You can obtain this using
    the "as" clause, e.g.:

        with profile(filename) as prof:
            runYourCodeHere()

    The output cumulative profile can be printed with a command-line like::

        python -c 'import pstats; pstats.Stats("<filename>").sort_stats("cumtime").print_stats(30)'
    """
    if not filename:
        # Profiling is disabled: yield a no-op context (the "as" target is None).
        yield
        return
    from cProfile import Profile
    # Named "profiler" rather than "profile" to avoid shadowing this function.
    profiler = Profile()
    if log is not None:
        log.info("Enabling cProfile profiling")
    profiler.enable()
    try:
        yield profiler
    finally:
        # Always stop profiling and persist the stats, even if the profiled
        # code raised; previously an exception skipped disable()/dump_stats()
        # and the collected profile was lost.
        profiler.disable()
        profiler.dump_stats(filename)
        if log is not None:
            log.info("cProfile stats written to %s" % filename)
110 
111 
class TaskRunner(object):
    """Run a command-line task, using `multiprocessing` if requested.

    Parameters
    ----------
    TaskClass : `lsst.pipe.base.Task` subclass
        The class of the task to run.
    parsedCmd : `argparse.Namespace`
        The parsed command-line arguments, as returned by the task's argument parser's
        `~lsst.pipe.base.ArgumentParser.parse_args` method.

        .. warning::

            Do not store ``parsedCmd``: this instance is pickled (if multiprocessing) and ``parsedCmd``
            may contain non-picklable elements; it certainly holds more data than each task
            invocation needs.
    doReturnResults : `bool`, optional
        Should run return the collected result from each invocation of the task? This is only intended
        for unit tests and similar use: it can easily exhaust memory (if the task returns enough data
        and you call it enough times) and it will fail when using multiprocessing if the returned data
        cannot be pickled.

        Even when ``doReturnResults`` is False, a struct with a single member "exitStatus" is
        returned, with value 0 or 1 to be returned to the unix shell.

    Raises
    ------
    ImportError
        If multiprocessing is requested (and the task supports it) but the multiprocessing library
        cannot be imported.

    Notes
    -----
    Each command-line task (subclass of `lsst.pipe.base.CmdLineTask`) has a task runner; by default it
    is this class. This default works for any task whose ``run`` method takes exactly one argument: a
    butler data reference. Otherwise provide a task-specific subclass as the task's ``RunnerClass``,
    overriding `TaskRunner.getTargetList` and possibly `TaskRunner.__call__`; see
    `TaskRunner.getTargetList` for details and `CmdLineTask.parseAndRun` for how a runner is used.

    Instances of this class must be picklable to be compatible with multiprocessing. If
    multiprocessing is requested (``parsedCmd.numProcesses > 1``) then `run` calls
    `prepareForMultiProcessing` to jettison optional non-picklable elements. If your task runner is
    not compatible with multiprocessing, set class variable ``canMultiprocess=False`` on the task.

    Due to a python bug (http://bugs.python.org/issue8296), handling a `KeyboardInterrupt` properly
    requires specifying a timeout (see
    http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool).
    The timeout (in sec) is taken from the ``timeout`` element of ``parsedCmd`` when available and
    positive; otherwise `TaskRunner.TIMEOUT` is used.
    """

    TIMEOUT = 3600*24*30
    """Default timeout (seconds) for multiprocessing."""

    def __init__(self, TaskClass, parsedCmd, doReturnResults=False):
        self.TaskClass = TaskClass
        self.doReturnResults = bool(doReturnResults)
        # Copy out only the pieces of parsedCmd we need; the namespace itself
        # must not be stored (see the class docstring warning).
        self.config = parsedCmd.config
        self.log = parsedCmd.log
        self.doRaise = bool(parsedCmd.doraise)
        self.clobberConfig = bool(parsedCmd.clobberConfig)
        self.doBackup = not bool(parsedCmd.noBackupConfig)
        self.numProcesses = int(getattr(parsedCmd, 'processes', 1))

        # A missing or non-positive timeout falls back to the (long) default.
        self.timeout = getattr(parsedCmd, 'timeout', None)
        if self.timeout is None or self.timeout <= 0:
            self.timeout = self.TIMEOUT

        if self.numProcesses > 1 and not TaskClass.canMultiprocess:
            self.log.warn("This task does not support multiprocessing; using one process")
            self.numProcesses = 1
192 
194  """Prepare this instance for multiprocessing
195 
196  Optional non-picklable elements are removed.
197 
198  This is only called if the task is run under multiprocessing.
199  """
200  self.log = None
201 
202  def run(self, parsedCmd):
203  """Run the task on all targets.
204 
205  Parameters
206  ----------
207  parsedCmd : `argparse.Namespace`
208  Parsed command `argparse.Namespace`.
209 
210  Returns
211  -------
212  resultList : `list`
213  A list of results returned by `TaskRunner.__call__`, or an empty list if `TaskRunner.__call__`
214  is not called (e.g. if `TaskRunner.precall` returns `False`). See `TaskRunner.__call__`
215  for details.
216 
217  Notes
218  -----
219  The task is run under multiprocessing if `TaskRunner.numProcesses` is more than 1; otherwise
220  processing is serial.
221  """
222  resultList = []
223  if self.numProcesses > 1:
224  disableImplicitThreading() # To prevent thread contention
225  import multiprocessing
227  pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=1)
228  mapFunc = functools.partial(_runPool, pool, self.timeout)
229  else:
230  pool = None
231  mapFunc = map
232 
233  if self.precall(parsedCmd):
234  profileName = parsedCmd.profile if hasattr(parsedCmd, "profile") else None
235  log = parsedCmd.log
236  targetList = self.getTargetList(parsedCmd)
237  if len(targetList) > 0:
238  with profile(profileName, log):
239  # Run the task using self.__call__
240  resultList = list(mapFunc(self, targetList))
241  else:
242  log.warn("Not running the task because there is no data to process; "
243  "you may preview data using \"--show data\"")
244 
245  if pool is not None:
246  pool.close()
247  pool.join()
248 
249  return resultList
250 
251  @staticmethod
252  def getTargetList(parsedCmd, **kwargs):
253  """Get a list of (dataRef, kwargs) for `TaskRunner.__call__`.
254 
255  Parameters
256  ----------
257  parsedCmd : `argparse.Namespace`
258  The parsed command object returned by `lsst.pipe.base.argumentParser.ArgumentParser.parse_args`.
259  kwargs
260  Any additional keyword arguments. In the default `TaskRunner` this is an empty dict, but having
261  it simplifies overriding `TaskRunner` for tasks whose run method takes additional arguments
262  (see case (1) below).
263 
264  Notes
265  -----
266  The default implementation of `TaskRunner.getTargetList` and `TaskRunner.__call__` works for any
267  command-line task whose run method takes exactly one argument: a data reference. Otherwise you
268  must provide a variant of TaskRunner that overrides `TaskRunner.getTargetList` and possibly
269  `TaskRunner.__call__`. There are two cases.
270 
271  **Case 1**
272 
273  If your command-line task has a ``run`` method that takes one data reference followed by additional
274  arguments, then you need only override `TaskRunner.getTargetList` to return the additional arguments
275  as an argument dict. To make this easier, your overridden version of `~TaskRunner.getTargetList` may
276  call `TaskRunner.getTargetList` with the extra arguments as keyword arguments. For example, the
277  following adds an argument dict containing a single key: "calExpList", whose value is the list of data
278  IDs for the calexp ID argument::
279 
280  def getTargetList(parsedCmd):
281  return TaskRunner.getTargetList(
282  parsedCmd,
283  calExpList=parsedCmd.calexp.idList
284  )
285 
286  It is equivalent to this slightly longer version::
287 
288  @staticmethod
289  def getTargetList(parsedCmd):
290  argDict = dict(calExpList=parsedCmd.calexp.idList)
291  return [(dataId, argDict) for dataId in parsedCmd.id.idList]
292 
293  **Case 2**
294 
295  If your task does not meet condition (1) then you must override both TaskRunner.getTargetList and
296  `TaskRunner.__call__`. You may do this however you see fit, so long as `TaskRunner.getTargetList`
297  returns a list, each of whose elements is sent to `TaskRunner.__call__`, which runs your task.
298  """
299  return [(ref, kwargs) for ref in parsedCmd.id.refList]
300 
301  def makeTask(self, parsedCmd=None, args=None):
302  """Create a Task instance.
303 
304  Parameters
305  ----------
306  parsedCmd
307  Parsed command-line options (used for extra task args by some task runners).
308  args
309  Args tuple passed to `TaskRunner.__call__` (used for extra task arguments by some task runners).
310 
311  Notes
312  -----
313  ``makeTask`` can be called with either the ``parsedCmd`` argument or ``args`` argument set to None,
314  but it must construct identical Task instances in either case.
315 
316  Subclasses may ignore this method entirely if they reimplement both `TaskRunner.precall` and
317  `TaskRunner.__call__`.
318  """
319  return self.TaskClass(config=self.config, log=self.log)
320 
321  def _precallImpl(self, task, parsedCmd):
322  """The main work of `precall`.
323 
324  We write package versions, schemas and configs, or compare these to existing files on disk if present.
325  """
326  if not parsedCmd.noVersions:
327  task.writePackageVersions(parsedCmd.butler, clobber=parsedCmd.clobberVersions)
328  task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)
329  task.writeSchemas(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)
330 
331  def precall(self, parsedCmd):
332  """Hook for code that should run exactly once, before multiprocessing.
333 
334  Notes
335  -----
336  Must return True if `TaskRunner.__call__` should subsequently be called.
337 
338  .. warning::
339 
340  Implementations must take care to ensure that no unpicklable
341  attributes are added to the TaskRunner itself, for compatibility
342  with multiprocessing.
343 
344  The default implementation writes package versions, schemas and configs, or compares them to existing
345  files on disk if present.
346  """
347  task = self.makeTask(parsedCmd=parsedCmd)
348 
349  if self.doRaise:
350  self._precallImpl(task, parsedCmd)
351  else:
352  try:
353  self._precallImpl(task, parsedCmd)
354  except Exception as e:
355  task.log.fatal("Failed in task initialization: %s", e)
356  if not isinstance(e, TaskError):
357  traceback.print_exc(file=sys.stderr)
358  return False
359  return True
360 
    def __call__(self, args):
        """Run the Task on a single target.

        Parameters
        ----------
        args
            Arguments for Task.run()

        Returns
        -------
        struct : `lsst.pipe.base.Struct`
            Contains these fields if ``doReturnResults`` is `True`:

            - ``dataRef``: the provided data reference.
            - ``metadata``: task metadata after execution of run.
            - ``result``: result returned by task run, or `None` if the task fails.
            - ``exitStatus``: 0 if the task completed successfully, 1 otherwise.

            If ``doReturnResults`` is `False` the struct contains:

            - ``exitStatus``: 0 if the task completed successfully, 1 otherwise.

        Notes
        -----
        This default implementation assumes that the ``args`` is a tuple containing a data
        reference and a dict of keyword arguments.

        .. warning::

            If you override this method and wish to return something when ``doReturnResults`` is
            `False`, then it must be picklable to support multiprocessing and it should be small
            enough that pickling and unpickling do not add excessive overhead.
        """
        dataRef, kwargs = args
        if self.log is None:
            # prepareForMultiProcessing set the log to None so this instance
            # could be pickled; recreate a default logger in the worker.
            self.log = Log.getDefaultLogger()
        # Tag all log messages in this context with the data ID(s) being processed.
        if hasattr(dataRef, "dataId"):
            self.log.MDC("LABEL", str(dataRef.dataId))
        elif isinstance(dataRef, (list, tuple)):
            self.log.MDC("LABEL", str([ref.dataId for ref in dataRef if hasattr(ref, "dataId")]))
        task = self.makeTask(args=args)
        result = None  # in case the task fails
        exitStatus = 0  # exit status for the shell
        if self.doRaise:
            result = task.run(dataRef, **kwargs)
        else:
            try:
                result = task.run(dataRef, **kwargs)
            except Exception as e:
                # The shell exit value will be the number of dataRefs returning
                # non-zero, so the actual value used here is lost.
                exitStatus = 1

                # don't use a try block as we need to preserve the original exception
                eName = type(e).__name__
                if hasattr(dataRef, "dataId"):
                    task.log.fatal("Failed on dataId=%s: %s: %s", dataRef.dataId, eName, e)
                elif isinstance(dataRef, (list, tuple)):
                    task.log.fatal("Failed on dataIds=[%s]: %s: %s",
                                   ", ".join(str(ref.dataId) for ref in dataRef), eName, e)
                else:
                    task.log.fatal("Failed on dataRef=%s: %s: %s", dataRef, eName, e)

                # TaskError messages are user-facing; anything else gets a traceback.
                if not isinstance(e, TaskError):
                    traceback.print_exc(file=sys.stderr)
        # Metadata is written whether the task succeeded or failed.
        task.writeMetadata(dataRef)

        # remove MDC so it does not show up outside of task context
        self.log.MDCRemove("LABEL")

        if self.doReturnResults:
            return Struct(
                exitStatus=exitStatus,
                dataRef=dataRef,
                metadata=task.metadata,
                result=result,
            )
        else:
            return Struct(
                exitStatus=exitStatus,
            )
442 
443 
class ButlerInitializedTaskRunner(TaskRunner):
    r"""A TaskRunner for `CmdLineTask`\ s that require a ``butler`` keyword argument to be passed to
    their constructor.
    """

    def makeTask(self, parsedCmd=None, args=None):
        """A variant of the base version that passes a butler argument to the task's constructor.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            Parsed command-line options, as returned by the `~lsst.pipe.base.ArgumentParser`; if
            specified then args is ignored.
        args
            Other arguments; if ``parsedCmd`` is `None` then this must be specified.

        Raises
        ------
        RuntimeError
            Raised if ``parsedCmd`` and ``args`` are both `None`.
        """
        if parsedCmd is not None:
            butler = parsedCmd.butler
        elif args is not None:
            # args is the (dataRef, kwargs) tuple given to __call__; recover
            # the butler from the data reference itself.
            dataRef, kwargs = args
            butler = dataRef.butlerSubset.butler
        else:
            raise RuntimeError("parsedCmd or args must be specified")
        return self.TaskClass(config=self.config, log=self.log, butler=butler)
473 
474 
class CmdLineTask(Task):
    """Base class for command-line tasks: tasks that may be executed from the command-line.

    Notes
    -----
    See :ref:`task-framework-overview` to learn what tasks are and
    :ref:`creating-a-command-line-task` for more information about writing command-line tasks.

    Subclasses must specify the following class variables:

    - ``ConfigClass``: configuration class for your task (a subclass of `lsst.pex.config.Config`,
      or if your task needs no configuration, then `lsst.pex.config.Config` itself).
    - ``_DefaultName``: default name used for this task (a str).

    Subclasses may also specify the following class variables:

    - ``RunnerClass``: a task runner class. The default is ``TaskRunner``, which works for any task
      with a run method that takes exactly one argument: a data reference. If your task does not
      meet this requirement then you must supply a variant of ``TaskRunner``; see ``TaskRunner``
      for more information.
    - ``canMultiprocess``: the default is `True`; set `False` if your task does not support
      multiprocessing.

    Subclasses must specify a method named ``run``:

    - By default ``run`` accepts a single butler data reference, but you can specify an alternate
      task runner (subclass of ``TaskRunner``) as the value of class variable ``RunnerClass`` if
      your run method needs something else.
    - ``run`` is expected to return its data in a `lsst.pipe.base.Struct`. This provides safety for
      evolution of the task since new values may be added without harming existing code.
    - The data returned by ``run`` must be picklable if your task is to support multiprocessing.
    """
    RunnerClass = TaskRunner
    canMultiprocess = True
508 
    @classmethod
    def applyOverrides(cls, config):
        """A hook to allow a task to change the values of its config *after* the camera-specific
        overrides are loaded but before any command-line overrides are applied.

        Parameters
        ----------
        config : instance of task's ``ConfigClass``
            Task configuration.

        Notes
        -----
        This is necessary in some cases because the camera-specific overrides may retarget subtasks,
        wiping out changes made in ConfigClass.setDefaults. See LSST Trac ticket #2282 for more
        discussion.

        .. warning::

            This is called by CmdLineTask.parseAndRun; other ways of constructing a config will not
            apply these overrides.
        """
        # Default implementation does nothing; subclasses override as needed.
        pass
530 
    @classmethod
    def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False):
        """Parse an argument list and run the command.

        Parameters
        ----------
        args : `list`, optional
            List of command-line arguments; if `None` use `sys.argv`.
        config : `lsst.pex.config.Config`-type, optional
            Config for task. If `None` use `Task.ConfigClass`.
        log : `lsst.log.Log`-type, optional
            Log. If `None` use the default log.
        doReturnResults : `bool`, optional
            If `True`, return the results of this task. Default is `False`. This is only intended
            for unit tests and similar use. It can easily exhaust memory (if the task returns
            enough data and you call it enough times) and it will fail when using multiprocessing
            if the returned data cannot be pickled.

        Returns
        -------
        struct : `lsst.pipe.base.Struct`
            Fields are:

            - ``argumentParser``: the argument parser.
            - ``parsedCmd``: the parsed command returned by the argument parser's
              `lsst.pipe.base.ArgumentParser.parse_args` method.
            - ``taskRunner``: the task runner used to run the task (an instance of
              `Task.RunnerClass`).
            - ``resultList``: results returned by the task runner's ``run`` method, one entry per
              invocation. This will typically be a list of `None` unless ``doReturnResults`` is
              `True`; see `Task.RunnerClass` (`TaskRunner` by default) for more information.

        Notes
        -----
        Calling this method with no arguments specified is the standard way to run a command-line
        task from the command-line. For an example see ``pipe_tasks`` ``bin/makeSkyMap.py`` or
        almost any other file in that directory.

        If one or more of the dataIds fails then this routine will exit (with a status giving the
        number of failed dataIds) rather than returning this struct; this behaviour can be
        overridden by specifying the ``--noExit`` command-line option.
        """
        if args is None:
            # Invoked from the shell: record the exact command line for the log.
            commandAsStr = " ".join(sys.argv)
            args = sys.argv[1:]
        else:
            # Invoked programmatically: record the caller and the argument tuple instead.
            commandAsStr = "{}{}".format(lsst.utils.get_caller_name(skip=1), tuple(args))

        argumentParser = cls._makeArgumentParser()
        if config is None:
            config = cls.ConfigClass()
        parsedCmd = argumentParser.parse_args(config=config, args=args, log=log,
                                              override=cls.applyOverrides)
        # print this message after parsing the command so the log is fully configured
        parsedCmd.log.info("Running: %s", commandAsStr)

        taskRunner = cls.RunnerClass(TaskClass=cls, parsedCmd=parsedCmd,
                                     doReturnResults=doReturnResults)
        resultList = taskRunner.run(parsedCmd)

        try:
            nFailed = sum(((res.exitStatus != 0) for res in resultList))
        except (TypeError, AttributeError) as e:
            # NOTE: TypeError if resultList is None, AttributeError if it doesn't have exitStatus.
            parsedCmd.log.warn("Unable to retrieve exit status (%s); assuming success", e)
            nFailed = 0

        if nFailed > 0:
            if parsedCmd.noExit:
                parsedCmd.log.error("%d dataRefs failed; not exiting as --noExit was set", nFailed)
            else:
                # Exit status communicates the number of failed dataRefs to the shell.
                sys.exit(nFailed)

        return Struct(
            argumentParser=argumentParser,
            parsedCmd=parsedCmd,
            taskRunner=taskRunner,
            resultList=resultList,
        )
607 
608  @classmethod
609  def _makeArgumentParser(cls):
610  """Create and return an argument parser.
611 
612  Returns
613  -------
614  parser : `lsst.pipe.base.ArgumentParser`
615  The argument parser for this task.
616 
617  Notes
618  -----
619  By default this returns an `~lsst.pipe.base.ArgumentParser` with one ID argument named `--id` of
620  dataset type ``raw``.
621 
622  Your task subclass may need to override this method to change the dataset type or data ref level,
623  or to add additional data ID arguments. If you add additional data ID arguments or your task's
624  run method takes more than a single data reference then you will also have to provide a task-specific
625  task runner (see TaskRunner for more information).
626  """
627  parser = ArgumentParser(name=cls._DefaultName)
628  parser.add_id_argument(name="--id", datasetType="raw",
629  help="data IDs, e.g. --id visit=12345 ccd=1,2^0,3")
630  return parser
631 
    def writeConfig(self, butler, clobber=False, doBackup=True):
        """Write the configuration used for processing the data, or check that an existing
        one is equal to the new one if present.

        Parameters
        ----------
        butler : `lsst.daf.persistence.Butler`
            Data butler used to write the config. The config is written to dataset type
            `CmdLineTask._getConfigName`.
        clobber : `bool`, optional
            A boolean flag that controls what happens if a config already has been saved:
            - `True`: overwrite or rename the existing config, depending on ``doBackup``.
            - `False`: raise `TaskError` if this config does not match the existing config.
        doBackup : bool, optional
            Set to `True` to backup the config files if clobbering.
        """
        configName = self._getConfigName()
        if configName is None:
            # This task has opted out of config persistence.
            return
        if clobber:
            butler.put(self.config, configName, doBackup=doBackup)
        elif butler.datasetExists(configName, write=True):
            # this may be subject to a race condition; see #2789
            try:
                oldConfig = butler.get(configName, immediate=True)
            except Exception as exc:
                raise type(exc)("Unable to read stored config file %s (%s); consider using --clobber-config" %
                                (configName, exc))

            def logConfigMismatch(msg):
                # Callback invoked by Config.compare for each difference found.
                self.log.fatal("Comparing configuration: %s", msg)

            # shortcut=False forces a full comparison so every mismatch is logged.
            if not self.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
                raise TaskError(
                    ("Config does not match existing task config %r on disk; tasks configurations " +
                     "must be consistent within the same output repo (override with --clobber-config)") %
                    (configName,))
        else:
            butler.put(self.config, configName)
671 
    def writeSchemas(self, butler, clobber=False, doBackup=True):
        """Write the schemas returned by `lsst.pipe.base.Task.getAllSchemaCatalogs`.

        Parameters
        ----------
        butler : `lsst.daf.persistence.Butler`
            Data butler used to write the schema. Each schema is written to the dataset type
            specified as the key in the dict returned by
            `~lsst.pipe.base.Task.getAllSchemaCatalogs`.
        clobber : `bool`, optional
            A boolean flag that controls what happens if a schema already has been saved:
            - `True`: overwrite or rename the existing schema, depending on ``doBackup``.
            - `False`: raise `TaskError` if this schema does not match the existing schema.
        doBackup : `bool`, optional
            Set to `True` to backup the schema files if clobbering.

        Notes
        -----
        If ``clobber`` is `False` and an existing schema does not match a current schema,
        then some schemas may have been saved successfully and others may not, and there is no easy
        way to tell which is which.
        """
        for dataset, catalog in self.getAllSchemaCatalogs().items():
            schemaDataset = dataset + "_schema"
            if clobber:
                butler.put(catalog, schemaDataset, doBackup=doBackup)
            elif butler.datasetExists(schemaDataset, write=True):
                # Require an exact match against the schema already on disk.
                oldSchema = butler.get(schemaDataset, immediate=True).getSchema()
                if not oldSchema.compare(catalog.getSchema(), afwTable.Schema.IDENTICAL):
                    raise TaskError(
                        ("New schema does not match schema %r on disk; schemas must be " +
                         " consistent within the same output repo (override with --clobber-config)") %
                        (dataset,))
            else:
                butler.put(catalog, schemaDataset)
706 
707  def writeMetadata(self, dataRef):
708  """Write the metadata produced from processing the data.
709 
710  Parameters
711  ----------
712  dataRef
713  Butler data reference used to write the metadata.
714  The metadata is written to dataset type `CmdLineTask._getMetadataName`.
715  """
716  try:
717  metadataName = self._getMetadataName()
718  if metadataName is not None:
719  dataRef.put(self.getFullMetadata(), metadataName)
720  except Exception as e:
721  self.log.warn("Could not persist metadata for dataId=%s: %s", dataRef.dataId, e)
722 
    def writePackageVersions(self, butler, clobber=False, doBackup=True, dataset="packages"):
        """Compare and write package versions.

        Parameters
        ----------
        butler : `lsst.daf.persistence.Butler`
            Data butler used to read/write the package versions.
        clobber : `bool`, optional
            A boolean flag that controls what happens if versions already have been saved:
            - `True`: overwrite or rename the existing version info, depending on ``doBackup``.
            - `False`: raise `TaskError` if this version info does not match the existing.
        doBackup : `bool`, optional
            If `True` and clobbering, old package version files are backed up.
        dataset : `str`, optional
            Name of dataset to read/write.

        Raises
        ------
        TaskError
            Raised if there is a version mismatch with current and persisted lists of package
            versions.

        Notes
        -----
        Note that this operation is subject to a race condition.
        """
        packages = Packages.fromSystem()

        if clobber:
            return butler.put(packages, dataset, doBackup=doBackup)
        if not butler.datasetExists(dataset, write=True):
            # Nothing persisted yet; just record the current versions.
            return butler.put(packages, dataset)

        try:
            old = butler.get(dataset, immediate=True)
        except Exception as exc:
            raise type(exc)("Unable to read stored version dataset %s (%s); "
                            "consider using --clobber-versions or --no-versions" %
                            (dataset, exc))
        # Note that because we can only detect python modules that have been imported, the stored
        # list of products may be more or less complete than what we have now. What's important is
        # that the products that are in common have the same version.
        diff = packages.difference(old)
        if diff:
            raise TaskError(
                "Version mismatch (" +
                "; ".join("%s: %s vs %s" % (pkg, diff[pkg][1], diff[pkg][0]) for pkg in diff) +
                "); consider using --clobber-versions or --no-versions")
        # Update the old set of packages in case we have more packages that haven't been persisted.
        extra = packages.extra(old)
        if extra:
            old.update(packages)
            butler.put(old, dataset, doBackup=doBackup)
775 
776  def _getConfigName(self):
777  """Get the name of the config dataset type, or `None` if config is not to be persisted.
778 
779  Notes
780  -----
781  The name may depend on the config; that is why this is not a class method.
782  """
783  return self._DefaultName + "_config"
784 
785  def _getMetadataName(self):
786  """Get the name of the metadata dataset type, or `None` if metadata is not to be persisted.
787 
788  Notes
789  -----
790  The name may depend on the config; that is why this is not a class method.
791  """
792  return self._DefaultName + "_metadata"
def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False)
Definition: cmdLineTask.py:532
def _precallImpl(self, task, parsedCmd)
Definition: cmdLineTask.py:321
def getFullMetadata(self)
Definition: task.py:212
def writePackageVersions(self, butler, clobber=False, doBackup=True, dataset="packages")
Definition: cmdLineTask.py:723
def getAllSchemaCatalogs(self)
Definition: task.py:190
def writeSchemas(self, butler, clobber=False, doBackup=True)
Definition: cmdLineTask.py:672
def makeTask(self, parsedCmd=None, args=None)
Definition: cmdLineTask.py:301
def __init__(self, TaskClass, parsedCmd, doReturnResults=False)
Definition: cmdLineTask.py:174
def profile(filename, log=None)
Definition: cmdLineTask.py:74
def makeTask(self, parsedCmd=None, args=None)
Definition: cmdLineTask.py:449
def getTargetList(parsedCmd, kwargs)
Definition: cmdLineTask.py:252
def writeConfig(self, butler, clobber=False, doBackup=True)
Definition: cmdLineTask.py:632