lsst.pipe.base  15.0-3-ga695220+4
cmdLineTask.py
1 #
2 # LSST Data Management System
3 # Copyright 2008-2015 AURA/LSST.
4 #
5 # This product includes software developed by the
6 # LSST Project (http://www.lsst.org/).
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the LSST License Statement and
19 # the GNU General Public License along with this program. If not,
20 # see <https://www.lsstcorp.org/LegalNotices/>.
21 #
22 from __future__ import absolute_import, division
23 import sys
24 import traceback
25 import functools
26 import contextlib
27 
28 from builtins import str
29 from builtins import object
30 
31 import lsst.utils
32 from lsst.base import disableImplicitThreading
33 import lsst.afw.table as afwTable
34 from .task import Task, TaskError
35 from .struct import Struct
36 from .argumentParser import ArgumentParser
37 from lsst.base import Packages
38 from lsst.log import Log
39 
40 __all__ = ["CmdLineTask", "TaskRunner", "ButlerInitializedTaskRunner"]
41 
42 
43 def _poolFunctionWrapper(function, arg):
44  """Wrapper around function to catch exceptions that don't inherit from `Exception`.
45 
 46  Such exceptions aren't caught by multiprocessing, which causes the slave process to crash; the parent
 47  then ends up hitting the timeout.
48  """
49  try:
50  return function(arg)
51  except Exception:
52  raise # No worries
53  except:
54  # Need to wrap the exception with something multiprocessing will recognise
55  cls, exc, tb = sys.exc_info()
56  log = Log.getDefaultLogger()
57  log.warn("Unhandled exception %s (%s):\n%s" % (cls.__name__, exc, traceback.format_exc()))
58  raise Exception("Unhandled exception: %s (%s)" % (cls.__name__, exc))
59 
60 
61 def _runPool(pool, timeout, function, iterable):
 62  """Wrapper around ``pool.map_async`` to handle a timeout.
63 
64  This is required so as to trigger an immediate interrupt on the KeyboardInterrupt (Ctrl-C); see
65  http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool
66 
67  Further wraps the function in ``_poolFunctionWrapper`` to catch exceptions
68  that don't inherit from `Exception`.
69  """
70  return pool.map_async(functools.partial(_poolFunctionWrapper, function), iterable).get(timeout)
71 
72 
73 @contextlib.contextmanager
74 def profile(filename, log=None):
75  """Context manager for profiling with cProfile.
76 
77 
78  Parameters
79  ----------
80  filename : `str`
81  Filename to which to write profile (profiling disabled if `None` or empty).
82  log : `lsst.log.Log`, optional
83  Log object for logging the profile operations.
84 
85  If profiling is enabled, the context manager returns the cProfile.Profile object (otherwise
86  it returns None), which allows additional control over profiling. You can obtain this using
87  the "as" clause, e.g.:
88 
89  with profile(filename) as prof:
90  runYourCodeHere()
91 
92  The output cumulative profile can be printed with a command-line like::
93 
94  python -c 'import pstats; pstats.Stats("<filename>").sort_stats("cumtime").print_stats(30)'
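
 A self-contained sketch of typical use (``doSomeWork`` is a hypothetical placeholder for the code
 being profiled)::

     log = Log.getDefaultLogger()
     with profile("profile.pstats", log) as prof:
         doSomeWork()
     # prof is None when no filename was given, so guard any further use of it
     if prof is not None:
         prof.print_stats(sort="cumtime")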
95  """
96  if not filename:
97  # Nothing to do
98  yield
99  return
100  from cProfile import Profile
101  profile = Profile()
102  if log is not None:
103  log.info("Enabling cProfile profiling")
104  profile.enable()
105  yield profile
106  profile.disable()
107  profile.dump_stats(filename)
108  if log is not None:
109  log.info("cProfile stats written to %s" % filename)
110 
111 
112 class TaskRunner(object):
113  """Run a command-line task, using `multiprocessing` if requested.
114 
115  Parameters
116  ----------
117  TaskClass : `lsst.pipe.base.Task` subclass
118  The class of the task to run.
119  parsedCmd : `argparse.Namespace`
120  The parsed command-line arguments, as returned by the task's argument parser's
121  `~lsst.pipe.base.ArgumentParser.parse_args` method.
122 
123  .. warning::
124 
125  Do not store ``parsedCmd``, as this instance is pickled (if multiprocessing) and parsedCmd may
126  contain non-picklable elements. It certainly contains more data than we need to send to each
127  instance of the task.
128  doReturnResults : `bool`, optional
129  Should run return the collected result from each invocation of the task? This is only intended for
130  unit tests and similar use. It can easily exhaust memory (if the task returns enough data and you
131  call it enough times) and it will fail when using multiprocessing if the returned data cannot be
132  pickled.
133 
 134  Note that even if ``doReturnResults`` is `False`, a struct with a single member ``exitStatus`` is
 135  returned, with value 0 or 1 to be passed back to the Unix shell.
136 
137  Raises
138  ------
139  ImportError
140  If multiprocessing is requested (and the task supports it) but the multiprocessing library cannot be
141  imported.
142 
143  Notes
144  -----
145  Each command-line task (subclass of `lsst.pipe.base.CmdLineTask`) has a task runner. By default it is this
146  class, but some tasks require a subclass. See the manual :ref:`creating-a-command-line-task` for more
147  information. See `CmdLineTask.parseAndRun` to see how a task runner is used.
148 
149  You may use this task runner for your command-line task if your task has a run method that takes exactly
150  one argument: a butler data reference. Otherwise you must provide a task-specific subclass of this runner
151  for your task's ``RunnerClass`` that overrides `TaskRunner.getTargetList` and possibly
152  `TaskRunner.__call__`. See `TaskRunner.getTargetList` for details.
153 
154  This design matches the common pattern for command-line tasks: the run method takes a single data
155  reference, of some suitable name. Additional arguments are rare, and if present, require a subclass of
 156  `TaskRunner` that passes these additional arguments by name.
157 
158  Instances of this class must be picklable in order to be compatible with multiprocessing. If
159  multiprocessing is requested (``parsedCmd.numProcesses > 1``) then `run` calls `prepareForMultiProcessing`
160  to jettison optional non-picklable elements. If your task runner is not compatible with multiprocessing
161  then indicate this in your task by setting class variable ``canMultiprocess=False``.
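
 For example, a task whose runner or data cannot be pickled might declare (``ExampleTask`` is a
 hypothetical name)::

     class ExampleTask(CmdLineTask):
         # ConfigClass, _DefaultName, run(), etc. as usual
         canMultiprocess = False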
162 
163  Due to a `python bug`__, handling a `KeyboardInterrupt` properly `requires specifying a timeout`__. This
164  timeout (in sec) can be specified as the ``timeout`` element in the output from
165  `~lsst.pipe.base.ArgumentParser` (the ``parsedCmd``), if available, otherwise we use `TaskRunner.TIMEOUT`.
166 
 167  By default, we disable "implicit" threading -- i.e., as provided by underlying numerical libraries such as
168  MKL or BLAS. This is designed to avoid thread contention both when a single command line task spawns
169  multiple processes and when multiple users are running on a shared system. Users can override this
170  behaviour by setting the ``LSST_ALLOW_IMPLICIT_THREADS`` environment variable.
171 
172  .. __: http://bugs.python.org/issue8296
173  .. __: http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool
174  """
175 
176  TIMEOUT = 3600*24*30
177  """Default timeout (seconds) for multiprocessing."""
178 
179  def __init__(self, TaskClass, parsedCmd, doReturnResults=False):
180  self.TaskClass = TaskClass
181  self.doReturnResults = bool(doReturnResults)
182  self.config = parsedCmd.config
183  self.log = parsedCmd.log
184  self.doRaise = bool(parsedCmd.doraise)
185  self.clobberConfig = bool(parsedCmd.clobberConfig)
186  self.doBackup = not bool(parsedCmd.noBackupConfig)
187  self.numProcesses = int(getattr(parsedCmd, 'processes', 1))
188 
189  self.timeout = getattr(parsedCmd, 'timeout', None)
190  if self.timeout is None or self.timeout <= 0:
191  self.timeout = self.TIMEOUT
192 
193  if self.numProcesses > 1:
194  if not TaskClass.canMultiprocess:
195  self.log.warn("This task does not support multiprocessing; using one process")
196  self.numProcesses = 1
 197 
 198  def prepareForMultiProcessing(self):
 199  """Prepare this instance for multiprocessing
200 
201  Optional non-picklable elements are removed.
202 
203  This is only called if the task is run under multiprocessing.
204  """
205  self.log = None
206 
207  def run(self, parsedCmd):
208  """Run the task on all targets.
209 
210  Parameters
211  ----------
212  parsedCmd : `argparse.Namespace`
213  Parsed command `argparse.Namespace`.
214 
215  Returns
216  -------
217  resultList : `list`
218  A list of results returned by `TaskRunner.__call__`, or an empty list if `TaskRunner.__call__`
219  is not called (e.g. if `TaskRunner.precall` returns `False`). See `TaskRunner.__call__`
220  for details.
221 
222  Notes
223  -----
224  The task is run under multiprocessing if `TaskRunner.numProcesses` is more than 1; otherwise
225  processing is serial.
226  """
227  resultList = []
228  disableImplicitThreading() # To prevent thread contention
229  if self.numProcesses > 1:
 230  import multiprocessing
 231  self.prepareForMultiProcessing()
 232  pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=1)
233  mapFunc = functools.partial(_runPool, pool, self.timeout)
234  else:
235  pool = None
236  mapFunc = map
237 
238  if self.precall(parsedCmd):
239  profileName = parsedCmd.profile if hasattr(parsedCmd, "profile") else None
240  log = parsedCmd.log
241  targetList = self.getTargetList(parsedCmd)
242  if len(targetList) > 0:
243  with profile(profileName, log):
244  # Run the task using self.__call__
245  resultList = list(mapFunc(self, targetList))
246  else:
247  log.warn("Not running the task because there is no data to process; "
248  "you may preview data using \"--show data\"")
249 
250  if pool is not None:
251  pool.close()
252  pool.join()
253 
254  return resultList
255 
256  @staticmethod
257  def getTargetList(parsedCmd, **kwargs):
258  """Get a list of (dataRef, kwargs) for `TaskRunner.__call__`.
259 
260  Parameters
261  ----------
262  parsedCmd : `argparse.Namespace`
263  The parsed command object returned by `lsst.pipe.base.argumentParser.ArgumentParser.parse_args`.
264  kwargs
265  Any additional keyword arguments. In the default `TaskRunner` this is an empty dict, but having
266  it simplifies overriding `TaskRunner` for tasks whose run method takes additional arguments
267  (see case (1) below).
268 
269  Notes
270  -----
271  The default implementation of `TaskRunner.getTargetList` and `TaskRunner.__call__` works for any
272  command-line task whose run method takes exactly one argument: a data reference. Otherwise you
273  must provide a variant of TaskRunner that overrides `TaskRunner.getTargetList` and possibly
274  `TaskRunner.__call__`. There are two cases.
275 
276  **Case 1**
277 
278  If your command-line task has a ``run`` method that takes one data reference followed by additional
279  arguments, then you need only override `TaskRunner.getTargetList` to return the additional arguments
280  as an argument dict. To make this easier, your overridden version of `~TaskRunner.getTargetList` may
281  call `TaskRunner.getTargetList` with the extra arguments as keyword arguments. For example, the
282  following adds an argument dict containing a single key: "calExpList", whose value is the list of data
283  IDs for the calexp ID argument::
284 
285  def getTargetList(parsedCmd):
286  return TaskRunner.getTargetList(
287  parsedCmd,
288  calExpList=parsedCmd.calexp.idList
289  )
290 
291  It is equivalent to this slightly longer version::
292 
293  @staticmethod
294  def getTargetList(parsedCmd):
295  argDict = dict(calExpList=parsedCmd.calexp.idList)
296  return [(dataId, argDict) for dataId in parsedCmd.id.idList]
297 
298  **Case 2**
299 
 300  If your task does not meet condition (1) then you must override both `TaskRunner.getTargetList` and
301  `TaskRunner.__call__`. You may do this however you see fit, so long as `TaskRunner.getTargetList`
302  returns a list, each of whose elements is sent to `TaskRunner.__call__`, which runs your task.
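
 A minimal sketch of case 2, assuming a hypothetical task whose ``run`` method takes a data reference
 plus a filter name drawn from the data ID::

     class ExampleTaskRunner(TaskRunner):
         @staticmethod
         def getTargetList(parsedCmd, **kwargs):
             # one target per data reference, pairing it with its filter name (assumed data ID key)
             return [(ref, ref.dataId["filter"], kwargs) for ref in parsedCmd.id.refList]

         def __call__(self, args):
             dataRef, filterName, kwargs = args
             task = self.makeTask(args=args)
             task.run(dataRef, filterName=filterName, **kwargs)
             return Struct(exitStatus=0)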
303  """
304  return [(ref, kwargs) for ref in parsedCmd.id.refList]
305 
306  def makeTask(self, parsedCmd=None, args=None):
307  """Create a Task instance.
308 
309  Parameters
310  ----------
311  parsedCmd
312  Parsed command-line options (used for extra task args by some task runners).
313  args
314  Args tuple passed to `TaskRunner.__call__` (used for extra task arguments by some task runners).
315 
316  Notes
317  -----
318  ``makeTask`` can be called with either the ``parsedCmd`` argument or ``args`` argument set to None,
319  but it must construct identical Task instances in either case.
320 
321  Subclasses may ignore this method entirely if they reimplement both `TaskRunner.precall` and
322  `TaskRunner.__call__`.
323  """
324  return self.TaskClass(config=self.config, log=self.log)
325 
326  def _precallImpl(self, task, parsedCmd):
327  """The main work of `precall`.
328 
329  We write package versions, schemas and configs, or compare these to existing files on disk if present.
330  """
331  if not parsedCmd.noVersions:
332  task.writePackageVersions(parsedCmd.butler, clobber=parsedCmd.clobberVersions)
333  task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)
334  task.writeSchemas(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)
335 
336  def precall(self, parsedCmd):
337  """Hook for code that should run exactly once, before multiprocessing.
338 
339  Notes
340  -----
341  Must return True if `TaskRunner.__call__` should subsequently be called.
342 
343  .. warning::
344 
345  Implementations must take care to ensure that no unpicklable
346  attributes are added to the TaskRunner itself, for compatibility
347  with multiprocessing.
348 
349  The default implementation writes package versions, schemas and configs, or compares them to existing
350  files on disk if present.
351  """
352  task = self.makeTask(parsedCmd=parsedCmd)
353 
354  if self.doRaise:
355  self._precallImpl(task, parsedCmd)
356  else:
357  try:
358  self._precallImpl(task, parsedCmd)
359  except Exception as e:
360  task.log.fatal("Failed in task initialization: %s", e)
361  if not isinstance(e, TaskError):
362  traceback.print_exc(file=sys.stderr)
363  return False
364  return True
365 
366  def __call__(self, args):
367  """Run the Task on a single target.
368 
369  Parameters
370  ----------
371  args
372  Arguments for Task.run()
373 
374  Returns
375  -------
376  struct : `lsst.pipe.base.Struct`
377  Contains these fields if ``doReturnResults`` is `True`:
378 
379  - ``dataRef``: the provided data reference.
380  - ``metadata``: task metadata after execution of run.
381  - ``result``: result returned by task run, or `None` if the task fails.
 382  - ``exitStatus``: 0 if the task completed successfully, 1 otherwise.
383 
384  If ``doReturnResults`` is `False` the struct contains:
385 
 386  - ``exitStatus``: 0 if the task completed successfully, 1 otherwise.
387 
388  Notes
389  -----
390  This default implementation assumes that the ``args`` is a tuple
391  containing a data reference and a dict of keyword arguments.
392 
393  .. warning::
394 
395  If you override this method and wish to return something when ``doReturnResults`` is `False`,
396  then it must be picklable to support multiprocessing and it should be small enough that pickling
397  and unpickling do not add excessive overhead.
398  """
399  dataRef, kwargs = args
400  if self.log is None:
401  self.log = Log.getDefaultLogger()
402  if hasattr(dataRef, "dataId"):
403  self.log.MDC("LABEL", str(dataRef.dataId))
404  elif isinstance(dataRef, (list, tuple)):
405  self.log.MDC("LABEL", str([ref.dataId for ref in dataRef if hasattr(ref, "dataId")]))
406  task = self.makeTask(args=args)
407  result = None # in case the task fails
408  exitStatus = 0 # exit status for the shell
409  if self.doRaise:
410  result = task.run(dataRef, **kwargs)
411  else:
412  try:
413  result = task.run(dataRef, **kwargs)
414  except Exception as e:
415  # The shell exit value will be the number of dataRefs returning
416  # non-zero, so the actual value used here is lost.
417  exitStatus = 1
418 
419  # don't use a try block as we need to preserve the original exception
420  eName = type(e).__name__
421  if hasattr(dataRef, "dataId"):
422  task.log.fatal("Failed on dataId=%s: %s: %s", dataRef.dataId, eName, e)
423  elif isinstance(dataRef, (list, tuple)):
424  task.log.fatal("Failed on dataIds=[%s]: %s: %s",
425  ", ".join(str(ref.dataId) for ref in dataRef), eName, e)
426  else:
427  task.log.fatal("Failed on dataRef=%s: %s: %s", dataRef, eName, e)
428 
429  if not isinstance(e, TaskError):
430  traceback.print_exc(file=sys.stderr)
431 
432  # Ensure all errors have been logged and aren't hanging around in a buffer
433  sys.stdout.flush()
434  sys.stderr.flush()
435 
436  task.writeMetadata(dataRef)
437 
438  # remove MDC so it does not show up outside of task context
439  self.log.MDCRemove("LABEL")
440 
441  if self.doReturnResults:
442  return Struct(
443  exitStatus=exitStatus,
444  dataRef=dataRef,
445  metadata=task.metadata,
446  result=result,
447  )
448  else:
449  return Struct(
450  exitStatus=exitStatus,
451  )
452 
 453 
 454 class ButlerInitializedTaskRunner(TaskRunner):
 455  """A TaskRunner for `CmdLineTask`\ s that require a ``butler`` keyword argument to be passed to
456  their constructor.
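
 A task opts in by naming this class as its runner and accepting ``butler`` in its constructor
 (``ExampleTask`` is a hypothetical name)::

     class ExampleTask(CmdLineTask):
         RunnerClass = ButlerInitializedTaskRunner

         def __init__(self, butler=None, **kwargs):
             CmdLineTask.__init__(self, **kwargs)
             # use the butler here, e.g. to read an input schema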
457  """
458 
459  def makeTask(self, parsedCmd=None, args=None):
460  """A variant of the base version that passes a butler argument to the task's constructor.
461 
462  Parameters
463  ----------
464  parsedCmd : `argparse.Namespace`
465  Parsed command-line options, as returned by the `~lsst.pipe.base.ArgumentParser`; if specified
466  then args is ignored.
467  args
468  Other arguments; if ``parsedCmd`` is `None` then this must be specified.
469 
470  Raises
471  ------
472  RuntimeError
473  Raised if ``parsedCmd`` and ``args`` are both `None`.
474  """
475  if parsedCmd is not None:
476  butler = parsedCmd.butler
477  elif args is not None:
478  dataRef, kwargs = args
479  butler = dataRef.butlerSubset.butler
480  else:
481  raise RuntimeError("parsedCmd or args must be specified")
482  return self.TaskClass(config=self.config, log=self.log, butler=butler)
483 
 484 
 485 class CmdLineTask(Task):
 486  """Base class for command-line tasks: tasks that may be executed from the command-line.
487 
488  Notes
489  -----
490  See :ref:`task-framework-overview` to learn what tasks are and :ref:`creating-a-command-line-task` for
491  more information about writing command-line tasks.
492 
493  Subclasses must specify the following class variables:
494 
495  - ``ConfigClass``: configuration class for your task (a subclass of `lsst.pex.config.Config`, or if your
496  task needs no configuration, then `lsst.pex.config.Config` itself).
497  - ``_DefaultName``: default name used for this task (a str).
498 
499  Subclasses may also specify the following class variables:
500 
501  - ``RunnerClass``: a task runner class. The default is ``TaskRunner``, which works for any task
502  with a run method that takes exactly one argument: a data reference. If your task does
503  not meet this requirement then you must supply a variant of ``TaskRunner``; see ``TaskRunner``
504  for more information.
505  - ``canMultiprocess``: the default is `True`; set `False` if your task does not support multiprocessing.
506 
507  Subclasses must specify a method named ``run``:
508 
509  - By default ``run`` accepts a single butler data reference, but you can specify an alternate task runner
510  (subclass of ``TaskRunner``) as the value of class variable ``RunnerClass`` if your run method needs
511  something else.
512  - ``run`` is expected to return its data in a `lsst.pipe.base.Struct`. This provides safety for evolution
513  of the task since new values may be added without harming existing code.
514  - The data returned by ``run`` must be picklable if your task is to support multiprocessing.
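
 A minimal sketch of a subclass meeting these requirements (``ExampleTask``, ``ExampleConfig`` and the
 ``calexp`` dataset are illustrative only)::

     class ExampleTask(CmdLineTask):
         ConfigClass = ExampleConfig  # hypothetical lsst.pex.config.Config subclass
         _DefaultName = "example"

         def run(self, dataRef):
             exposure = dataRef.get("calexp")
             # ... process the exposure ...
             return Struct(exposure=exposure)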
515  """
516  RunnerClass = TaskRunner
517  canMultiprocess = True
518 
519  @classmethod
520  def applyOverrides(cls, config):
521  """A hook to allow a task to change the values of its config *after* the camera-specific
522  overrides are loaded but before any command-line overrides are applied.
523 
524  Parameters
525  ----------
526  config : instance of task's ``ConfigClass``
527  Task configuration.
528 
529  Notes
530  -----
531  This is necessary in some cases because the camera-specific overrides may retarget subtasks,
532  wiping out changes made in ConfigClass.setDefaults. See LSST Trac ticket #2282 for more discussion.
533 
534  .. warning::
535 
536  This is called by CmdLineTask.parseAndRun; other ways of constructing a config will not apply
537  these overrides.
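
 A typical override simply adjusts one or two config fields; for example (the field name is
 hypothetical)::

     @classmethod
     def applyOverrides(cls, config):
         config.doWriteIntermediates = False  # hypothetical config field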
538  """
539  pass
540 
541  @classmethod
542  def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False):
543  """Parse an argument list and run the command.
544 
545  Parameters
546  ----------
547  args : `list`, optional
 548  List of command-line arguments; if `None` use ``sys.argv[1:]``.
549  config : `lsst.pex.config.Config`-type, optional
550  Config for task. If `None` use `Task.ConfigClass`.
551  log : `lsst.log.Log`-type, optional
552  Log. If `None` use the default log.
553  doReturnResults : `bool`, optional
554  If `True`, return the results of this task. Default is `False`. This is only intended for
555  unit tests and similar use. It can easily exhaust memory (if the task returns enough data and you
556  call it enough times) and it will fail when using multiprocessing if the returned data cannot be
557  pickled.
558 
559  Returns
560  -------
561  struct : `lsst.pipe.base.Struct`
562  Fields are:
563 
564  - ``argumentParser``: the argument parser.
565  - ``parsedCmd``: the parsed command returned by the argument parser's
566  `lsst.pipe.base.ArgumentParser.parse_args` method.
567  - ``taskRunner``: the task runner used to run the task (an instance of `Task.RunnerClass`).
568  - ``resultList``: results returned by the task runner's ``run`` method, one entry per invocation.
 569  Each entry is typically a `Struct` containing only ``exitStatus`` unless ``doReturnResults`` is `True`;
570  see `Task.RunnerClass` (`TaskRunner` by default) for more information.
571 
572  Notes
573  -----
574  Calling this method with no arguments specified is the standard way to run a command-line task
575  from the command-line. For an example see ``pipe_tasks`` ``bin/makeSkyMap.py`` or almost any other
576  file in that directory.
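
 Such a driver script usually amounts to two lines (``example.exampleTask`` and ``ExampleTask`` stand
 in for a real module and task)::

     #!/usr/bin/env python
     from example.exampleTask import ExampleTask
     ExampleTask.parseAndRun()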
577 
578  If one or more of the dataIds fails then this routine will exit (with a status giving the
579  number of failed dataIds) rather than returning this struct; this behaviour can be
580  overridden by specifying the ``--noExit`` command-line option.
581  """
582  if args is None:
583  commandAsStr = " ".join(sys.argv)
584  args = sys.argv[1:]
585  else:
586  commandAsStr = "{}{}".format(lsst.utils.get_caller_name(skip=1), tuple(args))
587 
588  argumentParser = cls._makeArgumentParser()
589  if config is None:
590  config = cls.ConfigClass()
591  parsedCmd = argumentParser.parse_args(config=config, args=args, log=log, override=cls.applyOverrides)
592  # print this message after parsing the command so the log is fully configured
593  parsedCmd.log.info("Running: %s", commandAsStr)
594 
595  taskRunner = cls.RunnerClass(TaskClass=cls, parsedCmd=parsedCmd, doReturnResults=doReturnResults)
596  resultList = taskRunner.run(parsedCmd)
597 
598  try:
599  nFailed = sum(((res.exitStatus != 0) for res in resultList))
600  except (TypeError, AttributeError) as e:
601  # NOTE: TypeError if resultList is None, AttributeError if it doesn't have exitStatus.
602  parsedCmd.log.warn("Unable to retrieve exit status (%s); assuming success", e)
603  nFailed = 0
604 
605  if nFailed > 0:
606  if parsedCmd.noExit:
607  parsedCmd.log.error("%d dataRefs failed; not exiting as --noExit was set", nFailed)
608  else:
609  sys.exit(nFailed)
610 
611  return Struct(
612  argumentParser=argumentParser,
613  parsedCmd=parsedCmd,
614  taskRunner=taskRunner,
615  resultList=resultList,
616  )
617 
618  @classmethod
619  def _makeArgumentParser(cls):
620  """Create and return an argument parser.
621 
622  Returns
623  -------
624  parser : `lsst.pipe.base.ArgumentParser`
625  The argument parser for this task.
626 
627  Notes
628  -----
629  By default this returns an `~lsst.pipe.base.ArgumentParser` with one ID argument named `--id` of
630  dataset type ``raw``.
631 
632  Your task subclass may need to override this method to change the dataset type or data ref level,
633  or to add additional data ID arguments. If you add additional data ID arguments or your task's
634  run method takes more than a single data reference then you will also have to provide a task-specific
635  task runner (see TaskRunner for more information).
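
 For example, a task that processes calibrated exposures might override this as follows (the dataset
 type and help text are illustrative)::

     @classmethod
     def _makeArgumentParser(cls):
         parser = ArgumentParser(name=cls._DefaultName)
         parser.add_id_argument(name="--id", datasetType="calexp",
                                help="data IDs, e.g. --id visit=12345 ccd=1,2")
         return parser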
636  """
637  parser = ArgumentParser(name=cls._DefaultName)
638  parser.add_id_argument(name="--id", datasetType="raw",
639  help="data IDs, e.g. --id visit=12345 ccd=1,2^0,3")
640  return parser
641 
642  def writeConfig(self, butler, clobber=False, doBackup=True):
643  """Write the configuration used for processing the data, or check that an existing
644  one is equal to the new one if present.
645 
646  Parameters
647  ----------
648  butler : `lsst.daf.persistence.Butler`
649  Data butler used to write the config. The config is written to dataset type
650  `CmdLineTask._getConfigName`.
651  clobber : `bool`, optional
652  A boolean flag that controls what happens if a config already has been saved:
653  - `True`: overwrite or rename the existing config, depending on ``doBackup``.
654  - `False`: raise `TaskError` if this config does not match the existing config.
655  doBackup : bool, optional
656  Set to `True` to backup the config files if clobbering.
657  """
658  configName = self._getConfigName()
659  if configName is None:
660  return
661  if clobber:
662  butler.put(self.config, configName, doBackup=doBackup)
663  elif butler.datasetExists(configName, write=True):
664  # this may be subject to a race condition; see #2789
665  try:
666  oldConfig = butler.get(configName, immediate=True)
667  except Exception as exc:
668  raise type(exc)("Unable to read stored config file %s (%s); consider using --clobber-config" %
669  (configName, exc))
670 
671  def logConfigMismatch(msg):
672  self.log.fatal("Comparing configuration: %s", msg)
673 
674  if not self.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
675  raise TaskError(
 676  ("Config does not match existing task config %r on disk; task configurations " +
677  "must be consistent within the same output repo (override with --clobber-config)") %
678  (configName,))
679  else:
680  butler.put(self.config, configName)
681 
682  def writeSchemas(self, butler, clobber=False, doBackup=True):
683  """Write the schemas returned by `lsst.pipe.base.Task.getAllSchemaCatalogs`.
684 
685  Parameters
686  ----------
687  butler : `lsst.daf.persistence.Butler`
688  Data butler used to write the schema. Each schema is written to the dataset type specified as the
689  key in the dict returned by `~lsst.pipe.base.Task.getAllSchemaCatalogs`.
690  clobber : `bool`, optional
691  A boolean flag that controls what happens if a schema already has been saved:
692  - `True`: overwrite or rename the existing schema, depending on ``doBackup``.
693  - `False`: raise `TaskError` if this schema does not match the existing schema.
694  doBackup : `bool`, optional
695  Set to `True` to backup the schema files if clobbering.
696 
697  Notes
698  -----
699  If ``clobber`` is `False` and an existing schema does not match a current schema,
700  then some schemas may have been saved successfully and others may not, and there is no easy way to
701  tell which is which.
702  """
703  for dataset, catalog in self.getAllSchemaCatalogs().items():
704  schemaDataset = dataset + "_schema"
705  if clobber:
706  butler.put(catalog, schemaDataset, doBackup=doBackup)
707  elif butler.datasetExists(schemaDataset, write=True):
708  oldSchema = butler.get(schemaDataset, immediate=True).getSchema()
709  if not oldSchema.compare(catalog.getSchema(), afwTable.Schema.IDENTICAL):
710  raise TaskError(
711  ("New schema does not match schema %r on disk; schemas must be " +
 712  "consistent within the same output repo (override with --clobber-config)") %
713  (dataset,))
714  else:
715  butler.put(catalog, schemaDataset)
716 
717  def writeMetadata(self, dataRef):
718  """Write the metadata produced from processing the data.
719 
720  Parameters
721  ----------
722  dataRef
723  Butler data reference used to write the metadata.
724  The metadata is written to dataset type `CmdLineTask._getMetadataName`.
725  """
726  try:
727  metadataName = self._getMetadataName()
728  if metadataName is not None:
729  dataRef.put(self.getFullMetadata(), metadataName)
730  except Exception as e:
731  self.log.warn("Could not persist metadata for dataId=%s: %s", dataRef.dataId, e)
732 
733  def writePackageVersions(self, butler, clobber=False, doBackup=True, dataset="packages"):
734  """Compare and write package versions.
735 
736  Parameters
737  ----------
738  butler : `lsst.daf.persistence.Butler`
739  Data butler used to read/write the package versions.
740  clobber : `bool`, optional
741  A boolean flag that controls what happens if versions already have been saved:
742  - `True`: overwrite or rename the existing version info, depending on ``doBackup``.
743  - `False`: raise `TaskError` if this version info does not match the existing.
744  doBackup : `bool`, optional
745  If `True` and clobbering, old package version files are backed up.
746  dataset : `str`, optional
747  Name of dataset to read/write.
748 
749  Raises
750  ------
751  TaskError
752  Raised if there is a version mismatch with current and persisted lists of package versions.
753 
754  Notes
755  -----
756  Note that this operation is subject to a race condition.
757  """
758  packages = Packages.fromSystem()
759 
760  if clobber:
761  return butler.put(packages, dataset, doBackup=doBackup)
762  if not butler.datasetExists(dataset, write=True):
763  return butler.put(packages, dataset)
764 
765  try:
766  old = butler.get(dataset, immediate=True)
767  except Exception as exc:
768  raise type(exc)("Unable to read stored version dataset %s (%s); "
769  "consider using --clobber-versions or --no-versions" %
770  (dataset, exc))
771  # Note that because we can only detect python modules that have been imported, the stored
772  # list of products may be more or less complete than what we have now. What's important is
773  # that the products that are in common have the same version.
774  diff = packages.difference(old)
775  if diff:
776  raise TaskError(
777  "Version mismatch (" +
778  "; ".join("%s: %s vs %s" % (pkg, diff[pkg][1], diff[pkg][0]) for pkg in diff) +
779  "); consider using --clobber-versions or --no-versions")
780  # Update the old set of packages in case we have more packages that haven't been persisted.
781  extra = packages.extra(old)
782  if extra:
783  old.update(packages)
784  butler.put(old, dataset, doBackup=doBackup)
785 
786  def _getConfigName(self):
787  """Get the name of the config dataset type, or `None` if config is not to be persisted.
788 
789  Notes
790  -----
791  The name may depend on the config; that is why this is not a class method.
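
 Subclasses that should not persist their config can simply override this method; for example::

     def _getConfigName(self):
         return None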
792  """
793  return self._DefaultName + "_config"
794 
795  def _getMetadataName(self):
796  """Get the name of the metadata dataset type, or `None` if metadata is not to be persisted.
797 
798  Notes
799  -----
800  The name may depend on the config; that is why this is not a class method.
801  """
802  return self._DefaultName + "_metadata"