lsst.pipe.base  14.0-7-gc1eb65a
cmdLineTask.py
#
# LSST Data Management System
# Copyright 2008-2015 AURA/LSST.
#
# This product includes software developed by the
# LSST Project (http://www.lsst.org/).
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the LSST License Statement and
# the GNU General Public License along with this program. If not,
# see <https://www.lsstcorp.org/LegalNotices/>.
#
from __future__ import absolute_import, division
import sys
import traceback
import functools
import contextlib

from builtins import str
from builtins import object

import lsst.utils
from lsst.base import disableImplicitThreading
import lsst.afw.table as afwTable
from .task import Task, TaskError
from .struct import Struct
from .argumentParser import ArgumentParser
from lsst.base import Packages
from lsst.log import Log

__all__ = ["CmdLineTask", "TaskRunner", "ButlerInitializedTaskRunner"]


def _poolFunctionWrapper(function, arg):
    """Wrapper around function to catch exceptions that don't inherit from `Exception`.

    Such exceptions aren't caught by multiprocessing, which causes the slave process to crash and you end up
    hitting the timeout.
    """
    try:
        return function(arg)
    except Exception:
        raise  # No worries
    except:
        # Need to wrap the exception with something multiprocessing will recognise
        cls, exc, tb = sys.exc_info()
        log = Log.getDefaultLogger()
        log.warn("Unhandled exception %s (%s):\n%s" % (cls.__name__, exc, traceback.format_exc()))
        raise Exception("Unhandled exception: %s (%s)" % (cls.__name__, exc))


def _runPool(pool, timeout, function, iterable):
    """Wrapper around ``pool.map_async``, to handle timeout

    This is required so as to trigger an immediate interrupt on the KeyboardInterrupt (Ctrl-C); see
    http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool

    Further wraps the function in ``_poolFunctionWrapper`` to catch exceptions
    that don't inherit from `Exception`.
    """
    return pool.map_async(functools.partial(_poolFunctionWrapper, function), iterable).get(timeout)


@contextlib.contextmanager
def profile(filename, log=None):
    """Context manager for profiling with cProfile.

    Parameters
    ----------
    filename : `str`
        Filename to which to write profile (profiling disabled if `None` or empty).
    log : `lsst.log.Log`, optional
        Log object for logging the profile operations.

    If profiling is enabled, the context manager returns the cProfile.Profile object (otherwise
    it returns None), which allows additional control over profiling. You can obtain this using
    the "as" clause, e.g.::

        with profile(filename) as prof:
            runYourCodeHere()

    The output cumulative profile can be printed with a command-line like::

        python -c 'import pstats; pstats.Stats("<filename>").sort_stats("cumtime").print_stats(30)'
    """
    if not filename:
        # Nothing to do
        yield
        return
    from cProfile import Profile
    profile = Profile()
    if log is not None:
        log.info("Enabling cProfile profiling")
    profile.enable()
    yield profile
    profile.disable()
    profile.dump_stats(filename)
    if log is not None:
        log.info("cProfile stats written to %s" % filename)


class TaskRunner(object):
    """Run a command-line task, using `multiprocessing` if requested.

    Parameters
    ----------
    TaskClass : `lsst.pipe.base.Task` subclass
        The class of the task to run.
    parsedCmd : `argparse.Namespace`
        The parsed command-line arguments, as returned by the task's argument parser's
        `~lsst.pipe.base.ArgumentParser.parse_args` method.

        .. warning::

           Do not store ``parsedCmd``, as this instance is pickled (if multiprocessing) and parsedCmd may
           contain non-picklable elements. It certainly contains more data than we need to send to each
           instance of the task.
    doReturnResults : `bool`, optional
        Should run return the collected result from each invocation of the task? This is only intended for
        unit tests and similar use. It can easily exhaust memory (if the task returns enough data and you
        call it enough times) and it will fail when using multiprocessing if the returned data cannot be
        pickled.

        Note that even if ``doReturnResults`` is `False` a struct with a single member ``exitStatus`` is
        returned, with value 0 or 1 to be returned to the unix shell.

    Raises
    ------
    ImportError
        If multiprocessing is requested (and the task supports it) but the multiprocessing library cannot be
        imported.

    Notes
    -----
    Each command-line task (subclass of `lsst.pipe.base.CmdLineTask`) has a task runner. By default it is
    this class, but some tasks require a subclass. See the manual :ref:`creating-a-command-line-task` for
    more information. See `CmdLineTask.parseAndRun` to see how a task runner is used.

    You may use this task runner for your command-line task if your task has a run method that takes
    exactly one argument: a butler data reference. Otherwise you must provide a task-specific subclass of
    this runner for your task's ``RunnerClass`` that overrides `TaskRunner.getTargetList` and possibly
    `TaskRunner.__call__`. See `TaskRunner.getTargetList` for details.

    This design matches the common pattern for command-line tasks: the run method takes a single data
    reference, of some suitable name. Additional arguments are rare, and if present, require a subclass of
    `TaskRunner` that passes those additional arguments to the task by name.

    Instances of this class must be picklable in order to be compatible with multiprocessing. If
    multiprocessing is requested (``parsedCmd.numProcesses > 1``) then `run` calls
    `prepareForMultiProcessing` to jettison optional non-picklable elements. If your task runner is not
    compatible with multiprocessing then indicate this in your task by setting class variable
    ``canMultiprocess=False``.

    Due to a `python bug`__, handling a `KeyboardInterrupt` properly `requires specifying a timeout`__. This
    timeout (in sec) can be specified as the ``timeout`` element in the output from
    `~lsst.pipe.base.ArgumentParser` (the ``parsedCmd``), if available, otherwise we use
    `TaskRunner.TIMEOUT`.

    By default, we disable "implicit" threading -- ie, as provided by underlying numerical libraries such as
    MKL or BLAS. This is designed to avoid thread contention both when a single command line task spawns
    multiple processes and when multiple users are running on a shared system. Users can override this
    behaviour by setting the ``LSST_ALLOW_IMPLICIT_THREADS`` environment variable.

    .. __: http://bugs.python.org/issue8296
    .. __: http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool
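
    For example, a command-line task can select this runner explicitly and opt out of multiprocessing by
    setting class variables, roughly as sketched below (``MyTask`` and ``MyConfig`` are hypothetical names,
    not part of this module)::

        class MyTask(CmdLineTask):
            ConfigClass = MyConfig          # the task's config class
            _DefaultName = "myTask"
            RunnerClass = TaskRunner        # the default; replace with a subclass if run needs extra args
            canMultiprocess = False         # this task must not be run with more than one process

            def run(self, dataRef):
                # process a single butler data reference; results go back in a Struct
                return Struct()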
    """

    TIMEOUT = 3600*24*30
    """Default timeout (seconds) for multiprocessing."""

    def __init__(self, TaskClass, parsedCmd, doReturnResults=False):
        self.TaskClass = TaskClass
        self.doReturnResults = bool(doReturnResults)
        self.config = parsedCmd.config
        self.log = parsedCmd.log
        self.doRaise = bool(parsedCmd.doraise)
        self.clobberConfig = bool(parsedCmd.clobberConfig)
        self.doBackup = not bool(parsedCmd.noBackupConfig)
        self.numProcesses = int(getattr(parsedCmd, 'processes', 1))

        self.timeout = getattr(parsedCmd, 'timeout', None)
        if self.timeout is None or self.timeout <= 0:
            self.timeout = self.TIMEOUT

        if self.numProcesses > 1:
            if not TaskClass.canMultiprocess:
                self.log.warn("This task does not support multiprocessing; using one process")
                self.numProcesses = 1

    def prepareForMultiProcessing(self):
        """Prepare this instance for multiprocessing

        Optional non-picklable elements are removed.

        This is only called if the task is run under multiprocessing.
        """
        self.log = None

    def run(self, parsedCmd):
        """Run the task on all targets.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            Parsed command-line arguments.

        Returns
        -------
        resultList : `list`
            A list of results returned by `TaskRunner.__call__`, or an empty list if `TaskRunner.__call__`
            is not called (e.g. if `TaskRunner.precall` returns `False`). See `TaskRunner.__call__`
            for details.

        Notes
        -----
        The task is run under multiprocessing if `TaskRunner.numProcesses` is more than 1; otherwise
        processing is serial.
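
        `CmdLineTask.parseAndRun` normally constructs and invokes the runner for you, but it can also be
        driven directly; a minimal sketch (assuming a hypothetical ``MyTask`` whose argument parser has
        already produced ``parsedCmd``)::

            runner = MyTask.RunnerClass(TaskClass=MyTask, parsedCmd=parsedCmd)
            resultList = runner.run(parsedCmd)
            nFailed = sum(res.exitStatus != 0 for res in resultList)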
        """
        resultList = []
        disableImplicitThreading()  # To prevent thread contention
        if self.numProcesses > 1:
            import multiprocessing
            self.prepareForMultiProcessing()
            pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=1)
            mapFunc = functools.partial(_runPool, pool, self.timeout)
        else:
            pool = None
            mapFunc = map

        if self.precall(parsedCmd):
            profileName = parsedCmd.profile if hasattr(parsedCmd, "profile") else None
            log = parsedCmd.log
            targetList = self.getTargetList(parsedCmd)
            if len(targetList) > 0:
                with profile(profileName, log):
                    # Run the task using self.__call__
                    resultList = list(mapFunc(self, targetList))
            else:
                log.warn("Not running the task because there is no data to process; "
                         "you may preview data using \"--show data\"")

        if pool is not None:
            pool.close()
            pool.join()

        return resultList

    @staticmethod
    def getTargetList(parsedCmd, **kwargs):
        """Get a list of (dataRef, kwargs) for `TaskRunner.__call__`.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            The parsed command object returned by `lsst.pipe.base.argumentParser.ArgumentParser.parse_args`.
        kwargs
            Any additional keyword arguments. In the default `TaskRunner` this is an empty dict, but having
            it simplifies overriding `TaskRunner` for tasks whose run method takes additional arguments
            (see case (1) below).

        Notes
        -----
        The default implementation of `TaskRunner.getTargetList` and `TaskRunner.__call__` works for any
        command-line task whose run method takes exactly one argument: a data reference. Otherwise you
        must provide a variant of `TaskRunner` that overrides `TaskRunner.getTargetList` and possibly
        `TaskRunner.__call__`. There are two cases.

        **Case 1**

        If your command-line task has a ``run`` method that takes one data reference followed by additional
        arguments, then you need only override `TaskRunner.getTargetList` to return the additional arguments
        as an argument dict. To make this easier, your overridden version of `~TaskRunner.getTargetList` may
        call `TaskRunner.getTargetList` with the extra arguments as keyword arguments. For example, the
        following adds an argument dict containing a single key: "calExpList", whose value is the list of
        data IDs for the calexp ID argument::

            def getTargetList(parsedCmd):
                return TaskRunner.getTargetList(
                    parsedCmd,
                    calExpList=parsedCmd.calexp.idList
                )

        It is equivalent to this slightly longer version::

            @staticmethod
            def getTargetList(parsedCmd):
                argDict = dict(calExpList=parsedCmd.calexp.idList)
                return [(dataId, argDict) for dataId in parsedCmd.id.idList]

        **Case 2**

        If your task does not meet condition (1) then you must override both `TaskRunner.getTargetList` and
        `TaskRunner.__call__`. You may do this however you see fit, so long as `TaskRunner.getTargetList`
        returns a list, each of whose elements is sent to `TaskRunner.__call__`, which runs your task.
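
        For example, a pair of overrides along these lines would pass every target an extra list of data
        references (a sketch only; ``selectId`` and ``selectDataList`` are illustrative names, not part of
        this package)::

            @staticmethod
            def getTargetList(parsedCmd, **kwargs):
                # each element carries the data reference plus an extra payload
                return [(ref, parsedCmd.selectId.refList, kwargs) for ref in parsedCmd.id.refList]

            def __call__(self, args):
                dataRef, selectDataList, kwargs = args
                task = self.makeTask(args=args)
                task.run(dataRef, selectDataList=selectDataList, **kwargs)
                return Struct(exitStatus=0)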
        """
        return [(ref, kwargs) for ref in parsedCmd.id.refList]

    def makeTask(self, parsedCmd=None, args=None):
        """Create a Task instance.

        Parameters
        ----------
        parsedCmd
            Parsed command-line options (used for extra task args by some task runners).
        args
            Args tuple passed to `TaskRunner.__call__` (used for extra task arguments by some task runners).

        Notes
        -----
        ``makeTask`` can be called with either the ``parsedCmd`` argument or ``args`` argument set to None,
        but it must construct identical Task instances in either case.

        Subclasses may ignore this method entirely if they reimplement both `TaskRunner.precall` and
        `TaskRunner.__call__`.
        """
        return self.TaskClass(config=self.config, log=self.log)

    def _precallImpl(self, task, parsedCmd):
        """The main work of `precall`.

        We write package versions, schemas and configs, or compare these to existing files on disk if
        present.
        """
        if not parsedCmd.noVersions:
            task.writePackageVersions(parsedCmd.butler, clobber=parsedCmd.clobberVersions)
        task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)
        task.writeSchemas(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)

    def precall(self, parsedCmd):
        """Hook for code that should run exactly once, before multiprocessing.

        Notes
        -----
        Must return True if `TaskRunner.__call__` should subsequently be called.

        .. warning::

           Implementations must take care to ensure that no unpicklable
           attributes are added to the TaskRunner itself, for compatibility
           with multiprocessing.

        The default implementation writes package versions, schemas and configs, or compares them to
        existing files on disk if present.
        """
        task = self.makeTask(parsedCmd=parsedCmd)

        if self.doRaise:
            self._precallImpl(task, parsedCmd)
        else:
            try:
                self._precallImpl(task, parsedCmd)
            except Exception as e:
                task.log.fatal("Failed in task initialization: %s", e)
                if not isinstance(e, TaskError):
                    traceback.print_exc(file=sys.stderr)
                return False
        return True

    def __call__(self, args):
        """Run the Task on a single target.

        Parameters
        ----------
        args
            Arguments for Task.run()

        Returns
        -------
        struct : `lsst.pipe.base.Struct`
            Contains these fields if ``doReturnResults`` is `True`:

            - ``dataRef``: the provided data reference.
            - ``metadata``: task metadata after execution of run.
            - ``result``: result returned by task run, or `None` if the task fails.
            - ``exitStatus``: 0 if the task completed successfully, 1 otherwise.

            If ``doReturnResults`` is `False` the struct contains:

            - ``exitStatus``: 0 if the task completed successfully, 1 otherwise.

        Notes
        -----
        This default implementation assumes that ``args`` is a tuple
        containing a data reference and a dict of keyword arguments.

        .. warning::

           If you override this method and wish to return something when ``doReturnResults`` is `False`,
           then it must be picklable to support multiprocessing and it should be small enough that pickling
           and unpickling do not add excessive overhead.
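
        For example, with the default `TaskRunner.getTargetList` each element of ``args`` is a tuple of the
        form shown below (an illustrative sketch; ``calExpIdList`` is a hypothetical name)::

            (dataRef, {})                             # no extra keyword arguments
            (dataRef, {"calExpList": calExpIdList})   # with the Case 1 override in getTargetList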
        """
        dataRef, kwargs = args
        if self.log is None:
            self.log = Log.getDefaultLogger()
        if hasattr(dataRef, "dataId"):
            self.log.MDC("LABEL", str(dataRef.dataId))
        elif isinstance(dataRef, (list, tuple)):
            self.log.MDC("LABEL", str([ref.dataId for ref in dataRef if hasattr(ref, "dataId")]))
        task = self.makeTask(args=args)
        result = None  # in case the task fails
        exitStatus = 0  # exit status for the shell
        if self.doRaise:
            result = task.run(dataRef, **kwargs)
        else:
            try:
                result = task.run(dataRef, **kwargs)
            except Exception as e:
                # The shell exit value will be the number of dataRefs returning
                # non-zero, so the actual value used here is lost.
                exitStatus = 1

                # don't use a try block as we need to preserve the original exception
                eName = type(e).__name__
                if hasattr(dataRef, "dataId"):
                    task.log.fatal("Failed on dataId=%s: %s: %s", dataRef.dataId, eName, e)
                elif isinstance(dataRef, (list, tuple)):
                    task.log.fatal("Failed on dataIds=[%s]: %s: %s",
                                   ", ".join(str(ref.dataId) for ref in dataRef), eName, e)
                else:
                    task.log.fatal("Failed on dataRef=%s: %s: %s", dataRef, eName, e)

                if not isinstance(e, TaskError):
                    traceback.print_exc(file=sys.stderr)
        task.writeMetadata(dataRef)

        # remove MDC so it does not show up outside of task context
        self.log.MDCRemove("LABEL")

        if self.doReturnResults:
            return Struct(
                exitStatus=exitStatus,
                dataRef=dataRef,
                metadata=task.metadata,
                result=result,
            )
        else:
            return Struct(
                exitStatus=exitStatus,
            )


class ButlerInitializedTaskRunner(TaskRunner):
    """A TaskRunner for `CmdLineTask`\ s that require a ``butler`` keyword argument to be passed to
    their constructor.
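
    For example, a task whose constructor needs a butler (say, to read an input schema) might look like
    this sketch (``MyButlerTask`` and ``MyConfig`` are hypothetical names, not part of this module)::

        class MyButlerTask(CmdLineTask):
            ConfigClass = MyConfig
            _DefaultName = "myButlerTask"
            RunnerClass = ButlerInitializedTaskRunner

            def __init__(self, butler=None, **kwargs):
                CmdLineTask.__init__(self, **kwargs)
                # use the butler during construction, e.g. to load a reference schema
                self.butler = butler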
    """

    def makeTask(self, parsedCmd=None, args=None):
        """A variant of the base version that passes a butler argument to the task's constructor.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            Parsed command-line options, as returned by the `~lsst.pipe.base.ArgumentParser`; if specified
            then args is ignored.
        args
            Other arguments; if ``parsedCmd`` is `None` then this must be specified.

        Raises
        ------
        RuntimeError
            Raised if ``parsedCmd`` and ``args`` are both `None`.
        """
        if parsedCmd is not None:
            butler = parsedCmd.butler
        elif args is not None:
            dataRef, kwargs = args
            butler = dataRef.butlerSubset.butler
        else:
            raise RuntimeError("parsedCmd or args must be specified")
        return self.TaskClass(config=self.config, log=self.log, butler=butler)


class CmdLineTask(Task):
    """Base class for command-line tasks: tasks that may be executed from the command-line.

    Notes
    -----
    See :ref:`task-framework-overview` to learn what tasks are and :ref:`creating-a-command-line-task` for
    more information about writing command-line tasks.

    Subclasses must specify the following class variables:

    - ``ConfigClass``: configuration class for your task (a subclass of `lsst.pex.config.Config`, or if your
      task needs no configuration, then `lsst.pex.config.Config` itself).
    - ``_DefaultName``: default name used for this task (a `str`).

    Subclasses may also specify the following class variables:

    - ``RunnerClass``: a task runner class. The default is ``TaskRunner``, which works for any task
      with a run method that takes exactly one argument: a data reference. If your task does
      not meet this requirement then you must supply a variant of ``TaskRunner``; see ``TaskRunner``
      for more information.
    - ``canMultiprocess``: the default is `True`; set `False` if your task does not support multiprocessing.

    Subclasses must specify a method named ``run``:

    - By default ``run`` accepts a single butler data reference, but you can specify an alternate task
      runner (subclass of ``TaskRunner``) as the value of class variable ``RunnerClass`` if your run method
      needs something else.
    - ``run`` is expected to return its data in a `lsst.pipe.base.Struct`. This provides safety for
      evolution of the task since new values may be added without harming existing code.
    - The data returned by ``run`` must be picklable if your task is to support multiprocessing.
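
    For example, a minimal command-line task might look like this (a sketch only; ``ExampleTask``,
    ``ExampleConfig`` and the ``doLog`` field are hypothetical, not part of this package)::

        import lsst.pex.config as pexConfig
        import lsst.pipe.base as pipeBase

        class ExampleConfig(pexConfig.Config):
            doLog = pexConfig.Field(dtype=bool, default=True, doc="log the dataId being processed?")

        class ExampleTask(pipeBase.CmdLineTask):
            ConfigClass = ExampleConfig
            _DefaultName = "example"

            def run(self, dataRef):
                # process one butler data reference; results go back in a Struct
                if self.config.doLog:
                    self.log.info("Processing %s", dataRef.dataId)
                return pipeBase.Struct(dataId=dataRef.dataId)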
510  """
511  RunnerClass = TaskRunner
512  canMultiprocess = True
513 
    @classmethod
    def applyOverrides(cls, config):
        """A hook to allow a task to change the values of its config *after* the camera-specific
        overrides are loaded but before any command-line overrides are applied.

        Parameters
        ----------
        config : instance of task's ``ConfigClass``
            Task configuration.

        Notes
        -----
        This is necessary in some cases because the camera-specific overrides may retarget subtasks,
        wiping out changes made in ConfigClass.setDefaults. See LSST Trac ticket #2282 for more discussion.

        .. warning::

           This is called by CmdLineTask.parseAndRun; other ways of constructing a config will not apply
           these overrides.
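
        For example, a subclass could force a config value regardless of the camera defaults (a sketch
        only; ``doWriteThumbnails`` is a hypothetical config field)::

            @classmethod
            def applyOverrides(cls, config):
                config.doWriteThumbnails = False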
        """
        pass

    @classmethod
    def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False):
        """Parse an argument list and run the command.

        Parameters
        ----------
        args : `list`, optional
            List of command-line arguments; if `None` use `sys.argv`.
        config : `lsst.pex.config.Config`-type, optional
            Config for task. If `None` use `Task.ConfigClass`.
        log : `lsst.log.Log`-type, optional
            Log. If `None` use the default log.
        doReturnResults : `bool`, optional
            If `True`, return the results of this task. Default is `False`. This is only intended for
            unit tests and similar use. It can easily exhaust memory (if the task returns enough data and
            you call it enough times) and it will fail when using multiprocessing if the returned data
            cannot be pickled.

        Returns
        -------
        struct : `lsst.pipe.base.Struct`
            Fields are:

            - ``argumentParser``: the argument parser.
            - ``parsedCmd``: the parsed command returned by the argument parser's
              `lsst.pipe.base.ArgumentParser.parse_args` method.
            - ``taskRunner``: the task runner used to run the task (an instance of `Task.RunnerClass`).
            - ``resultList``: results returned by the task runner's ``run`` method, one entry per
              invocation. This will typically be a list of `None` unless ``doReturnResults`` is `True`;
              see `Task.RunnerClass` (`TaskRunner` by default) for more information.

        Notes
        -----
        Calling this method with no arguments specified is the standard way to run a command-line task
        from the command-line. For an example see ``pipe_tasks`` ``bin/makeSkyMap.py`` or almost any other
        file in that directory.

        If one or more of the dataIds fails then this routine will exit (with a status giving the
        number of failed dataIds) rather than returning this struct; this behaviour can be
        overridden by specifying the ``--noExit`` command-line option.
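
        For example, the executable script for a task is typically just a few lines (a sketch only;
        the module path and ``ExampleTask`` are hypothetical names)::

            #!/usr/bin/env python
            from lsst.example.package.exampleTask import ExampleTask
            ExampleTask.parseAndRun()

        and results can be captured programmatically, e.g. in a unit test (``repoPath`` is a hypothetical
        repository path)::

            results = ExampleTask.parseAndRun(args=[repoPath, "--id", "visit=1"], doReturnResults=True)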
        """
        if args is None:
            commandAsStr = " ".join(sys.argv)
            args = sys.argv[1:]
        else:
            commandAsStr = "{}{}".format(lsst.utils.get_caller_name(skip=1), tuple(args))

        argumentParser = cls._makeArgumentParser()
        if config is None:
            config = cls.ConfigClass()
        parsedCmd = argumentParser.parse_args(config=config, args=args, log=log, override=cls.applyOverrides)
        # print this message after parsing the command so the log is fully configured
        parsedCmd.log.info("Running: %s", commandAsStr)

        taskRunner = cls.RunnerClass(TaskClass=cls, parsedCmd=parsedCmd, doReturnResults=doReturnResults)
        resultList = taskRunner.run(parsedCmd)

        try:
            nFailed = sum(((res.exitStatus != 0) for res in resultList))
        except (TypeError, AttributeError) as e:
            # NOTE: TypeError if resultList is None, AttributeError if it doesn't have exitStatus.
            parsedCmd.log.warn("Unable to retrieve exit status (%s); assuming success", e)
            nFailed = 0

        if nFailed > 0:
            if parsedCmd.noExit:
                parsedCmd.log.error("%d dataRefs failed; not exiting as --noExit was set", nFailed)
            else:
                sys.exit(nFailed)

        return Struct(
            argumentParser=argumentParser,
            parsedCmd=parsedCmd,
            taskRunner=taskRunner,
            resultList=resultList,
        )

    @classmethod
    def _makeArgumentParser(cls):
        """Create and return an argument parser.

        Returns
        -------
        parser : `lsst.pipe.base.ArgumentParser`
            The argument parser for this task.

        Notes
        -----
        By default this returns an `~lsst.pipe.base.ArgumentParser` with one ID argument named ``--id`` of
        dataset type ``raw``.

        Your task subclass may need to override this method to change the dataset type or data ref level,
        or to add additional data ID arguments. If you add additional data ID arguments or your task's
        run method takes more than a single data reference then you will also have to provide a
        task-specific task runner (see TaskRunner for more information).
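
        For example, a task that operates on calibrated exposures might override this along these lines
        (a sketch only; the dataset type and help text are illustrative)::

            @classmethod
            def _makeArgumentParser(cls):
                parser = ArgumentParser(name=cls._DefaultName)
                parser.add_id_argument(name="--id", datasetType="calexp",
                                       help="data IDs, e.g. --id visit=12345 ccd=1,2")
                return parser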
        """
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument(name="--id", datasetType="raw",
                               help="data IDs, e.g. --id visit=12345 ccd=1,2^0,3")
        return parser

    def writeConfig(self, butler, clobber=False, doBackup=True):
        """Write the configuration used for processing the data, or check that an existing
        one is equal to the new one if present.

        Parameters
        ----------
        butler : `lsst.daf.persistence.Butler`
            Data butler used to write the config. The config is written to dataset type
            `CmdLineTask._getConfigName`.
        clobber : `bool`, optional
            A boolean flag that controls what happens if a config already has been saved:

            - `True`: overwrite or rename the existing config, depending on ``doBackup``.
            - `False`: raise `TaskError` if this config does not match the existing config.
        doBackup : `bool`, optional
            Set to `True` to backup the config files if clobbering.
        """
        configName = self._getConfigName()
        if configName is None:
            return
        if clobber:
            butler.put(self.config, configName, doBackup=doBackup)
        elif butler.datasetExists(configName, write=True):
            # this may be subject to a race condition; see #2789
            try:
                oldConfig = butler.get(configName, immediate=True)
            except Exception as exc:
                raise type(exc)("Unable to read stored config file %s (%s); consider using --clobber-config" %
                                (configName, exc))

            def logConfigMismatch(msg):
                self.log.fatal("Comparing configuration: %s", msg)

            if not self.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
                raise TaskError(
                    ("Config does not match existing task config %r on disk; task configurations " +
                     "must be consistent within the same output repo (override with --clobber-config)") %
                    (configName,))
        else:
            butler.put(self.config, configName)

    def writeSchemas(self, butler, clobber=False, doBackup=True):
        """Write the schemas returned by `lsst.pipe.base.Task.getAllSchemaCatalogs`.

        Parameters
        ----------
        butler : `lsst.daf.persistence.Butler`
            Data butler used to write the schema. Each schema is written to the dataset type specified as
            the key in the dict returned by `~lsst.pipe.base.Task.getAllSchemaCatalogs`.
        clobber : `bool`, optional
            A boolean flag that controls what happens if a schema already has been saved:

            - `True`: overwrite or rename the existing schema, depending on ``doBackup``.
            - `False`: raise `TaskError` if this schema does not match the existing schema.
        doBackup : `bool`, optional
            Set to `True` to backup the schema files if clobbering.

        Notes
        -----
        If ``clobber`` is `False` and an existing schema does not match a current schema,
        then some schemas may have been saved successfully and others may not, and there is no easy way to
        tell which is which.
        """
        for dataset, catalog in self.getAllSchemaCatalogs().items():
            schemaDataset = dataset + "_schema"
            if clobber:
                butler.put(catalog, schemaDataset, doBackup=doBackup)
            elif butler.datasetExists(schemaDataset, write=True):
                oldSchema = butler.get(schemaDataset, immediate=True).getSchema()
                if not oldSchema.compare(catalog.getSchema(), afwTable.Schema.IDENTICAL):
                    raise TaskError(
                        ("New schema does not match schema %r on disk; schemas must be " +
                         "consistent within the same output repo (override with --clobber-config)") %
                        (dataset,))
            else:
                butler.put(catalog, schemaDataset)

    def writeMetadata(self, dataRef):
        """Write the metadata produced from processing the data.

        Parameters
        ----------
        dataRef
            Butler data reference used to write the metadata.
            The metadata is written to dataset type `CmdLineTask._getMetadataName`.
        """
        try:
            metadataName = self._getMetadataName()
            if metadataName is not None:
                dataRef.put(self.getFullMetadata(), metadataName)
        except Exception as e:
            self.log.warn("Could not persist metadata for dataId=%s: %s", dataRef.dataId, e)

    def writePackageVersions(self, butler, clobber=False, doBackup=True, dataset="packages"):
        """Compare and write package versions.

        Parameters
        ----------
        butler : `lsst.daf.persistence.Butler`
            Data butler used to read/write the package versions.
        clobber : `bool`, optional
            A boolean flag that controls what happens if versions already have been saved:

            - `True`: overwrite or rename the existing version info, depending on ``doBackup``.
            - `False`: raise `TaskError` if this version info does not match the existing.
        doBackup : `bool`, optional
            If `True` and clobbering, old package version files are backed up.
        dataset : `str`, optional
            Name of dataset to read/write.

        Raises
        ------
        TaskError
            Raised if there is a version mismatch with current and persisted lists of package versions.

        Notes
        -----
        Note that this operation is subject to a race condition.
        """
        packages = Packages.fromSystem()

        if clobber:
            return butler.put(packages, dataset, doBackup=doBackup)
        if not butler.datasetExists(dataset, write=True):
            return butler.put(packages, dataset)

        try:
            old = butler.get(dataset, immediate=True)
        except Exception as exc:
            raise type(exc)("Unable to read stored version dataset %s (%s); "
                            "consider using --clobber-versions or --no-versions" %
                            (dataset, exc))
        # Note that because we can only detect python modules that have been imported, the stored
        # list of products may be more or less complete than what we have now. What's important is
        # that the products that are in common have the same version.
        diff = packages.difference(old)
        if diff:
            raise TaskError(
                "Version mismatch (" +
                "; ".join("%s: %s vs %s" % (pkg, diff[pkg][1], diff[pkg][0]) for pkg in diff) +
                "); consider using --clobber-versions or --no-versions")
        # Update the old set of packages in case we have more packages that haven't been persisted.
        extra = packages.extra(old)
        if extra:
            old.update(packages)
            butler.put(old, dataset, doBackup=doBackup)

    def _getConfigName(self):
        """Get the name of the config dataset type, or `None` if config is not to be persisted.

        Notes
        -----
        The name may depend on the config; that is why this is not a class method.
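
        For example, a subclass that should not persist its config at all can simply return `None`
        (a sketch)::

            def _getConfigName(self):
                # disable config persistence for this task
                return None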
        """
        return self._DefaultName + "_config"

    def _getMetadataName(self):
        """Get the name of the metadata dataset type, or `None` if metadata is not to be persisted.

        Notes
        -----
        The name may depend on the config; that is why this is not a class method.
        """
        return self._DefaultName + "_metadata"