lsst.pipe.base  15.0-6-gfa9b38f+2
cmdLineTask.py
Go to the documentation of this file.
1 #
2 # LSST Data Management System
3 # Copyright 2008-2015 AURA/LSST.
4 #
5 # This product includes software developed by the
6 # LSST Project (http://www.lsst.org/).
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the LSST License Statement and
19 # the GNU General Public License along with this program. If not,
20 # see <https://www.lsstcorp.org/LegalNotices/>.
21 #
22 from __future__ import absolute_import, division
23 import sys
24 import traceback
25 import functools
26 import contextlib
27 
28 from builtins import str
29 from builtins import object
30 
31 import lsst.utils
32 from lsst.base import disableImplicitThreading
33 import lsst.afw.table as afwTable
34 from .task import Task, TaskError
35 from .struct import Struct
36 from .argumentParser import ArgumentParser
37 from lsst.base import Packages
38 from lsst.log import Log
39 
40 __all__ = ["CmdLineTask", "TaskRunner", "ButlerInitializedTaskRunner"]
41 
42 
43 def _runPool(pool, timeout, function, iterable):
44  """Wrapper around ``pool.map_async``, to handle timeout
45 
46  This is required so as to trigger an immediate interrupt on the KeyboardInterrupt (Ctrl-C); see
47  http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool
48  """
49  return pool.map_async(function, iterable).get(timeout)
50 
51 
@contextlib.contextmanager
def profile(filename, log=None):
    """Context manager for profiling with cProfile.

    Parameters
    ----------
    filename : `str`
        Filename to which to write profile (profiling disabled if `None` or empty).
    log : `lsst.log.Log`, optional
        Log object for logging the profile operations.

    Notes
    -----
    If profiling is enabled, the context manager returns the `cProfile.Profile` object (otherwise
    it returns `None`), which allows additional control over profiling. You can obtain this using
    the "as" clause, e.g.::

        with profile(filename) as prof:
            runYourCodeHere()

    The output cumulative profile can be printed with a command-line like::

        python -c 'import pstats; pstats.Stats("<filename>").sort_stats("cumtime").print_stats(30)'
    """
    if not filename:
        # Nothing to do
        yield
        return
    from cProfile import Profile
    profile = Profile()
    if log is not None:
        log.info("Enabling cProfile profiling")
    profile.enable()
    try:
        yield profile
    finally:
        # Always stop profiling and persist the stats, even if the managed
        # block raised, so that a partial profile is not silently lost.
        profile.disable()
        profile.dump_stats(filename)
        if log is not None:
            log.info("cProfile stats written to %s" % filename)
89 
90 
class TaskRunner(object):
    """Run a command-line task, using `multiprocessing` if requested.

    Parameters
    ----------
    TaskClass : `lsst.pipe.base.Task` subclass
        The class of the task to run.
    parsedCmd : `argparse.Namespace`
        The parsed command-line arguments, as returned by the task's argument parser's
        `~lsst.pipe.base.ArgumentParser.parse_args` method.

        .. warning::

            Do not store ``parsedCmd``, as this instance is pickled (if multiprocessing) and
            parsedCmd may contain non-picklable elements. It certainly contains more data than we
            need to send to each instance of the task.
    doReturnResults : `bool`, optional
        Should run return the collected result from each invocation of the task? This is only
        intended for unit tests and similar use. It can easily exhaust memory (if the task returns
        enough data and you call it enough times) and it will fail when using multiprocessing if
        the returned data cannot be pickled.

        Note that even if ``doReturnResults`` is False a struct with a single member "exitStatus"
        is returned, with value 0 or 1 to be returned to the unix shell.

    Raises
    ------
    ImportError
        If multiprocessing is requested (and the task supports it) but the multiprocessing library
        cannot be imported.

    Notes
    -----
    Each command-line task (subclass of `lsst.pipe.base.CmdLineTask`) has a task runner. By default
    it is this class, but some tasks require a subclass; see `TaskRunner.getTargetList` for details.

    You may use this task runner for your command-line task if your task has a run method that
    takes exactly one argument: a butler data reference. Otherwise you must provide a task-specific
    subclass of this runner for your task's ``RunnerClass`` that overrides
    `TaskRunner.getTargetList` and possibly `TaskRunner.__call__`.

    Instances of this class must be picklable in order to be compatible with multiprocessing. If
    multiprocessing is requested (``parsedCmd.numProcesses > 1``) then `run` calls
    `prepareForMultiProcessing` to jettison optional non-picklable elements. If your task runner is
    not compatible with multiprocessing then indicate this in your task by setting class variable
    ``canMultiprocess=False``.

    Due to a `python bug`__, handling a `KeyboardInterrupt` properly `requires specifying a
    timeout`__. This timeout (in sec) can be specified as the ``timeout`` element in the output
    from `~lsst.pipe.base.ArgumentParser` (the ``parsedCmd``), if available, otherwise we use
    `TaskRunner.TIMEOUT`.

    By default, we disable "implicit" threading -- ie, as provided by underlying numerical
    libraries such as MKL or BLAS. This is designed to avoid thread contention both when a single
    command line task spawns multiple processes and when multiple users are running on a shared
    system. Users can override this behaviour by setting the ``LSST_ALLOW_IMPLICIT_THREADS``
    environment variable.

    .. __: http://bugs.python.org/issue8296
    .. __: http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool
    """

    TIMEOUT = 3600*24*30
    """Default timeout (seconds) for multiprocessing."""

    def __init__(self, TaskClass, parsedCmd, doReturnResults=False):
        self.TaskClass = TaskClass
        self.doReturnResults = bool(doReturnResults)
        self.config = parsedCmd.config
        self.log = parsedCmd.log
        self.doRaise = bool(parsedCmd.doraise)
        self.clobberConfig = bool(parsedCmd.clobberConfig)
        self.doBackup = not bool(parsedCmd.noBackupConfig)
        self.numProcesses = int(getattr(parsedCmd, 'processes', 1))

        self.timeout = getattr(parsedCmd, 'timeout', None)
        if self.timeout is None or self.timeout <= 0:
            self.timeout = self.TIMEOUT

        if self.numProcesses > 1:
            if not TaskClass.canMultiprocess:
                self.log.warn("This task does not support multiprocessing; using one process")
                self.numProcesses = 1

    def prepareForMultiProcessing(self):
        """Prepare this instance for multiprocessing

        Optional non-picklable elements are removed.

        This is only called if the task is run under multiprocessing.
        """
        # NOTE(review): this ``def`` line was missing from the extracted source
        # (stripped by the documentation generator); restored here.
        self.log = None

    def run(self, parsedCmd):
        """Run the task on all targets.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            Parsed command `argparse.Namespace`.

        Returns
        -------
        resultList : `list`
            A list of results returned by `TaskRunner.__call__`, or an empty list if
            `TaskRunner.__call__` is not called (e.g. if `TaskRunner.precall` returns `False`).
            See `TaskRunner.__call__` for details.

        Notes
        -----
        The task is run under multiprocessing if `TaskRunner.numProcesses` is more than 1;
        otherwise processing is serial.
        """
        resultList = []
        disableImplicitThreading()  # To prevent thread contention
        if self.numProcesses > 1:
            import multiprocessing
            # NOTE(review): this call was missing from the extracted source
            # (stripped by the documentation generator); restored here. It
            # must run before the pool pickles this instance.
            self.prepareForMultiProcessing()
            pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=1)
            mapFunc = functools.partial(_runPool, pool, self.timeout)
        else:
            pool = None
            mapFunc = map

        if self.precall(parsedCmd):
            profileName = parsedCmd.profile if hasattr(parsedCmd, "profile") else None
            log = parsedCmd.log
            targetList = self.getTargetList(parsedCmd)
            if len(targetList) > 0:
                with profile(profileName, log):
                    # Run the task using self.__call__
                    resultList = list(mapFunc(self, targetList))
            else:
                log.warn("Not running the task because there is no data to process; "
                         "you may preview data using \"--show data\"")

        if pool is not None:
            pool.close()
            pool.join()

        return resultList

    @staticmethod
    def getTargetList(parsedCmd, **kwargs):
        """Get a list of (dataRef, kwargs) for `TaskRunner.__call__`.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            The parsed command object returned by
            `lsst.pipe.base.argumentParser.ArgumentParser.parse_args`.
        kwargs
            Any additional keyword arguments. In the default `TaskRunner` this is an empty dict,
            but having it simplifies overriding `TaskRunner` for tasks whose run method takes
            additional arguments (see case (1) below).

        Notes
        -----
        The default implementation of `TaskRunner.getTargetList` and `TaskRunner.__call__` works
        for any command-line task whose run method takes exactly one argument: a data reference.
        Otherwise you must provide a variant of TaskRunner that overrides
        `TaskRunner.getTargetList` and possibly `TaskRunner.__call__`. There are two cases.

        **Case 1**

        If your command-line task has a ``run`` method that takes one data reference followed by
        additional arguments, then you need only override `TaskRunner.getTargetList` to return the
        additional arguments as an argument dict. To make this easier, your overridden version of
        `~TaskRunner.getTargetList` may call `TaskRunner.getTargetList` with the extra arguments as
        keyword arguments. For example, the following adds an argument dict containing a single
        key: "calExpList", whose value is the list of data IDs for the calexp ID argument::

            def getTargetList(parsedCmd):
                return TaskRunner.getTargetList(
                    parsedCmd,
                    calExpList=parsedCmd.calexp.idList
                )

        It is equivalent to this slightly longer version::

            @staticmethod
            def getTargetList(parsedCmd):
                argDict = dict(calExpList=parsedCmd.calexp.idList)
                return [(dataId, argDict) for dataId in parsedCmd.id.idList]

        **Case 2**

        If your task does not meet condition (1) then you must override both
        TaskRunner.getTargetList and `TaskRunner.__call__`. You may do this however you see fit, so
        long as `TaskRunner.getTargetList` returns a list, each of whose elements is sent to
        `TaskRunner.__call__`, which runs your task.
        """
        return [(ref, kwargs) for ref in parsedCmd.id.refList]

    def makeTask(self, parsedCmd=None, args=None):
        """Create a Task instance.

        Parameters
        ----------
        parsedCmd
            Parsed command-line options (used for extra task args by some task runners).
        args
            Args tuple passed to `TaskRunner.__call__` (used for extra task arguments by some task
            runners).

        Notes
        -----
        ``makeTask`` can be called with either the ``parsedCmd`` argument or ``args`` argument set
        to None, but it must construct identical Task instances in either case.

        Subclasses may ignore this method entirely if they reimplement both `TaskRunner.precall`
        and `TaskRunner.__call__`.
        """
        return self.TaskClass(config=self.config, log=self.log)

    def _precallImpl(self, task, parsedCmd):
        """The main work of `precall`.

        We write package versions, schemas and configs, or compare these to existing files on disk
        if present.
        """
        if not parsedCmd.noVersions:
            task.writePackageVersions(parsedCmd.butler, clobber=parsedCmd.clobberVersions)
        task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)
        task.writeSchemas(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)

    def precall(self, parsedCmd):
        """Hook for code that should run exactly once, before multiprocessing.

        Notes
        -----
        Must return True if `TaskRunner.__call__` should subsequently be called.

        .. warning::

            Implementations must take care to ensure that no unpicklable
            attributes are added to the TaskRunner itself, for compatibility
            with multiprocessing.

        The default implementation writes package versions, schemas and configs, or compares them
        to existing files on disk if present.
        """
        task = self.makeTask(parsedCmd=parsedCmd)

        if self.doRaise:
            self._precallImpl(task, parsedCmd)
        else:
            try:
                self._precallImpl(task, parsedCmd)
            except Exception as e:
                task.log.fatal("Failed in task initialization: %s", e)
                if not isinstance(e, TaskError):
                    traceback.print_exc(file=sys.stderr)
                return False
        return True

    def __call__(self, args):
        """Run the Task on a single target.

        Parameters
        ----------
        args
            Arguments for Task.run()

        Returns
        -------
        struct : `lsst.pipe.base.Struct`
            Contains these fields if ``doReturnResults`` is `True`:

            - ``dataRef``: the provided data reference.
            - ``metadata``: task metadata after execution of run.
            - ``result``: result returned by task run, or `None` if the task fails.
            - ``exitStatus``: 0 if the task completed successfully, 1 otherwise.

            If ``doReturnResults`` is `False` the struct contains:

            - ``exitStatus``: 0 if the task completed successfully, 1 otherwise.

        Notes
        -----
        This default implementation assumes that the ``args`` is a tuple containing a data
        reference and a dict of keyword arguments.

        .. warning::

            If you override this method and wish to return something when ``doReturnResults`` is
            `False`, then it must be picklable to support multiprocessing and it should be small
            enough that pickling and unpickling do not add excessive overhead.
        """
        dataRef, kwargs = args
        if self.log is None:
            self.log = Log.getDefaultLogger()
        if hasattr(dataRef, "dataId"):
            self.log.MDC("LABEL", str(dataRef.dataId))
        elif isinstance(dataRef, (list, tuple)):
            self.log.MDC("LABEL", str([ref.dataId for ref in dataRef if hasattr(ref, "dataId")]))
        task = self.makeTask(args=args)
        result = None  # in case the task fails
        exitStatus = 0  # exit status for the shell
        if self.doRaise:
            result = task.run(dataRef, **kwargs)
        else:
            try:
                result = task.run(dataRef, **kwargs)
            except Exception as e:
                # The shell exit value will be the number of dataRefs returning
                # non-zero, so the actual value used here is lost.
                exitStatus = 1

                # don't use a try block as we need to preserve the original exception
                eName = type(e).__name__
                if hasattr(dataRef, "dataId"):
                    task.log.fatal("Failed on dataId=%s: %s: %s", dataRef.dataId, eName, e)
                elif isinstance(dataRef, (list, tuple)):
                    task.log.fatal("Failed on dataIds=[%s]: %s: %s",
                                   ", ".join(str(ref.dataId) for ref in dataRef), eName, e)
                else:
                    task.log.fatal("Failed on dataRef=%s: %s: %s", dataRef, eName, e)

                if not isinstance(e, TaskError):
                    traceback.print_exc(file=sys.stderr)

        # Ensure all errors have been logged and aren't hanging around in a buffer
        sys.stdout.flush()
        sys.stderr.flush()

        task.writeMetadata(dataRef)

        # remove MDC so it does not show up outside of task context
        self.log.MDCRemove("LABEL")

        if self.doReturnResults:
            return Struct(
                exitStatus=exitStatus,
                dataRef=dataRef,
                metadata=task.metadata,
                result=result,
            )
        else:
            return Struct(
                exitStatus=exitStatus,
            )
431 
432 
434  """A `TaskRunner` for `CmdLineTask`\ s that require a ``butler`` keyword argument to be passed to
435  their constructor.
436  """
437 
438  def makeTask(self, parsedCmd=None, args=None):
439  """A variant of the base version that passes a butler argument to the task's constructor.
440 
441  Parameters
442  ----------
443  parsedCmd : `argparse.Namespace`
444  Parsed command-line options, as returned by the `~lsst.pipe.base.ArgumentParser`; if specified
445  then args is ignored.
446  args
447  Other arguments; if ``parsedCmd`` is `None` then this must be specified.
448 
449  Raises
450  ------
451  RuntimeError
452  Raised if ``parsedCmd`` and ``args`` are both `None`.
453  """
454  if parsedCmd is not None:
455  butler = parsedCmd.butler
456  elif args is not None:
457  dataRef, kwargs = args
458  butler = dataRef.butlerSubset.butler
459  else:
460  raise RuntimeError("parsedCmd or args must be specified")
461  return self.TaskClass(config=self.config, log=self.log, butler=butler)
462 
463 
465  """Base class for command-line tasks: tasks that may be executed from the command-line.
466 
467  Notes
468  -----
469  See :ref:`task-framework-overview` to learn what tasks are and :ref:`creating-a-command-line-task` for
470  more information about writing command-line tasks.
471 
472  Subclasses must specify the following class variables:
473 
474  - ``ConfigClass``: configuration class for your task (a subclass of `lsst.pex.config.Config`, or if your
475  task needs no configuration, then `lsst.pex.config.Config` itself).
476  - ``_DefaultName``: default name used for this task (a str).
477 
478  Subclasses may also specify the following class variables:
479 
480  - ``RunnerClass``: a task runner class. The default is ``TaskRunner``, which works for any task
481  with a run method that takes exactly one argument: a data reference. If your task does
482  not meet this requirement then you must supply a variant of ``TaskRunner``; see ``TaskRunner``
483  for more information.
484  - ``canMultiprocess``: the default is `True`; set `False` if your task does not support multiprocessing.
485 
486  Subclasses must specify a method named ``run``:
487 
488  - By default ``run`` accepts a single butler data reference, but you can specify an alternate task runner
489  (subclass of ``TaskRunner``) as the value of class variable ``RunnerClass`` if your run method needs
490  something else.
491  - ``run`` is expected to return its data in a `lsst.pipe.base.Struct`. This provides safety for evolution
492  of the task since new values may be added without harming existing code.
493  - The data returned by ``run`` must be picklable if your task is to support multiprocessing.
494  """
495  RunnerClass = TaskRunner
496  canMultiprocess = True
497 
498  @classmethod
499  def applyOverrides(cls, config):
500  """A hook to allow a task to change the values of its config *after* the camera-specific
501  overrides are loaded but before any command-line overrides are applied.
502 
503  Parameters
504  ----------
505  config : instance of task's ``ConfigClass``
506  Task configuration.
507 
508  Notes
509  -----
510  This is necessary in some cases because the camera-specific overrides may retarget subtasks,
511  wiping out changes made in ConfigClass.setDefaults. See LSST Trac ticket #2282 for more discussion.
512 
513  .. warning::
514 
515  This is called by CmdLineTask.parseAndRun; other ways of constructing a config will not apply
516  these overrides.
517  """
518  pass
519 
520  @classmethod
521  def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False):
522  """Parse an argument list and run the command.
523 
524  Parameters
525  ----------
526  args : `list`, optional
527  List of command-line arguments; if `None` use `sys.argv`.
528  config : `lsst.pex.config.Config`-type, optional
529  Config for task. If `None` use `Task.ConfigClass`.
530  log : `lsst.log.Log`-type, optional
531  Log. If `None` use the default log.
532  doReturnResults : `bool`, optional
533  If `True`, return the results of this task. Default is `False`. This is only intended for
534  unit tests and similar use. It can easily exhaust memory (if the task returns enough data and you
535  call it enough times) and it will fail when using multiprocessing if the returned data cannot be
536  pickled.
537 
538  Returns
539  -------
540  struct : `lsst.pipe.base.Struct`
541  Fields are:
542 
543  - ``argumentParser``: the argument parser.
544  - ``parsedCmd``: the parsed command returned by the argument parser's
545  `lsst.pipe.base.ArgumentParser.parse_args` method.
546  - ``taskRunner``: the task runner used to run the task (an instance of `Task.RunnerClass`).
547  - ``resultList``: results returned by the task runner's ``run`` method, one entry per invocation.
548  This will typically be a list of `None` unless ``doReturnResults`` is `True`;
549  see `Task.RunnerClass` (`TaskRunner` by default) for more information.
550 
551  Notes
552  -----
553  Calling this method with no arguments specified is the standard way to run a command-line task
554  from the command-line. For an example see ``pipe_tasks`` ``bin/makeSkyMap.py`` or almost any other
555  file in that directory.
556 
557  If one or more of the dataIds fails then this routine will exit (with a status giving the
558  number of failed dataIds) rather than returning this struct; this behaviour can be
559  overridden by specifying the ``--noExit`` command-line option.
560  """
561  if args is None:
562  commandAsStr = " ".join(sys.argv)
563  args = sys.argv[1:]
564  else:
565  commandAsStr = "{}{}".format(lsst.utils.get_caller_name(skip=1), tuple(args))
566 
567  argumentParser = cls._makeArgumentParser()
568  if config is None:
569  config = cls.ConfigClass()
570  parsedCmd = argumentParser.parse_args(config=config, args=args, log=log, override=cls.applyOverrides)
571  # print this message after parsing the command so the log is fully configured
572  parsedCmd.log.info("Running: %s", commandAsStr)
573 
574  taskRunner = cls.RunnerClass(TaskClass=cls, parsedCmd=parsedCmd, doReturnResults=doReturnResults)
575  resultList = taskRunner.run(parsedCmd)
576 
577  try:
578  nFailed = sum(((res.exitStatus != 0) for res in resultList))
579  except (TypeError, AttributeError) as e:
580  # NOTE: TypeError if resultList is None, AttributeError if it doesn't have exitStatus.
581  parsedCmd.log.warn("Unable to retrieve exit status (%s); assuming success", e)
582  nFailed = 0
583 
584  if nFailed > 0:
585  if parsedCmd.noExit:
586  parsedCmd.log.error("%d dataRefs failed; not exiting as --noExit was set", nFailed)
587  else:
588  sys.exit(nFailed)
589 
590  return Struct(
591  argumentParser=argumentParser,
592  parsedCmd=parsedCmd,
593  taskRunner=taskRunner,
594  resultList=resultList,
595  )
596 
597  @classmethod
598  def _makeArgumentParser(cls):
599  """Create and return an argument parser.
600 
601  Returns
602  -------
603  parser : `lsst.pipe.base.ArgumentParser`
604  The argument parser for this task.
605 
606  Notes
607  -----
608  By default this returns an `~lsst.pipe.base.ArgumentParser` with one ID argument named `--id` of
609  dataset type ``raw``.
610 
611  Your task subclass may need to override this method to change the dataset type or data ref level,
612  or to add additional data ID arguments. If you add additional data ID arguments or your task's
613  run method takes more than a single data reference then you will also have to provide a task-specific
614  task runner (see TaskRunner for more information).
615  """
616  parser = ArgumentParser(name=cls._DefaultName)
617  parser.add_id_argument(name="--id", datasetType="raw",
618  help="data IDs, e.g. --id visit=12345 ccd=1,2^0,3")
619  return parser
620 
621  def writeConfig(self, butler, clobber=False, doBackup=True):
622  """Write the configuration used for processing the data, or check that an existing
623  one is equal to the new one if present.
624 
625  Parameters
626  ----------
627  butler : `lsst.daf.persistence.Butler`
628  Data butler used to write the config. The config is written to dataset type
629  `CmdLineTask._getConfigName`.
630  clobber : `bool`, optional
631  A boolean flag that controls what happens if a config already has been saved:
632  - `True`: overwrite or rename the existing config, depending on ``doBackup``.
633  - `False`: raise `TaskError` if this config does not match the existing config.
634  doBackup : bool, optional
635  Set to `True` to backup the config files if clobbering.
636  """
637  configName = self._getConfigName()
638  if configName is None:
639  return
640  if clobber:
641  butler.put(self.config, configName, doBackup=doBackup)
642  elif butler.datasetExists(configName, write=True):
643  # this may be subject to a race condition; see #2789
644  try:
645  oldConfig = butler.get(configName, immediate=True)
646  except Exception as exc:
647  raise type(exc)("Unable to read stored config file %s (%s); consider using --clobber-config" %
648  (configName, exc))
649 
650  def logConfigMismatch(msg):
651  self.log.fatal("Comparing configuration: %s", msg)
652 
653  if not self.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
654  raise TaskError(
655  ("Config does not match existing task config %r on disk; tasks configurations " +
656  "must be consistent within the same output repo (override with --clobber-config)") %
657  (configName,))
658  else:
659  butler.put(self.config, configName)
660 
661  def writeSchemas(self, butler, clobber=False, doBackup=True):
662  """Write the schemas returned by `lsst.pipe.base.Task.getAllSchemaCatalogs`.
663 
664  Parameters
665  ----------
666  butler : `lsst.daf.persistence.Butler`
667  Data butler used to write the schema. Each schema is written to the dataset type specified as the
668  key in the dict returned by `~lsst.pipe.base.Task.getAllSchemaCatalogs`.
669  clobber : `bool`, optional
670  A boolean flag that controls what happens if a schema already has been saved:
671  - `True`: overwrite or rename the existing schema, depending on ``doBackup``.
672  - `False`: raise `TaskError` if this schema does not match the existing schema.
673  doBackup : `bool`, optional
674  Set to `True` to backup the schema files if clobbering.
675 
676  Notes
677  -----
678  If ``clobber`` is `False` and an existing schema does not match a current schema,
679  then some schemas may have been saved successfully and others may not, and there is no easy way to
680  tell which is which.
681  """
682  for dataset, catalog in self.getAllSchemaCatalogs().items():
683  schemaDataset = dataset + "_schema"
684  if clobber:
685  butler.put(catalog, schemaDataset, doBackup=doBackup)
686  elif butler.datasetExists(schemaDataset, write=True):
687  oldSchema = butler.get(schemaDataset, immediate=True).getSchema()
688  if not oldSchema.compare(catalog.getSchema(), afwTable.Schema.IDENTICAL):
689  raise TaskError(
690  ("New schema does not match schema %r on disk; schemas must be " +
691  " consistent within the same output repo (override with --clobber-config)") %
692  (dataset,))
693  else:
694  butler.put(catalog, schemaDataset)
695 
696  def writeMetadata(self, dataRef):
697  """Write the metadata produced from processing the data.
698 
699  Parameters
700  ----------
701  dataRef
702  Butler data reference used to write the metadata.
703  The metadata is written to dataset type `CmdLineTask._getMetadataName`.
704  """
705  try:
706  metadataName = self._getMetadataName()
707  if metadataName is not None:
708  dataRef.put(self.getFullMetadata(), metadataName)
709  except Exception as e:
710  self.log.warn("Could not persist metadata for dataId=%s: %s", dataRef.dataId, e)
711 
712  def writePackageVersions(self, butler, clobber=False, doBackup=True, dataset="packages"):
713  """Compare and write package versions.
714 
715  Parameters
716  ----------
717  butler : `lsst.daf.persistence.Butler`
718  Data butler used to read/write the package versions.
719  clobber : `bool`, optional
720  A boolean flag that controls what happens if versions already have been saved:
721  - `True`: overwrite or rename the existing version info, depending on ``doBackup``.
722  - `False`: raise `TaskError` if this version info does not match the existing.
723  doBackup : `bool`, optional
724  If `True` and clobbering, old package version files are backed up.
725  dataset : `str`, optional
726  Name of dataset to read/write.
727 
728  Raises
729  ------
730  TaskError
731  Raised if there is a version mismatch with current and persisted lists of package versions.
732 
733  Notes
734  -----
735  Note that this operation is subject to a race condition.
736  """
737  packages = Packages.fromSystem()
738 
739  if clobber:
740  return butler.put(packages, dataset, doBackup=doBackup)
741  if not butler.datasetExists(dataset, write=True):
742  return butler.put(packages, dataset)
743 
744  try:
745  old = butler.get(dataset, immediate=True)
746  except Exception as exc:
747  raise type(exc)("Unable to read stored version dataset %s (%s); "
748  "consider using --clobber-versions or --no-versions" %
749  (dataset, exc))
750  # Note that because we can only detect python modules that have been imported, the stored
751  # list of products may be more or less complete than what we have now. What's important is
752  # that the products that are in common have the same version.
753  diff = packages.difference(old)
754  if diff:
755  raise TaskError(
756  "Version mismatch (" +
757  "; ".join("%s: %s vs %s" % (pkg, diff[pkg][1], diff[pkg][0]) for pkg in diff) +
758  "); consider using --clobber-versions or --no-versions")
759  # Update the old set of packages in case we have more packages that haven't been persisted.
760  extra = packages.extra(old)
761  if extra:
762  old.update(packages)
763  butler.put(old, dataset, doBackup=doBackup)
764 
765  def _getConfigName(self):
766  """Get the name of the config dataset type, or `None` if config is not to be persisted.
767 
768  Notes
769  -----
770  The name may depend on the config; that is why this is not a class method.
771  """
772  return self._DefaultName + "_config"
773 
774  def _getMetadataName(self):
775  """Get the name of the metadata dataset type, or `None` if metadata is not to be persisted.
776 
777  Notes
778  -----
779  The name may depend on the config; that is why this is not a class method.
780  """
781  return self._DefaultName + "_metadata"
def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False)
Definition: cmdLineTask.py:521
def _precallImpl(self, task, parsedCmd)
Definition: cmdLineTask.py:305
def getFullMetadata(self)
Definition: task.py:213
def writePackageVersions(self, butler, clobber=False, doBackup=True, dataset="packages")
Definition: cmdLineTask.py:712
def getAllSchemaCatalogs(self)
Definition: task.py:191
def writeSchemas(self, butler, clobber=False, doBackup=True)
Definition: cmdLineTask.py:661
def makeTask(self, parsedCmd=None, args=None)
Definition: cmdLineTask.py:285
def __init__(self, TaskClass, parsedCmd, doReturnResults=False)
Definition: cmdLineTask.py:158
def profile(filename, log=None)
Definition: cmdLineTask.py:53
def makeTask(self, parsedCmd=None, args=None)
Definition: cmdLineTask.py:438
def getTargetList(parsedCmd, kwargs)
Definition: cmdLineTask.py:236
def writeConfig(self, butler, clobber=False, doBackup=True)
Definition: cmdLineTask.py:621