lsst.pipe.base  13.0-12-gaf0c0ec+9
 All Classes Namespaces Files Functions Variables Pages
cmdLineTask.py
Go to the documentation of this file.
1 #
2 # LSST Data Management System
3 # Copyright 2008-2015 AURA/LSST.
4 #
5 # This product includes software developed by the
6 # LSST Project (http://www.lsst.org/).
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the LSST License Statement and
19 # the GNU General Public License along with this program. If not,
20 # see <https://www.lsstcorp.org/LegalNotices/>.
21 #
22 from __future__ import absolute_import, division
23 import sys
24 import traceback
25 import functools
26 import contextlib
27 
28 from builtins import str
29 from builtins import object
30 
31 import lsst.utils
32 from lsst.base import disableImplicitThreading
33 import lsst.afw.table as afwTable
34 from .task import Task, TaskError
35 from .struct import Struct
36 from .argumentParser import ArgumentParser
37 from lsst.base import Packages
38 from lsst.log import Log
39 
40 __all__ = ["CmdLineTask", "TaskRunner", "ButlerInitializedTaskRunner"]
41 
42 
43 def _poolFunctionWrapper(function, arg):
44  """Wrapper around function to catch exceptions that don't inherit from Exception
45 
46  Such exceptions aren't caught by multiprocessing, which causes the slave
47  process to crash and you end up hitting the timeout.
48  """
49  try:
50  return function(arg)
51  except Exception:
52  raise # No worries
53  except:
54  # Need to wrap the exception with something multiprocessing will recognise
55  cls, exc, tb = sys.exc_info()
56  log = Log.getDefaultLogger()
57  log.warn("Unhandled exception %s (%s):\n%s" % (cls.__name__, exc, traceback.format_exc()))
58  raise Exception("Unhandled exception: %s (%s)" % (cls.__name__, exc))
59 
60 
def _runPool(pool, timeout, function, iterable):
    """Apply ``function`` to every item of ``iterable`` via pool.map_async, honouring a timeout.

    The timeout is required so that a KeyboardInterrupt (Ctrl-C) is delivered
    promptly rather than being swallowed by multiprocessing; see
    http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool

    The function is additionally wrapped in _poolFunctionWrapper so that
    exceptions not derived from Exception are converted into ones that
    multiprocessing can propagate.
    """
    wrappedFunc = functools.partial(_poolFunctionWrapper, function)
    asyncResult = pool.map_async(wrappedFunc, iterable)
    return asyncResult.get(timeout)
71 
72 
@contextlib.contextmanager
def profile(filename, log=None):
    """!Context manager for profiling with cProfile

    @param filename filename to which to write profile (profiling disabled if None or empty)
    @param log log object for logging the profile operations

    If profiling is enabled, yields the cProfile.Profile object (otherwise
    yields None), so the caller may exercise additional control over profiling
    via the "as" clause, e.g.:

    with profile(filename) as prof:
        runYourCodeHere()

    The output cumulative profile can be printed with a command-line like:

    python -c 'import pstats; pstats.Stats("<filename>").sort_stats("cumtime").print_stats(30)'
    """
    if not filename:
        # Profiling disabled: still behave as a context manager, yielding None
        yield
        return
    from cProfile import Profile
    prof = Profile()
    if log is not None:
        log.info("Enabling cProfile profiling")
    prof.enable()
    yield prof
    # Only reached on a normal exit of the managed block: if the block raises,
    # the profile stats are not written (matches the original control flow).
    prof.disable()
    prof.dump_stats(filename)
    if log is not None:
        log.info("cProfile stats written to %s" % filename)
105 
106 
class TaskRunner(object):
    """Run a command-line task, using multiprocessing if requested.

    Each command-line task (subclass of CmdLineTask) has a task runner. By
    default it is this class, but some tasks require a subclass. See the
    manual "how to write a command-line task" in the pipe_tasks documentation
    for more information. See CmdLineTask.parseAndRun to see how a task runner
    is used.

    You may use this task runner for your command-line task if your task has
    a run method that takes exactly one argument: a butler data reference.
    Otherwise you must provide a task-specific subclass of this runner for
    your task's `RunnerClass` that overrides TaskRunner.getTargetList and
    possibly TaskRunner.\\_\\_call\\_\\_. See TaskRunner.getTargetList for
    details.

    This design matches the common pattern for command-line tasks: the run
    method takes a single data reference, of some suitable name. Additional
    arguments are rare, and if present, require a subclass of TaskRunner that
    calls these additional arguments by name.

    Instances of this class must be picklable in order to be compatible with
    multiprocessing. If multiprocessing is requested
    (parsedCmd.numProcesses > 1) then run() calls prepareForMultiProcessing
    to jettison optional non-picklable elements. If your task runner is not
    compatible with multiprocessing then indicate this in your task by setting
    class variable canMultiprocess=False.

    Due to a python bug [1], handling a KeyboardInterrupt properly requires
    specifying a timeout [2]. This timeout (in sec) can be specified as the
    "timeout" element in the output from ArgumentParser (the "parsedCmd"), if
    available, otherwise we use TaskRunner.TIMEOUT.

    [1] http://bugs.python.org/issue8296
    [2] http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool)
    """
    TIMEOUT = 3600*24*30  # Default timeout (sec) for multiprocessing

    def __init__(self, TaskClass, parsedCmd, doReturnResults=False):
        """!Construct a TaskRunner

        @warning Do not store parsedCmd, as this instance is pickled (if
        multiprocessing) and parsedCmd may contain non-picklable elements.
        It certainly contains more data than we need to send to each
        instance of the task.

        @param TaskClass The class of the task to run
        @param parsedCmd The parsed command-line arguments, as returned by
            the task's argument parser's parse_args method.
        @param doReturnResults Should run return the collected result from
            each invocation of the task? This is only intended for unit tests
            and similar use. It can easily exhaust memory (if the task
            returns enough data and you call it enough times) and it will
            fail when using multiprocessing if the returned data cannot be
            pickled.

        Note that even if doReturnResults is False a struct with a single
        member "exitStatus" is returned, with value 0 or 1 to be returned
        to the unix shell.

        @throws ImportError if multiprocessing requested (and the task
            supports it) but the multiprocessing library cannot be
            imported.
        """
        self.TaskClass = TaskClass
        self.doReturnResults = bool(doReturnResults)
        self.config = parsedCmd.config
        self.log = parsedCmd.log
        self.doRaise = bool(parsedCmd.doraise)
        self.clobberConfig = bool(parsedCmd.clobberConfig)
        self.doBackup = not bool(parsedCmd.noBackupConfig)
        self.numProcesses = int(getattr(parsedCmd, 'processes', 1))

        self.timeout = getattr(parsedCmd, 'timeout', None)
        if self.timeout is None or self.timeout <= 0:
            self.timeout = self.TIMEOUT

        if self.numProcesses > 1:
            if not TaskClass.canMultiprocess:
                self.log.warn("This task does not support multiprocessing; using one process")
                self.numProcesses = 1

    def prepareForMultiProcessing(self):
        # NOTE(review): this `def` line was missing from the extracted listing
        # (its docstring and body were present but orphaned); restored so the
        # class is syntactically complete, matching the documented contract.
        """Prepare this instance for multiprocessing

        Optional non-picklable elements are removed.

        This is only called if the task is run under multiprocessing.
        """
        self.log = None

    def run(self, parsedCmd):
        """!Run the task on all targets.

        The task is run under multiprocessing if numProcesses > 1; otherwise
        processing is serial.

        @return a list of results returned by TaskRunner.\\_\\_call\\_\\_, or an
        empty list if TaskRunner.\\_\\_call\\_\\_ is not called (e.g. if
        TaskRunner.precall returns `False`). See TaskRunner.\\_\\_call\\_\\_
        for details.
        """
        resultList = []
        if self.numProcesses > 1:
            disableImplicitThreading()  # To prevent thread contention
            import multiprocessing
            # Jettison non-picklable state (e.g. the log) before the pool
            # pickles this instance; this call was dropped in the extracted
            # listing and has been restored.
            self.prepareForMultiProcessing()
            pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=1)
            mapFunc = functools.partial(_runPool, pool, self.timeout)
        else:
            pool = None
            mapFunc = map

        if self.precall(parsedCmd):
            profileName = parsedCmd.profile if hasattr(parsedCmd, "profile") else None
            log = parsedCmd.log
            targetList = self.getTargetList(parsedCmd)
            if len(targetList) > 0:
                with profile(profileName, log):
                    # Run the task using self.__call__
                    resultList = list(mapFunc(self, targetList))
            else:
                log.warn("Not running the task because there is no data to process; "
                         "you may preview data using \"--show data\"")

        if pool is not None:
            pool.close()
            pool.join()

        return resultList

    @staticmethod
    def getTargetList(parsedCmd, **kwargs):
        """!Return a list of (dataRef, kwargs) for TaskRunner.\\_\\_call\\_\\_.

        @param parsedCmd the parsed command object (an argparse.Namespace)
            returned by \\ref argumentParser.ArgumentParser.parse_args
            "ArgumentParser.parse_args".
        @param **kwargs any additional keyword arguments. In the default
            TaskRunner this is an empty dict, but having it simplifies
            overriding TaskRunner for tasks whose run method takes additional
            arguments (see case (1) below).

        The default implementation of TaskRunner.getTargetList and
        TaskRunner.\\_\\_call\\_\\_ works for any command-line task whose run
        method takes exactly one argument: a data reference. Otherwise you
        must provide a variant of TaskRunner that overrides
        TaskRunner.getTargetList and possibly TaskRunner.\\_\\_call\\_\\_.
        There are two cases:

        (1) If your command-line task has a `run` method that takes one data
        reference followed by additional arguments, then you need only
        override TaskRunner.getTargetList to return the additional arguments
        as an argument dict. To make this easier, your overridden version of
        getTargetList may call TaskRunner.getTargetList with the extra
        arguments as keyword arguments. For example, the following adds an
        argument dict containing a single key: "calExpList", whose value is
        the list of data IDs for the calexp ID argument:

        \\code
        \\@staticmethod
        def getTargetList(parsedCmd):
            return TaskRunner.getTargetList(
                parsedCmd,
                calExpList=parsedCmd.calexp.idList
            )
        \\endcode

        It is equivalent to this slightly longer version:

        \\code
        \\@staticmethod
        def getTargetList(parsedCmd):
            argDict = dict(calExpList=parsedCmd.calexp.idList)
            return [(dataId, argDict) for dataId in parsedCmd.id.idList]
        \\endcode

        (2) If your task does not meet condition (1) then you must override
        both TaskRunner.getTargetList and TaskRunner.\\_\\_call\\_\\_. You may do
        this however you see fit, so long as TaskRunner.getTargetList
        returns a list, each of whose elements is sent to
        TaskRunner.\\_\\_call\\_\\_, which runs your task.
        """
        return [(ref, kwargs) for ref in parsedCmd.id.refList]

    def makeTask(self, parsedCmd=None, args=None):
        """!Create a Task instance

        @param[in] parsedCmd parsed command-line options (used for extra
            task args by some task runners)
        @param[in] args args tuple passed to TaskRunner.\\_\\_call\\_\\_
            (used for extra task arguments by some task runners)

        makeTask() can be called with either the 'parsedCmd' argument or
        'args' argument set to None, but it must construct identical Task
        instances in either case.

        Subclasses may ignore this method entirely if they reimplement
        both TaskRunner.precall and TaskRunner.\\_\\_call\\_\\_
        """
        return self.TaskClass(config=self.config, log=self.log)

    def _precallImpl(self, task, parsedCmd):
        """The main work of 'precall'

        We write package versions, schemas and configs, or compare these to
        existing files on disk if present.
        """
        if not parsedCmd.noVersions:
            task.writePackageVersions(parsedCmd.butler, clobber=parsedCmd.clobberVersions)
        task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)
        task.writeSchemas(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)

    def precall(self, parsedCmd):
        """Hook for code that should run exactly once, before multiprocessing

        Must return True if TaskRunner.\\_\\_call\\_\\_ should subsequently be
        called.

        @warning Implementations must take care to ensure that no unpicklable
        attributes are added to the TaskRunner itself, for compatibility
        with multiprocessing.

        The default implementation writes package versions, schemas and
        configs, or compares them to existing files on disk if present.
        """
        task = self.makeTask(parsedCmd=parsedCmd)

        if self.doRaise:
            self._precallImpl(task, parsedCmd)
        else:
            try:
                self._precallImpl(task, parsedCmd)
            except Exception as e:
                task.log.fatal("Failed in task initialization: %s", e)
                if not isinstance(e, TaskError):
                    traceback.print_exc(file=sys.stderr)
                return False
        return True

    def __call__(self, args):
        """!Run the Task on a single target.

        This default implementation assumes that the 'args' is a tuple
        containing a data reference and a dict of keyword arguments.

        @warning if you override this method and wish to return something
        when doReturnResults is false, then it must be picklable to support
        multiprocessing and it should be small enough that pickling and
        unpickling do not add excessive overhead.

        @param args Arguments for Task.run()

        @return:
        - None if doReturnResults false
        - A pipe_base Struct containing these fields if doReturnResults true:
            - dataRef: the provided data reference
            - metadata: task metadata after execution of run
            - result: result returned by task run, or None if the task fails
        """
        dataRef, kwargs = args
        if self.log is None:
            self.log = Log.getDefaultLogger()
        if hasattr(dataRef, "dataId"):
            self.log.MDC("LABEL", str(dataRef.dataId))
        elif isinstance(dataRef, (list, tuple)):
            self.log.MDC("LABEL", str([ref.dataId for ref in dataRef if hasattr(ref, "dataId")]))
        task = self.makeTask(args=args)
        result = None  # in case the task fails
        exitStatus = 0  # exit status for the shell
        if self.doRaise:
            result = task.run(dataRef, **kwargs)
        else:
            try:
                result = task.run(dataRef, **kwargs)
            except Exception as e:
                exitStatus = 1  # n.b. The shell exit value is the number of dataRefs returning
                # non-zero, so the actual value used here is lost

                # don't use a try block as we need to preserve the original exception
                if hasattr(dataRef, "dataId"):
                    task.log.fatal("Failed on dataId=%s: %s", dataRef.dataId, e)
                elif isinstance(dataRef, (list, tuple)):
                    task.log.fatal("Failed on dataId=[%s]: %s",
                                   ", ".join(str(ref.dataId) for ref in dataRef), e)
                else:
                    task.log.fatal("Failed on dataRef=%s: %s", dataRef, e)

                if not isinstance(e, TaskError):
                    traceback.print_exc(file=sys.stderr)
        task.writeMetadata(dataRef)

        # remove MDC so it does not show up outside of task context
        self.log.MDCRemove("LABEL")

        if self.doReturnResults:
            return Struct(
                exitStatus=exitStatus,
                dataRef=dataRef,
                metadata=task.metadata,
                result=result,
            )
        else:
            return Struct(
                exitStatus=exitStatus,
            )
413 
414 
416  """!A TaskRunner for CmdLineTasks that require a 'butler' keyword argument to be passed to
417  their constructor.
418  """
419 
420  def makeTask(self, parsedCmd=None, args=None):
421  """!A variant of the base version that passes a butler argument to the task's constructor
422 
423  @param[in] parsedCmd parsed command-line options, as returned by the argument parser;
424  if specified then args is ignored
425  @param[in] args other arguments; if parsedCmd is None then this must be specified
426 
427  @throw RuntimeError if parsedCmd and args are both None
428  """
429  if parsedCmd is not None:
430  butler = parsedCmd.butler
431  elif args is not None:
432  dataRef, kwargs = args
433  butler = dataRef.butlerSubset.butler
434  else:
435  raise RuntimeError("parsedCmd or args must be specified")
436  return self.TaskClass(config=self.config, log=self.log, butler=butler)
437 
438 
class CmdLineTask(Task):
    """!Base class for command-line tasks: tasks that may be executed from the command line

    See \\ref pipeBase_introduction "pipe_base introduction" to learn what tasks are,
    and \\ref pipeTasks_writeCmdLineTask "how to write a command-line task" for more information
    about writing command-line tasks.
    If the second link is broken (as it will be before the documentation is cross-linked)
    then look at the main page of pipe_tasks documentation for a link.

    Subclasses must specify the following class variables:
    * ConfigClass: configuration class for your task (a subclass of \\ref lsst.pex.config.config.Config
      "lsst.pex.config.Config", or if your task needs no configuration, then
      \\ref lsst.pex.config.config.Config "lsst.pex.config.Config" itself)
    * _DefaultName: default name used for this task (a str)

    Subclasses may also specify the following class variables:
    * RunnerClass: a task runner class. The default is TaskRunner, which works for any task
      with a run method that takes exactly one argument: a data reference. If your task does
      not meet this requirement then you must supply a variant of TaskRunner; see TaskRunner
      for more information.
    * canMultiprocess: the default is True; set False if your task does not support multiprocessing.

    Subclasses must specify a method named "run":
    - By default `run` accepts a single butler data reference, but you can specify an alternate task runner
      (subclass of TaskRunner) as the value of class variable `RunnerClass` if your run method needs
      something else.
    - `run` is expected to return its data in a Struct. This provides safety for evolution of the task
      since new values may be added without harming existing code.
    - The data returned by `run` must be picklable if your task is to support multiprocessing.
    """
    # Default runner; override in subclasses whose run() signature differs
    RunnerClass = TaskRunner
    # Set False in subclasses that cannot be pickled for multiprocessing
    canMultiprocess = True

    @classmethod
    def applyOverrides(cls, config):
        """!A hook to allow a task to change the values of its config *after* the camera-specific
        overrides are loaded but before any command-line overrides are applied.

        This is necessary in some cases because the camera-specific overrides may retarget subtasks,
        wiping out changes made in ConfigClass.setDefaults. See LSST Trac ticket #2282 for more discussion.

        @warning This is called by CmdLineTask.parseAndRun; other ways of constructing a config
        will not apply these overrides.

        @param[in] cls the class object
        @param[in] config task configuration (an instance of cls.ConfigClass)
        """
        pass

    @classmethod
    def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False):
        """!Parse an argument list and run the command

        Calling this method with no arguments specified is the standard way to run a command-line task
        from the command line. For an example see pipe_tasks `bin/makeSkyMap.py` or almost any other
        file in that directory.

        @param cls the class object
        @param args list of command-line arguments; if `None` use sys.argv
        @param config config for task (instance of pex_config Config); if `None` use cls.ConfigClass()
        @param log log (instance of lsst.log.Log); if `None` use the default log
        @param doReturnResults Return the collected results from each invocation of the task?
            This is only intended for unit tests and similar use.
            It can easily exhaust memory (if the task returns enough data and you call it enough times)
            and it will fail when using multiprocessing if the returned data cannot be pickled.

        @return a Struct containing:
        - argumentParser: the argument parser
        - parsedCmd: the parsed command returned by the argument parser's parse_args method
        - taskRunner: the task runner used to run the task (an instance of cls.RunnerClass)
        - resultList: results returned by the task runner's run method, one entry per invocation.
            This will typically be a list of `None` unless doReturnResults is `True`;
            see cls.RunnerClass (TaskRunner by default) for more information.

        If one or more of the dataIds fails then this routine will exit (with a status giving the
        number of failed dataIds) rather than returning this struct; this behaviour can be
        overridden by specifying the --noExit option.
        """
        if args is None:
            # Invoked from the shell: record the full command line for the log
            commandAsStr = " ".join(sys.argv)
            args = sys.argv[1:]
        else:
            # Invoked programmatically: record the caller and the args instead
            commandAsStr = "{}{}".format(lsst.utils.get_caller_name(skip=1), tuple(args))

        argumentParser = cls._makeArgumentParser()
        if config is None:
            config = cls.ConfigClass()
        parsedCmd = argumentParser.parse_args(config=config, args=args, log=log, override=cls.applyOverrides)
        # print this message after parsing the command so the log is fully configured
        parsedCmd.log.info("Running: %s", commandAsStr)

        taskRunner = cls.RunnerClass(TaskClass=cls, parsedCmd=parsedCmd, doReturnResults=doReturnResults)
        resultList = taskRunner.run(parsedCmd)

        try:
            # Each result is expected to carry an exitStatus member; count failures
            nFailed = sum(((res.exitStatus != 0) for res in resultList))
        except Exception as e:
            # A custom runner may return results without exitStatus; best-effort
            parsedCmd.log.warn("Unable to retrieve exit status (%s); assuming success", e)
            nFailed = 0

        if nFailed > 0:
            if parsedCmd.noExit:
                parsedCmd.log.warn("%d dataRefs failed; not exiting as --noExit was set", nFailed)
            else:
                # Exit status communicates the number of failed dataRefs to the shell
                sys.exit(nFailed)

        return Struct(
            argumentParser=argumentParser,
            parsedCmd=parsedCmd,
            taskRunner=taskRunner,
            resultList=resultList,
        )

    @classmethod
    def _makeArgumentParser(cls):
        """!Create and return an argument parser

        @param[in] cls the class object
        @return the argument parser for this task.

        By default this returns an ArgumentParser with one ID argument named `--id` of dataset type "raw".

        Your task subclass may need to override this method to change the dataset type or data ref level,
        or to add additional data ID arguments. If you add additional data ID arguments or your task's
        run method takes more than a single data reference then you will also have to provide a task-specific
        task runner (see TaskRunner for more information).
        """
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument(name="--id", datasetType="raw",
                               help="data IDs, e.g. --id visit=12345 ccd=1,2^0,3")
        return parser

    def writeConfig(self, butler, clobber=False, doBackup=True):
        """!Write the configuration used for processing the data, or check that an existing
        one is equal to the new one if present.

        @param[in] butler data butler used to write the config.
            The config is written to dataset type self._getConfigName()
        @param[in] clobber a boolean flag that controls what happens if a config already has been saved:
            - True: overwrite or rename the existing config, depending on `doBackup`
            - False: raise TaskError if this config does not match the existing config
        @param[in] doBackup if clobbering, should we backup the old files?
        """
        configName = self._getConfigName()
        if configName is None:
            # This task does not persist its config
            return
        if clobber:
            butler.put(self.config, configName, doBackup=doBackup)
        elif butler.datasetExists(configName):
            # this may be subject to a race condition; see #2789
            try:
                oldConfig = butler.get(configName, immediate=True)
            except Exception as exc:
                # Re-raise with the same type but a more helpful message
                raise type(exc)("Unable to read stored config file %s (%s); consider using --clobber-config" %
                                (configName, exc))

            def logConfigMismatch(msg):
                # Callback used by Config.compare to report each difference
                self.log.fatal("Comparing configuration: %s", msg)

            if not self.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
                raise TaskError(
                    ("Config does not match existing task config %r on disk; tasks configurations " +
                     "must be consistent within the same output repo (override with --clobber-config)") %
                    (configName,))
        else:
            butler.put(self.config, configName)

    def writeSchemas(self, butler, clobber=False, doBackup=True):
        """!Write the schemas returned by \\ref task.Task.getAllSchemaCatalogs "getAllSchemaCatalogs"

        @param[in] butler data butler used to write the schema.
            Each schema is written to the dataset type specified as the key in the dict returned by
            \\ref task.Task.getAllSchemaCatalogs "getAllSchemaCatalogs".
        @param[in] clobber a boolean flag that controls what happens if a schema already has been saved:
            - True: overwrite or rename the existing schema, depending on `doBackup`
            - False: raise TaskError if this schema does not match the existing schema
        @param[in] doBackup if clobbering, should we backup the old files?

        @warning if clobber is False and an existing schema does not match a current schema,
        then some schemas may have been saved successfully and others may not, and there is no easy way to
        tell which is which.
        """
        for dataset, catalog in self.getAllSchemaCatalogs().items():
            schemaDataset = dataset + "_schema"
            if clobber:
                butler.put(catalog, schemaDataset, doBackup=doBackup)
            elif butler.datasetExists(schemaDataset):
                oldSchema = butler.get(schemaDataset, immediate=True).getSchema()
                if not oldSchema.compare(catalog.getSchema(), afwTable.Schema.IDENTICAL):
                    raise TaskError(
                        ("New schema does not match schema %r on disk; schemas must be " +
                         " consistent within the same output repo (override with --clobber-config)") %
                        (dataset,))
            else:
                butler.put(catalog, schemaDataset)

    def writeMetadata(self, dataRef):
        """!Write the metadata produced from processing the data

        @param[in] dataRef butler data reference used to write the metadata.
            The metadata is written to dataset type self._getMetadataName()
        """
        try:
            metadataName = self._getMetadataName()
            if metadataName is not None:
                dataRef.put(self.getFullMetadata(), metadataName)
        except Exception as e:
            # Metadata persistence is best-effort: warn but do not fail the task
            self.log.warn("Could not persist metadata for dataId=%s: %s", dataRef.dataId, e)

    def writePackageVersions(self, butler, clobber=False, doBackup=True, dataset="packages"):
        """!Compare and write package versions

        We retrieve the persisted list of packages and compare with what we're currently using.
        We raise TaskError if there's a version mismatch.

        Note that this operation is subject to a race condition.

        @param[in] butler data butler used to read/write the package versions
        @param[in] clobber a boolean flag that controls what happens if versions already have been saved:
            - True: overwrite or rename the existing version info, depending on `doBackup`
            - False: raise TaskError if this version info does not match the existing
        @param[in] doBackup if clobbering, should we backup the old files?
        @param[in] dataset name of dataset to read/write
        """
        packages = Packages.fromSystem()

        if clobber:
            return butler.put(packages, dataset, doBackup=doBackup)
        if not butler.datasetExists(dataset):
            # First write: nothing to compare against
            return butler.put(packages, dataset)

        try:
            old = butler.get(dataset, immediate=True)
        except Exception as exc:
            # Re-raise with the same type but a more helpful message
            raise type(exc)("Unable to read stored version dataset %s (%s); "
                            "consider using --clobber-versions or --no-versions" %
                            (dataset, exc))
        # Note that because we can only detect python modules that have been imported, the stored
        # list of products may be more or less complete than what we have now. What's important is
        # that the products that are in common have the same version.
        diff = packages.difference(old)
        if diff:
            raise TaskError(
                "Version mismatch (" +
                "; ".join("%s: %s vs %s" % (pkg, diff[pkg][1], diff[pkg][0]) for pkg in diff) +
                "); consider using --clobber-versions or --no-versions")
        # Update the old set of packages in case we have more packages that haven't been persisted.
        extra = packages.extra(old)
        if extra:
            old.update(packages)
            butler.put(old, dataset, doBackup=doBackup)

    def _getConfigName(self):
        """!Return the name of the config dataset type, or None if config is not to be persisted

        @note The name may depend on the config; that is why this is not a class method.
        """
        return self._DefaultName + "_config"

    def _getMetadataName(self):
        """!Return the name of the metadata dataset type, or None if metadata is not to be persisted

        @note The name may depend on the config; that is why this is not a class method.
        """
        return self._DefaultName + "_metadata"
def __init__
Construct a TaskRunner.
Definition: cmdLineTask.py:145
def makeTask
Create a Task instance.
Definition: cmdLineTask.py:292
def writePackageVersions
Compare and write package versions.
Definition: cmdLineTask.py:648
def makeTask
A variant of the base version that passes a butler argument to the task's constructor.
Definition: cmdLineTask.py:420
def run
Run the task on all targets.
Definition: cmdLineTask.py:198
def __call__
Run the Task on a single target.
Definition: cmdLineTask.py:347
def writeConfig
Write the configuration used for processing the data, or check that an existing one is equal to the n...
Definition: cmdLineTask.py:571
def _getConfigName
Return the name of the config dataset type, or None if config is not to be persisted.
Definition: cmdLineTask.py:691
def applyOverrides
A hook to allow a task to change the values of its config after the camera-specific overrides are loa...
Definition: cmdLineTask.py:473
def parseAndRun
Parse an argument list and run the command.
Definition: cmdLineTask.py:489
def _getMetadataName
Return the name of the metadata dataset type, or None if metadata is not to be persisted.
Definition: cmdLineTask.py:698
def writeMetadata
Write the metadata produced from processing the data.
Definition: cmdLineTask.py:635
def getTargetList
Return a list of (dataRef, kwargs) for TaskRunner.
Definition: cmdLineTask.py:239
A TaskRunner for CmdLineTasks that require a 'butler' keyword argument to be passed to their construc...
Definition: cmdLineTask.py:415
def profile
Context manager for profiling with cProfile.
Definition: cmdLineTask.py:74
def writeSchemas
Write the schemas returned by getAllSchemaCatalogs.
Definition: cmdLineTask.py:606
Base class for command-line tasks: tasks that may be executed from the command line.
Definition: cmdLineTask.py:439