Coverage for python/lsst/pipe/base/cmdLineTask.py: 15%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#
2# LSST Data Management System
3# Copyright 2008-2015 AURA/LSST.
4#
5# This product includes software developed by the
6# LSST Project (http://www.lsst.org/).
7#
8# This program is free software: you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation, either version 3 of the License, or
11# (at your option) any later version.
12#
13# This program is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16# GNU General Public License for more details.
17#
18# You should have received a copy of the LSST License Statement and
19# the GNU General Public License along with this program. If not,
20# see <https://www.lsstcorp.org/LegalNotices/>.
21#
22__all__ = ["CmdLineTask", "TaskRunner", "ButlerInitializedTaskRunner", "LegacyTaskRunner"]
24import sys
25import traceback
26import functools
27import contextlib
29import lsst.log
30import lsst.utils
31from lsst.base import disableImplicitThreading
32import lsst.afw.table as afwTable
33from .task import Task, TaskError
34from .struct import Struct
35from .argumentParser import ArgumentParser
36from .task_logging import getTaskLogger
37from lsst.base import Packages
40def _runPool(pool, timeout, function, iterable):
41 """Wrapper around ``pool.map_async``, to handle timeout
43 This is required so as to trigger an immediate interrupt on the
44 KeyboardInterrupt (Ctrl-C); see
45 http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool
46 """
47 return pool.map_async(function, iterable).get(timeout)
@contextlib.contextmanager
def profile(filename, log=None):
    """Context manager for profiling with cProfile.

    Parameters
    ----------
    filename : `str`
        Filename to which to write profile (profiling disabled if `None` or
        empty).
    log : `logging.Logger`, optional
        Log object for logging the profile operations.

    If profiling is enabled, the context manager returns the cProfile.Profile
    object (otherwise it returns None), which allows additional control over
    profiling. You can obtain this using the "as" clause, e.g.:

    .. code-block:: python

        with profile(filename) as prof:
            runYourCodeHere()

    The profile is disabled and the stats are written to ``filename`` even if
    the managed code raises, so a failing run still leaves usable profiling
    output behind.

    The output cumulative profile can be printed with a command-line like:

    .. code-block:: bash

        python -c 'import pstats; \
            pstats.Stats("<filename>").sort_stats("cumtime").print_stats(30)'
    """
    if not filename:
        # Nothing to do: profiling disabled, yield a placeholder.
        yield
        return
    from cProfile import Profile
    # Use a name distinct from this function so we don't shadow `profile`.
    profiler = Profile()
    if log is not None:
        log.info("Enabling cProfile profiling")
    profiler.enable()
    try:
        yield profiler
    finally:
        # Always stop profiling and persist the stats, even if the body of
        # the `with` block raised; otherwise the profiler would stay enabled
        # and the collected data would be lost.
        profiler.disable()
        profiler.dump_stats(filename)
        if log is not None:
            log.info("cProfile stats written to %s", filename)
class TaskRunner:
    """Run a command-line task, using `multiprocessing` if requested.

    Parameters
    ----------
    TaskClass : `lsst.pipe.base.Task` subclass
        The class of the task to run.
    parsedCmd : `argparse.Namespace`
        The parsed command-line arguments, as returned by the task's argument
        parser's `~lsst.pipe.base.ArgumentParser.parse_args` method.

        .. warning::

           Do not store ``parsedCmd``, as this instance is pickled (if
           multiprocessing) and parsedCmd may contain non-picklable elements.
           It certainly contains more data than we need to send to each
           instance of the task.
    doReturnResults : `bool`, optional
        Should run return the collected result from each invocation of the
        task? This is only intended for unit tests and similar use. It can
        easily exhaust memory (if the task returns enough data and you call it
        enough times) and it will fail when using multiprocessing if the
        returned data cannot be pickled.

        Note that even if ``doReturnResults`` is False a struct with a single
        member "exitStatus" is returned, with value 0 or 1 to be returned to
        the unix shell.

    Raises
    ------
    ImportError
        Raised if multiprocessing is requested (and the task supports it) but
        the multiprocessing library cannot be imported.

    Notes
    -----
    Each command-line task (subclass of `lsst.pipe.base.CmdLineTask`) has a
    task runner. By default it is this class, but some tasks require a
    subclass. See the manual :ref:`creating-a-command-line-task` for more
    information. See `CmdLineTask.parseAndRun` to see how a task runner is
    used.

    You may use this task runner for your command-line task if your task has a
    ``runDataRef`` method that takes exactly one argument: a butler data
    reference. Otherwise you must provide a task-specific subclass of
    this runner for your task's ``RunnerClass`` that overrides
    `TaskRunner.getTargetList` and possibly
    `TaskRunner.__call__`. See `TaskRunner.getTargetList` for details.

    This design matches the common pattern for command-line tasks: the
    ``runDataRef`` method takes a single data reference, of some suitable name.
    Additional arguments are rare, and if present, require a subclass of
    `TaskRunner` that calls these additional arguments by name.

    Instances of this class must be picklable in order to be compatible with
    multiprocessing. If multiprocessing is requested
    (``parsedCmd.numProcesses > 1``) then `runDataRef` calls
    `prepareForMultiProcessing` to jettison optional non-picklable elements.
    If your task runner is not compatible with multiprocessing then indicate
    this in your task by setting class variable ``canMultiprocess=False``.

    Due to a `python bug`__, handling a `KeyboardInterrupt` properly `requires
    specifying a timeout`__. This timeout (in sec) can be specified as the
    ``timeout`` element in the output from `~lsst.pipe.base.ArgumentParser`
    (the ``parsedCmd``), if available, otherwise we use `TaskRunner.TIMEOUT`.

    By default, we disable "implicit" threading -- ie, as provided by
    underlying numerical libraries such as MKL or BLAS. This is designed to
    avoid thread contention both when a single command line task spawns
    multiple processes and when multiple users are running on a shared system.
    Users can override this behaviour by setting the
    ``LSST_ALLOW_IMPLICIT_THREADS`` environment variable.

    .. __: http://bugs.python.org/issue8296
    .. __: http://stackoverflow.com/questions/1408356/
    """

    TIMEOUT = 3600*24*30
    """Default timeout (seconds) for multiprocessing."""

    def __init__(self, TaskClass, parsedCmd, doReturnResults=False):
        self.TaskClass = TaskClass
        self.doReturnResults = bool(doReturnResults)
        self.config = parsedCmd.config
        self.log = parsedCmd.log
        self.doRaise = bool(parsedCmd.doraise)
        self.clobberConfig = bool(parsedCmd.clobberConfig)
        self.doBackup = not bool(parsedCmd.noBackupConfig)
        self.numProcesses = int(getattr(parsedCmd, 'processes', 1))

        # A missing, None, or non-positive timeout falls back to the class
        # default TIMEOUT.
        self.timeout = getattr(parsedCmd, 'timeout', None)
        if self.timeout is None or self.timeout <= 0:
            self.timeout = self.TIMEOUT

        # Silently degrade to serial processing if the task opted out of
        # multiprocessing via canMultiprocess=False.
        if self.numProcesses > 1:
            if not TaskClass.canMultiprocess:
                self.log.warning("This task does not support multiprocessing; using one process")
                self.numProcesses = 1

    def prepareForMultiProcessing(self):
        """Prepare this instance for multiprocessing

        Optional non-picklable elements are removed.

        This is only called if the task is run under multiprocessing.
        """
        # The log object is the only optional non-picklable attribute; it is
        # recreated lazily in __call__ after unpickling in the worker.
        self.log = None

    def run(self, parsedCmd):
        """Run the task on all targets.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            Parsed command `argparse.Namespace`.

        Returns
        -------
        resultList : `list`
            A list of results returned by `TaskRunner.__call__`, or an empty
            list if `TaskRunner.__call__` is not called (e.g. if
            `TaskRunner.precall` returns `False`). See `TaskRunner.__call__`
            for details.

        Notes
        -----
        The task is run under multiprocessing if `TaskRunner.numProcesses`
        is more than 1; otherwise processing is serial.
        """
        resultList = []
        disableImplicitThreading()  # To prevent thread contention
        if self.numProcesses > 1:
            import multiprocessing
            self.prepareForMultiProcessing()
            # maxtasksperchild=1: each worker process is replaced after
            # handling a single target.
            pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=1)
            mapFunc = functools.partial(_runPool, pool, self.timeout)
        else:
            pool = None
            mapFunc = map

        if self.precall(parsedCmd):
            profileName = parsedCmd.profile if hasattr(parsedCmd, "profile") else None
            log = parsedCmd.log
            targetList = self.getTargetList(parsedCmd)
            if len(targetList) > 0:
                with profile(profileName, log):
                    # Run the task using self.__call__
                    resultList = list(mapFunc(self, targetList))
            else:
                log.warning("Not running the task because there is no data to process; "
                            "you may preview data using \"--show data\"")

        if pool is not None:
            pool.close()
            pool.join()

        return resultList

    @staticmethod
    def getTargetList(parsedCmd, **kwargs):
        """Get a list of (dataRef, kwargs) for `TaskRunner.__call__`.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            The parsed command object returned by
            `lsst.pipe.base.argumentParser.ArgumentParser.parse_args`.
        kwargs
            Any additional keyword arguments. In the default `TaskRunner` this
            is an empty dict, but having it simplifies overriding `TaskRunner`
            for tasks whose runDataRef method takes additional arguments
            (see case (1) below).

        Notes
        -----
        The default implementation of `TaskRunner.getTargetList` and
        `TaskRunner.__call__` works for any command-line task whose
        ``runDataRef`` method takes exactly one argument: a data reference.
        Otherwise you must provide a variant of TaskRunner that overrides
        `TaskRunner.getTargetList` and possibly `TaskRunner.__call__`.
        There are two cases.

        **Case 1**

        If your command-line task has a ``runDataRef`` method that takes one
        data reference followed by additional arguments, then you need only
        override `TaskRunner.getTargetList` to return the additional
        arguments as an argument dict. To make this easier, your overridden
        version of `~TaskRunner.getTargetList` may call
        `TaskRunner.getTargetList` with the extra arguments as keyword
        arguments. For example, the following adds an argument dict containing
        a single key: "calExpList", whose value is the list of data IDs for
        the calexp ID argument:

        .. code-block:: python

            def getTargetList(parsedCmd):
                return TaskRunner.getTargetList(
                    parsedCmd,
                    calExpList=parsedCmd.calexp.idList
                )

        It is equivalent to this slightly longer version:

        .. code-block:: python

            @staticmethod
            def getTargetList(parsedCmd):
                argDict = dict(calExpList=parsedCmd.calexp.idList)
                return [(dataId, argDict) for dataId in parsedCmd.id.idList]

        **Case 2**

        If your task does not meet condition (1) then you must override both
        TaskRunner.getTargetList and `TaskRunner.__call__`. You may do this
        however you see fit, so long as `TaskRunner.getTargetList`
        returns a list, each of whose elements is sent to
        `TaskRunner.__call__`, which runs your task.
        """
        # Note: all targets share the same kwargs dict; callers must not
        # mutate it per-target.
        return [(ref, kwargs) for ref in parsedCmd.id.refList]

    def makeTask(self, parsedCmd=None, args=None):
        """Create a Task instance.

        Parameters
        ----------
        parsedCmd
            Parsed command-line options (used for extra task args by some task
            runners).
        args
            Args tuple passed to `TaskRunner.__call__` (used for extra task
            arguments by some task runners).

        Returns
        -------
        task : ``self.TaskClass``
            A new task instance built from the stored config and log.

        Notes
        -----
        ``makeTask`` can be called with either the ``parsedCmd`` argument or
        ``args`` argument set to None, but it must construct identical Task
        instances in either case.

        Subclasses may ignore this method entirely if they reimplement both
        `TaskRunner.precall` and `TaskRunner.__call__`.
        """
        return self.TaskClass(config=self.config, log=self.log)

    def _precallImpl(self, task, parsedCmd):
        """The main work of `precall`.

        We write package versions, schemas and configs, or compare these to
        existing files on disk if present.

        Parameters
        ----------
        task : `lsst.pipe.base.CmdLineTask`
            The task whose config/schemas/versions are persisted.
        parsedCmd : `argparse.Namespace`
            Parsed command; supplies the butler and clobber flags.
        """
        if not parsedCmd.noVersions:
            task.writePackageVersions(parsedCmd.butler, clobber=parsedCmd.clobberVersions)
        task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)
        task.writeSchemas(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)

    def precall(self, parsedCmd):
        """Hook for code that should run exactly once, before multiprocessing.

        Notes
        -----
        Must return True if `TaskRunner.__call__` should subsequently be
        called.

        .. warning::

           Implementations must take care to ensure that no unpicklable
           attributes are added to the TaskRunner itself, for compatibility
           with multiprocessing.

        The default implementation writes package versions, schemas and
        configs, or compares them to existing files on disk if present.
        """
        task = self.makeTask(parsedCmd=parsedCmd)

        if self.doRaise:
            # --doraise: let any failure propagate to the caller unchanged.
            self._precallImpl(task, parsedCmd)
        else:
            try:
                self._precallImpl(task, parsedCmd)
            except Exception as e:
                task.log.fatal("Failed in task initialization: %s", e)
                if not isinstance(e, TaskError):
                    traceback.print_exc(file=sys.stderr)
                return False
        return True

    def __call__(self, args):
        """Run the Task on a single target.

        Parameters
        ----------
        args
            Arguments for Task.runDataRef()

        Returns
        -------
        struct : `lsst.pipe.base.Struct`
            Contains these fields if ``doReturnResults`` is `True`:

            - ``dataRef``: the provided data reference.
            - ``metadata``: task metadata after execution of run.
            - ``result``: result returned by task run, or `None` if the task
              fails.
            - ``exitStatus``: 0 if the task completed successfully, 1
              otherwise.

            If ``doReturnResults`` is `False` the struct contains:

            - ``exitStatus``: 0 if the task completed successfully, 1
              otherwise.

        Notes
        -----
        This default implementation assumes that the ``args`` is a tuple
        containing a data reference and a dict of keyword arguments.

        .. warning::

           If you override this method and wish to return something when
           ``doReturnResults`` is `False`, then it must be picklable to
           support multiprocessing and it should be small enough that pickling
           and unpickling do not add excessive overhead.
        """
        dataRef, kwargs = args
        if self.log is None:
            # Recreate the log jettisoned by prepareForMultiProcessing.
            self.log = getTaskLogger()
        # Tag log records with the data ID(s) being processed.
        if hasattr(dataRef, "dataId"):
            lsst.log.MDC("LABEL", str(dataRef.dataId))
        elif isinstance(dataRef, (list, tuple)):
            lsst.log.MDC("LABEL", str([ref.dataId for ref in dataRef if hasattr(ref, "dataId")]))
        task = self.makeTask(args=args)
        result = None  # in case the task fails
        exitStatus = 0  # exit status for the shell
        if self.doRaise:
            result = self.runTask(task, dataRef, kwargs)
        else:
            try:
                result = self.runTask(task, dataRef, kwargs)
            except Exception as e:
                # The shell exit value will be the number of dataRefs returning
                # non-zero, so the actual value used here is lost.
                exitStatus = 1

                # don't use a try block as we need to preserve the original
                # exception
                eName = type(e).__name__
                if hasattr(dataRef, "dataId"):
                    task.log.fatal("Failed on dataId=%s: %s: %s", dataRef.dataId, eName, e)
                elif isinstance(dataRef, (list, tuple)):
                    task.log.fatal("Failed on dataIds=[%s]: %s: %s",
                                   ", ".join(str(ref.dataId) for ref in dataRef), eName, e)
                else:
                    task.log.fatal("Failed on dataRef=%s: %s: %s", dataRef, eName, e)

                if not isinstance(e, TaskError):
                    traceback.print_exc(file=sys.stderr)

        # Ensure all errors have been logged and aren't hanging around in a
        # buffer
        sys.stdout.flush()
        sys.stderr.flush()

        task.writeMetadata(dataRef)

        # remove MDC so it does not show up outside of task context
        lsst.log.MDCRemove("LABEL")

        if self.doReturnResults:
            return Struct(
                exitStatus=exitStatus,
                dataRef=dataRef,
                metadata=task.metadata,
                result=result,
            )
        else:
            return Struct(
                exitStatus=exitStatus,
            )

    def runTask(self, task, dataRef, kwargs):
        """Make the actual call to `runDataRef` for this task.

        Parameters
        ----------
        task : `lsst.pipe.base.CmdLineTask` class
            The class of the task to run.
        dataRef
            Butler data reference that contains the data the task will process.
        kwargs
            Any additional keyword arguments. See `TaskRunner.getTargetList`
            above.

        Notes
        -----
        The default implementation of `TaskRunner.runTask` works for any
        command-line task which has a ``runDataRef`` method that takes a data
        reference and an optional set of additional keyword arguments.
        This method returns the results generated by the task's `runDataRef`
        method.

        """
        return task.runDataRef(dataRef, **kwargs)
class LegacyTaskRunner(TaskRunner):
    r"""A `TaskRunner` for `CmdLineTask`\ s which calls the `Task`\ 's `run`
    method on a `dataRef` rather than the `runDataRef` method.
    """

    def runTask(self, task, dataRef, kwargs):
        """Invoke the task's ``run`` method (instead of ``runDataRef``) on
        the data reference; see `TaskRunner.runTask` for the general contract.
        """
        runMethod = task.run
        return runMethod(dataRef, **kwargs)
class ButlerInitializedTaskRunner(TaskRunner):
    r"""A `TaskRunner` for `CmdLineTask`\ s that require a ``butler`` keyword
    argument to be passed to their constructor.
    """

    def makeTask(self, parsedCmd=None, args=None):
        """A variant of the base version that passes a butler argument to the
        task's constructor.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            Parsed command-line options, as returned by the
            `~lsst.pipe.base.ArgumentParser`; if specified then args is
            ignored.
        args
            Other arguments; if ``parsedCmd`` is `None` then this must be
            specified.

        Raises
        ------
        RuntimeError
            Raised if ``parsedCmd`` and ``args`` are both `None`.
        """
        # Guard clause first: at least one source of the butler is required.
        if parsedCmd is None and args is None:
            raise RuntimeError("parsedCmd or args must be specified")
        if parsedCmd is not None:
            # parsedCmd takes precedence and carries the butler directly.
            butler = parsedCmd.butler
        else:
            # Recover the butler from the data reference inside the args
            # tuple (the same tuple passed to TaskRunner.__call__).
            dataRef, _ = args
            butler = dataRef.butlerSubset.butler
        return self.TaskClass(config=self.config, log=self.log, butler=butler)
class CmdLineTask(Task):
    """Base class for command-line tasks: tasks that may be executed from the
    command-line.

    Notes
    -----
    See :ref:`task-framework-overview` to learn what tasks are and
    :ref:`creating-a-command-line-task` for more information about writing
    command-line tasks.

    Subclasses must specify the following class variables:

    - ``ConfigClass``: configuration class for your task (a subclass of
      `lsst.pex.config.Config`, or if your task needs no configuration, then
      `lsst.pex.config.Config` itself).
    - ``_DefaultName``: default name used for this task (a `str`).

    Subclasses may also specify the following class variables:

    - ``RunnerClass``: a task runner class. The default is ``TaskRunner``,
      which works for any task with a runDataRef method that takes exactly one
      argument: a data reference. If your task does not meet this requirement
      then you must supply a variant of ``TaskRunner``; see ``TaskRunner``
      for more information.
    - ``canMultiprocess``: the default is `True`; set `False` if your task
      does not support multiprocessing.

    Subclasses must specify a method named ``runDataRef``:

    - By default ``runDataRef`` accepts a single butler data reference, but
      you can specify an alternate task runner (subclass of ``TaskRunner``) as
      the value of class variable ``RunnerClass`` if your run method needs
      something else.
    - ``runDataRef`` is expected to return its data in a
      `lsst.pipe.base.Struct`. This provides safety for evolution of the task
      since new values may be added without harming existing code.
    - The data returned by ``runDataRef`` must be picklable if your task is to
      support multiprocessing.
    """
    RunnerClass = TaskRunner
    canMultiprocess = True

    @classmethod
    def applyOverrides(cls, config):
        """A hook to allow a task to change the values of its config *after*
        the camera-specific overrides are loaded but before any command-line
        overrides are applied.

        Parameters
        ----------
        config : instance of task's ``ConfigClass``
            Task configuration.

        Notes
        -----
        This is necessary in some cases because the camera-specific overrides
        may retarget subtasks, wiping out changes made in
        ConfigClass.setDefaults. See LSST Trac ticket #2282 for more
        discussion.

        .. warning::

           This is called by CmdLineTask.parseAndRun; other ways of
           constructing a config will not apply these overrides.
        """
        pass

    @classmethod
    def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False):
        """Parse an argument list and run the command.

        Parameters
        ----------
        args : `list`, optional
            List of command-line arguments; if `None` use `sys.argv`.
        config : `lsst.pex.config.Config`-type, optional
            Config for task. If `None` use `Task.ConfigClass`.
        log : `logging.Logger`-type, optional
            Log. If `None` use the default log.
        doReturnResults : `bool`, optional
            If `True`, return the results of this task. Default is `False`.
            This is only intended for unit tests and similar use. It can
            easily exhaust memory (if the task returns enough data and you
            call it enough times) and it will fail when using multiprocessing
            if the returned data cannot be pickled.

        Returns
        -------
        struct : `lsst.pipe.base.Struct`
            Fields are:

            ``argumentParser``
                the argument parser (`lsst.pipe.base.ArgumentParser`).
            ``parsedCmd``
                the parsed command returned by the argument parser's
                `~lsst.pipe.base.ArgumentParser.parse_args` method
                (`argparse.Namespace`).
            ``taskRunner``
                the task runner used to run the task (an instance of
                `Task.RunnerClass`).
            ``resultList``
                results returned by the task runner's ``run`` method, one entry
                per invocation (`list`). This will typically be a list of
                `Struct`, each containing at least an ``exitStatus`` integer
                (0 or 1); see `Task.RunnerClass` (`TaskRunner` by default) for
                more details.

        Notes
        -----
        Calling this method with no arguments specified is the standard way to
        run a command-line task from the command-line. For an example see
        ``pipe_tasks`` ``bin/makeSkyMap.py`` or almost any other file in that
        directory.

        If one or more of the dataIds fails then this routine will exit (with
        a status giving the number of failed dataIds) rather than returning
        this struct; this behaviour can be overridden by specifying the
        ``--noExit`` command-line option.
        """
        if args is None:
            commandAsStr = " ".join(sys.argv)
            args = sys.argv[1:]
        else:
            # Called programmatically: record the caller and the args instead
            # of sys.argv so the log reflects how we were actually invoked.
            commandAsStr = "{}{}".format(lsst.utils.get_caller_name(skip=1), tuple(args))

        argumentParser = cls._makeArgumentParser()
        if config is None:
            config = cls.ConfigClass()
        parsedCmd = argumentParser.parse_args(config=config, args=args, log=log, override=cls.applyOverrides)
        # print this message after parsing the command so the log is fully
        # configured
        parsedCmd.log.info("Running: %s", commandAsStr)

        taskRunner = cls.RunnerClass(TaskClass=cls, parsedCmd=parsedCmd, doReturnResults=doReturnResults)
        resultList = taskRunner.run(parsedCmd)

        try:
            nFailed = sum(((res.exitStatus != 0) for res in resultList))
        except (TypeError, AttributeError) as e:
            # NOTE: TypeError if resultList is None, AttributeError if it
            # doesn't have exitStatus.
            parsedCmd.log.warning("Unable to retrieve exit status (%s); assuming success", e)
            nFailed = 0

        if nFailed > 0:
            if parsedCmd.noExit:
                parsedCmd.log.error("%d dataRefs failed; not exiting as --noExit was set", nFailed)
            else:
                sys.exit(nFailed)

        return Struct(
            argumentParser=argumentParser,
            parsedCmd=parsedCmd,
            taskRunner=taskRunner,
            resultList=resultList,
        )

    @classmethod
    def _makeArgumentParser(cls):
        """Create and return an argument parser.

        Returns
        -------
        parser : `lsst.pipe.base.ArgumentParser`
            The argument parser for this task.

        Notes
        -----
        By default this returns an `~lsst.pipe.base.ArgumentParser` with one
        ID argument named `--id` of dataset type ``raw``.

        Your task subclass may need to override this method to change the
        dataset type or data ref level, or to add additional data ID arguments.
        If you add additional data ID arguments or your task's runDataRef
        method takes more than a single data reference then you will also have
        to provide a task-specific task runner (see TaskRunner for more
        information).
        """
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument(name="--id", datasetType="raw",
                               help="data IDs, e.g. --id visit=12345 ccd=1,2^0,3")
        return parser

    def writeConfig(self, butler, clobber=False, doBackup=True):
        """Write the configuration used for processing the data, or check that
        an existing one is equal to the new one if present.

        Parameters
        ----------
        butler : `lsst.daf.persistence.Butler`
            Data butler used to write the config. The config is written to
            dataset type `CmdLineTask._getConfigName`.
        clobber : `bool`, optional
            A boolean flag that controls what happens if a config already has
            been saved:

            - `True`: overwrite or rename the existing config, depending on
              ``doBackup``.
            - `False`: raise `TaskError` if this config does not match the
              existing config.
        doBackup : `bool`, optional
            Set to `True` to backup the config files if clobbering.
        """
        configName = self._getConfigName()
        if configName is None:
            # This task has opted out of config persistence.
            return
        if clobber:
            butler.put(self.config, configName, doBackup=doBackup)
        elif butler.datasetExists(configName, write=True):
            # this may be subject to a race condition; see #2789
            try:
                oldConfig = butler.get(configName, immediate=True)
            except Exception as exc:
                # Interpolate the original error into the message (this used
                # to be the literal text "(exc)" due to a missing f-string
                # brace).
                raise type(exc)(f"Unable to read stored config file {configName} ({exc}); "
                                "consider using --clobber-config")

            def logConfigMismatch(msg):
                self.log.fatal("Comparing configuration: %s", msg)

            if not self.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
                raise TaskError(
                    f"Config does not match existing task config {configName!r} on disk; "
                    "tasks configurations must be consistent within the same output repo "
                    "(override with --clobber-config)")
        else:
            butler.put(self.config, configName)

    def writeSchemas(self, butler, clobber=False, doBackup=True):
        """Write the schemas returned by
        `lsst.pipe.base.Task.getAllSchemaCatalogs`.

        Parameters
        ----------
        butler : `lsst.daf.persistence.Butler`
            Data butler used to write the schema. Each schema is written to the
            dataset type specified as the key in the dict returned by
            `~lsst.pipe.base.Task.getAllSchemaCatalogs`.
        clobber : `bool`, optional
            A boolean flag that controls what happens if a schema already has
            been saved:

            - `True`: overwrite or rename the existing schema, depending on
              ``doBackup``.
            - `False`: raise `TaskError` if this schema does not match the
              existing schema.
        doBackup : `bool`, optional
            Set to `True` to backup the schema files if clobbering.

        Notes
        -----
        If ``clobber`` is `False` and an existing schema does not match a
        current schema, then some schemas may have been saved successfully
        and others may not, and there is no easy way to tell which is which.
        """
        for dataset, catalog in self.getAllSchemaCatalogs().items():
            schemaDataset = dataset + "_schema"
            if clobber:
                butler.put(catalog, schemaDataset, doBackup=doBackup)
            elif butler.datasetExists(schemaDataset, write=True):
                oldSchema = butler.get(schemaDataset, immediate=True).getSchema()
                if not oldSchema.compare(catalog.getSchema(), afwTable.Schema.IDENTICAL):
                    raise TaskError(
                        f"New schema does not match schema {dataset!r} on disk; "
                        "schemas must be consistent within the same output repo "
                        "(override with --clobber-config)")
            else:
                butler.put(catalog, schemaDataset)

    def writeMetadata(self, dataRef):
        """Write the metadata produced from processing the data.

        Parameters
        ----------
        dataRef
            Butler data reference used to write the metadata.
            The metadata is written to dataset type
            `CmdLineTask._getMetadataName`.
        """
        try:
            metadataName = self._getMetadataName()
            if metadataName is not None:
                dataRef.put(self.getFullMetadata(), metadataName)
        except Exception as e:
            # Deliberately best-effort: a metadata-persistence failure must
            # not abort the task run itself.
            self.log.warning("Could not persist metadata for dataId=%s: %s", dataRef.dataId, e)

    def writePackageVersions(self, butler, clobber=False, doBackup=True, dataset="packages"):
        """Compare and write package versions.

        Parameters
        ----------
        butler : `lsst.daf.persistence.Butler`
            Data butler used to read/write the package versions.
        clobber : `bool`, optional
            A boolean flag that controls what happens if versions already have
            been saved:

            - `True`: overwrite or rename the existing version info, depending
              on ``doBackup``.
            - `False`: raise `TaskError` if this version info does not match
              the existing.
        doBackup : `bool`, optional
            If `True` and clobbering, old package version files are backed up.
        dataset : `str`, optional
            Name of dataset to read/write.

        Raises
        ------
        TaskError
            Raised if there is a version mismatch with current and persisted
            lists of package versions.

        Notes
        -----
        Note that this operation is subject to a race condition.
        """
        packages = Packages.fromSystem()

        if clobber:
            return butler.put(packages, dataset, doBackup=doBackup)
        if not butler.datasetExists(dataset, write=True):
            return butler.put(packages, dataset)

        try:
            old = butler.get(dataset, immediate=True)
        except Exception as exc:
            raise type(exc)(f"Unable to read stored version dataset {dataset} ({exc}); "
                            "consider using --clobber-versions or --no-versions")
        # Note that because we can only detect python modules that have been
        # imported, the stored list of products may be more or less complete
        # than what we have now. What's important is that the products that
        # are in common have the same version.
        diff = packages.difference(old)
        if diff:
            versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
            raise TaskError(
                f"Version mismatch ({versions_str}); consider using --clobber-versions or --no-versions")
        # Update the old set of packages in case we have more packages that
        # haven't been persisted.
        extra = packages.extra(old)
        if extra:
            old.update(packages)
            butler.put(old, dataset, doBackup=doBackup)

    def _getConfigName(self):
        """Get the name of the config dataset type, or `None` if config is not
        to be persisted.

        Notes
        -----
        The name may depend on the config; that is why this is not a class
        method.
        """
        return self._DefaultName + "_config"

    def _getMetadataName(self):
        """Get the name of the metadata dataset type, or `None` if metadata is
        not to be persisted.

        Notes
        -----
        The name may depend on the config; that is why this is not a class
        method.
        """
        return self._DefaultName + "_metadata"