Coverage for python/lsst/pipe/base/cmdLineTask.py: 15%
230 statements
« prev ^ index » next — coverage.py v6.5.0, created at 2022-11-06 12:56 -0800
1#
2# LSST Data Management System
3# Copyright 2008-2015 AURA/LSST.
4#
5# This product includes software developed by the
6# LSST Project (http://www.lsst.org/).
7#
8# This program is free software: you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation, either version 3 of the License, or
11# (at your option) any later version.
12#
13# This program is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16# GNU General Public License for more details.
17#
18# You should have received a copy of the LSST License Statement and
19# the GNU General Public License along with this program. If not,
20# see <https://www.lsstcorp.org/LegalNotices/>.
21#
22__all__ = ["CmdLineTask", "TaskRunner", "ButlerInitializedTaskRunner", "LegacyTaskRunner"]
24import contextlib
25import functools
26import sys
27import traceback
29import lsst.afw.table as afwTable
30import lsst.log
31import lsst.utils.introspection
32import lsst.utils.logging
33from lsst.base import Packages, disableImplicitThreading
35from .argumentParser import ArgumentParser
36from .struct import Struct
37from .task import Task, TaskError
40def _runPool(pool, timeout, function, iterable):
41 """Wrapper around ``pool.map_async``, to handle timeout
43 This is required so as to trigger an immediate interrupt on the
44 KeyboardInterrupt (Ctrl-C); see
45 http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool
46 """
47 return pool.map_async(function, iterable).get(timeout)
@contextlib.contextmanager
def profile(filename, log=None):
    """Context manager for profiling with cProfile.

    Parameters
    ----------
    filename : `str`
        Filename to which to write profile (profiling disabled if `None` or
        empty).
    log : `logging.Logger`, optional
        Log object for logging the profile operations.

    If profiling is enabled, the context manager returns the cProfile.Profile
    object (otherwise it returns None), which allows additional control over
    profiling. You can obtain this using the "as" clause, e.g.:

    .. code-block:: python

        with profile(filename) as prof:
            runYourCodeHere()

    The output cumulative profile can be printed with a command-line like:

    .. code-block:: bash

        python -c 'import pstats; \
            pstats.Stats("<filename>").sort_stats("cumtime").print_stats(30)'
    """
    if not filename:
        # Profiling disabled: yield None and do nothing else.
        yield
        return
    # Import locally so the (small) cProfile overhead is only paid when
    # profiling is actually requested.
    from cProfile import Profile

    profile = Profile()
    if log is not None:
        log.info("Enabling cProfile profiling")
    profile.enable()
    try:
        yield profile
    finally:
        # Always stop profiling and write the stats file, even when the
        # profiled code raises; previously an exception propagating through
        # the ``with`` body skipped disable()/dump_stats() entirely and the
        # profile was silently lost.
        profile.disable()
        profile.dump_stats(filename)
        if log is not None:
            log.info("cProfile stats written to %s", filename)
class TaskRunner:
    """Run a command-line task, using `multiprocessing` if requested.

    Parameters
    ----------
    TaskClass : `lsst.pipe.base.Task` subclass
        The class of the task to run.
    parsedCmd : `argparse.Namespace`
        The parsed command-line arguments, as returned by the task's argument
        parser's `~lsst.pipe.base.ArgumentParser.parse_args` method.

        .. warning::

           Do not store ``parsedCmd``, as this instance is pickled (if
           multiprocessing) and parsedCmd may contain non-picklable elements.
           It certainly contains more data than we need to send to each
           instance of the task.
    doReturnResults : `bool`, optional
        Should run return the collected result from each invocation of the
        task? This is only intended for unit tests and similar use. It can
        easily exhaust memory (if the task returns enough data and you call it
        enough times) and it will fail when using multiprocessing if the
        returned data cannot be pickled.

        Note that even if ``doReturnResults`` is False a struct with a single
        member "exitStatus" is returned, with value 0 or 1 to be returned to
        the unix shell.

    Raises
    ------
    ImportError
        Raised if multiprocessing is requested (and the task supports it) but
        the multiprocessing library cannot be imported.

    Notes
    -----
    Each command-line task (subclass of `lsst.pipe.base.CmdLineTask`) has a
    task runner. By default it is this class, but some tasks require a
    subclass. See the manual :ref:`creating-a-command-line-task` for more
    information. See `CmdLineTask.parseAndRun` to see how a task runner is
    used.

    You may use this task runner for your command-line task if your task has a
    ``runDataRef`` method that takes exactly one argument: a butler data
    reference. Otherwise you must provide a task-specific subclass of
    this runner for your task's ``RunnerClass`` that overrides
    `TaskRunner.getTargetList` and possibly
    `TaskRunner.__call__`. See `TaskRunner.getTargetList` for details.

    This design matches the common pattern for command-line tasks: the
    ``runDataRef`` method takes a single data reference, of some suitable name.
    Additional arguments are rare, and if present, require a subclass of
    `TaskRunner` that calls these additional arguments by name.

    Instances of this class must be picklable in order to be compatible with
    multiprocessing. If multiprocessing is requested
    (``parsedCmd.numProcesses > 1``) then `runDataRef` calls
    `prepareForMultiProcessing` to jettison optional non-picklable elements.
    If your task runner is not compatible with multiprocessing then indicate
    this in your task by setting class variable ``canMultiprocess=False``.

    Due to a `python bug`__, handling a `KeyboardInterrupt` properly `requires
    specifying a timeout`__. This timeout (in sec) can be specified as the
    ``timeout`` element in the output from `~lsst.pipe.base.ArgumentParser`
    (the ``parsedCmd``), if available, otherwise we use `TaskRunner.TIMEOUT`.

    By default, we disable "implicit" threading -- ie, as provided by
    underlying numerical libraries such as MKL or BLAS. This is designed to
    avoid thread contention both when a single command line task spawns
    multiple processes and when multiple users are running on a shared system.
    Users can override this behaviour by setting the
    ``LSST_ALLOW_IMPLICIT_THREADS`` environment variable.

    .. __: http://bugs.python.org/issue8296
    .. __: http://stackoverflow.com/questions/1408356/
    """

    TIMEOUT = 3600 * 24 * 30
    """Default timeout (seconds) for multiprocessing."""

    def __init__(self, TaskClass, parsedCmd, doReturnResults=False):
        # Copy only the parsedCmd attributes that are needed later;
        # parsedCmd itself must not be stored (see the class docstring's
        # warning about pickling for multiprocessing).
        self.TaskClass = TaskClass
        self.doReturnResults = bool(doReturnResults)
        self.config = parsedCmd.config
        self.log = parsedCmd.log
        self.doRaise = bool(parsedCmd.doraise)
        self.clobberConfig = bool(parsedCmd.clobberConfig)
        self.doBackup = not bool(parsedCmd.noBackupConfig)
        # "processes" is optional on parsedCmd; default to serial execution.
        self.numProcesses = int(getattr(parsedCmd, "processes", 1))

        # A missing or non-positive timeout falls back to the class default;
        # a timeout is always required (see class docstring on Ctrl-C
        # handling).
        self.timeout = getattr(parsedCmd, "timeout", None)
        if self.timeout is None or self.timeout <= 0:
            self.timeout = self.TIMEOUT

        if self.numProcesses > 1:
            if not TaskClass.canMultiprocess:
                # Degrade gracefully rather than failing: warn and run
                # serially.
                self.log.warning("This task does not support multiprocessing; using one process")
                self.numProcesses = 1

    def prepareForMultiProcessing(self):
        """Prepare this instance for multiprocessing

        Optional non-picklable elements are removed.

        This is only called if the task is run under multiprocessing.
        """
        # The log object is not picklable; __call__ recreates a default
        # logger in the worker process when self.log is None.
        self.log = None

    def run(self, parsedCmd):
        """Run the task on all targets.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            Parsed command `argparse.Namespace`.

        Returns
        -------
        resultList : `list`
            A list of results returned by `TaskRunner.__call__`, or an empty
            list if `TaskRunner.__call__` is not called (e.g. if
            `TaskRunner.precall` returns `False`). See `TaskRunner.__call__`
            for details.

        Notes
        -----
        The task is run under multiprocessing if `TaskRunner.numProcesses`
        is more than 1; otherwise processing is serial.
        """
        resultList = []
        disableImplicitThreading()  # To prevent thread contention
        if self.numProcesses > 1:
            import multiprocessing

            # Must strip non-picklable state *before* the pool is created,
            # since this instance is pickled to each worker.
            self.prepareForMultiProcessing()
            pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=1)
            # _runPool wraps map_async so KeyboardInterrupt is handled
            # promptly (see _runPool and the class docstring).
            mapFunc = functools.partial(_runPool, pool, self.timeout)
        else:
            pool = None
            mapFunc = map

        if self.precall(parsedCmd):
            profileName = parsedCmd.profile if hasattr(parsedCmd, "profile") else None
            log = parsedCmd.log
            targetList = self.getTargetList(parsedCmd)
            if len(targetList) > 0:
                with profile(profileName, log):
                    # Run the task using self.__call__
                    resultList = list(mapFunc(self, targetList))
            else:
                log.warning(
                    "Not running the task because there is no data to process; "
                    'you may preview data using "--show data"'
                )

        if pool is not None:
            pool.close()
            pool.join()

        return resultList

    @staticmethod
    def getTargetList(parsedCmd, **kwargs):
        """Get a list of (dataRef, kwargs) for `TaskRunner.__call__`.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            The parsed command object returned by
            `lsst.pipe.base.argumentParser.ArgumentParser.parse_args`.
        kwargs
            Any additional keyword arguments. In the default `TaskRunner` this
            is an empty dict, but having it simplifies overriding `TaskRunner`
            for tasks whose runDataRef method takes additional arguments
            (see case (1) below).

        Notes
        -----
        The default implementation of `TaskRunner.getTargetList` and
        `TaskRunner.__call__` works for any command-line task whose
        ``runDataRef`` method takes exactly one argument: a data reference.
        Otherwise you must provide a variant of TaskRunner that overrides
        `TaskRunner.getTargetList` and possibly `TaskRunner.__call__`.
        There are two cases.

        **Case 1**

        If your command-line task has a ``runDataRef`` method that takes one
        data reference followed by additional arguments, then you need only
        override `TaskRunner.getTargetList` to return the additional
        arguments as an argument dict. To make this easier, your overridden
        version of `~TaskRunner.getTargetList` may call
        `TaskRunner.getTargetList` with the extra arguments as keyword
        arguments. For example, the following adds an argument dict containing
        a single key: "calExpList", whose value is the list of data IDs for
        the calexp ID argument:

        .. code-block:: python

            def getTargetList(parsedCmd):
                return TaskRunner.getTargetList(
                    parsedCmd,
                    calExpList=parsedCmd.calexp.idList
                )

        It is equivalent to this slightly longer version:

        .. code-block:: python

            @staticmethod
            def getTargetList(parsedCmd):
                argDict = dict(calExpList=parsedCmd.calexp.idList)
                return [(dataId, argDict) for dataId in parsedCmd.id.idList]

        **Case 2**

        If your task does not meet condition (1) then you must override both
        TaskRunner.getTargetList and `TaskRunner.__call__`. You may do this
        however you see fit, so long as `TaskRunner.getTargetList`
        returns a list, each of whose elements is sent to
        `TaskRunner.__call__`, which runs your task.
        """
        return [(ref, kwargs) for ref in parsedCmd.id.refList]

    def makeTask(self, parsedCmd=None, args=None):
        """Create a Task instance.

        Parameters
        ----------
        parsedCmd
            Parsed command-line options (used for extra task args by some task
            runners).
        args
            Args tuple passed to `TaskRunner.__call__` (used for extra task
            arguments by some task runners).

        Notes
        -----
        ``makeTask`` can be called with either the ``parsedCmd`` argument or
        ``args`` argument set to None, but it must construct identical Task
        instances in either case.

        Subclasses may ignore this method entirely if they reimplement both
        `TaskRunner.precall` and `TaskRunner.__call__`.
        """
        return self.TaskClass(config=self.config, log=self.log)

    def _precallImpl(self, task, parsedCmd):
        """The main work of `precall`.

        We write package versions, schemas and configs, or compare these to
        existing files on disk if present.
        """
        if not parsedCmd.noVersions:
            task.writePackageVersions(parsedCmd.butler, clobber=parsedCmd.clobberVersions)
        task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)
        task.writeSchemas(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)

    def precall(self, parsedCmd):
        """Hook for code that should run exactly once, before multiprocessing.

        Notes
        -----
        Must return True if `TaskRunner.__call__` should subsequently be
        called.

        .. warning::

           Implementations must take care to ensure that no unpicklable
           attributes are added to the TaskRunner itself, for compatibility
           with multiprocessing.

        The default implementation writes package versions, schemas and
        configs, or compares them to existing files on disk if present.
        """
        task = self.makeTask(parsedCmd=parsedCmd)

        if self.doRaise:
            # --doraise: let initialization errors propagate for debugging.
            self._precallImpl(task, parsedCmd)
        else:
            try:
                self._precallImpl(task, parsedCmd)
            except Exception as e:
                task.log.fatal("Failed in task initialization: %s", e)
                # TaskError messages are self-explanatory; only print a
                # traceback for unexpected exception types.
                if not isinstance(e, TaskError):
                    traceback.print_exc(file=sys.stderr)
                return False
        return True

    def __call__(self, args):
        """Run the Task on a single target.

        Parameters
        ----------
        args
            Arguments for Task.runDataRef()

        Returns
        -------
        struct : `lsst.pipe.base.Struct`
            Contains these fields if ``doReturnResults`` is `True`:

            - ``dataRef``: the provided data reference.
            - ``metadata``: task metadata after execution of run.
            - ``result``: result returned by task run, or `None` if the task
              fails.
            - ``exitStatus``: 0 if the task completed successfully, 1
              otherwise.

            If ``doReturnResults`` is `False` the struct contains:

            - ``exitStatus``: 0 if the task completed successfully, 1
              otherwise.

        Notes
        -----
        This default implementation assumes that the ``args`` is a tuple
        containing a data reference and a dict of keyword arguments.

        .. warning::

           If you override this method and wish to return something when
           ``doReturnResults`` is `False`, then it must be picklable to
           support multiprocessing and it should be small enough that pickling
           and unpickling do not add excessive overhead.
        """
        dataRef, kwargs = args
        if self.log is None:
            # We are in a multiprocessing worker: prepareForMultiProcessing
            # nulled the log, so recreate a default one here.
            self.log = lsst.utils.logging.getLogger()
        # Tag log output with the data ID(s) being processed via the log
        # Mapped Diagnostic Context; removed again before returning.
        if hasattr(dataRef, "dataId"):
            lsst.log.MDC("LABEL", str(dataRef.dataId))
        elif isinstance(dataRef, (list, tuple)):
            lsst.log.MDC("LABEL", str([ref.dataId for ref in dataRef if hasattr(ref, "dataId")]))
        task = self.makeTask(args=args)
        result = None  # in case the task fails
        exitStatus = 0  # exit status for the shell
        if self.doRaise:
            result = self.runTask(task, dataRef, kwargs)
        else:
            try:
                result = self.runTask(task, dataRef, kwargs)
            except Exception as e:
                # The shell exit value will be the number of dataRefs returning
                # non-zero, so the actual value used here is lost.
                exitStatus = 1

                # don't use a try block as we need to preserve the original
                # exception
                eName = type(e).__name__
                if hasattr(dataRef, "dataId"):
                    task.log.fatal("Failed on dataId=%s: %s: %s", dataRef.dataId, eName, e)
                elif isinstance(dataRef, (list, tuple)):
                    task.log.fatal(
                        "Failed on dataIds=[%s]: %s: %s",
                        ", ".join(str(ref.dataId) for ref in dataRef),
                        eName,
                        e,
                    )
                else:
                    task.log.fatal("Failed on dataRef=%s: %s: %s", dataRef, eName, e)

                if not isinstance(e, TaskError):
                    traceback.print_exc(file=sys.stderr)

        # Ensure all errors have been logged and aren't hanging around in a
        # buffer
        sys.stdout.flush()
        sys.stderr.flush()

        # Metadata is written even when the task failed (exitStatus == 1).
        task.writeMetadata(dataRef)

        # remove MDC so it does not show up outside of task context
        lsst.log.MDCRemove("LABEL")

        if self.doReturnResults:
            return Struct(
                exitStatus=exitStatus,
                dataRef=dataRef,
                metadata=task.metadata,
                result=result,
            )
        else:
            return Struct(
                exitStatus=exitStatus,
            )

    def runTask(self, task, dataRef, kwargs):
        """Make the actual call to `runDataRef` for this task.

        Parameters
        ----------
        task : `lsst.pipe.base.CmdLineTask` class
            The class of the task to run.
        dataRef
            Butler data reference that contains the data the task will process.
        kwargs
            Any additional keyword arguments. See `TaskRunner.getTargetList`
            above.

        Notes
        -----
        The default implementation of `TaskRunner.runTask` works for any
        command-line task which has a ``runDataRef`` method that takes a data
        reference and an optional set of additional keyword arguments.
        This method returns the results generated by the task's `runDataRef`
        method.
        """
        return task.runDataRef(dataRef, **kwargs)
class LegacyTaskRunner(TaskRunner):
    r"""A `TaskRunner` variant for `CmdLineTask`\ s whose entry point is the
    `Task`\ 's `run` method (taking a `dataRef`) instead of ``runDataRef``.
    """

    def runTask(self, task, dataRef, kwargs):
        """Invoke the task's ``run`` method on the data reference, rather
        than ``runDataRef``; see `TaskRunner.runTask` for details.
        """
        return task.run(dataRef, **kwargs)
class ButlerInitializedTaskRunner(TaskRunner):
    r"""A `TaskRunner` for `CmdLineTask`\ s whose constructors require a
    ``butler`` keyword argument.
    """

    def makeTask(self, parsedCmd=None, args=None):
        """Construct the task, supplying a ``butler`` keyword argument taken
        from the parsed command or from the data reference.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            Parsed command-line options, as returned by the
            `~lsst.pipe.base.ArgumentParser`; if specified then args is
            ignored.
        args
            Other arguments; if ``parsedCmd`` is `None` then this must be
            specified.

        Raises
        ------
        RuntimeError
            Raised if ``parsedCmd`` and ``args`` are both `None`.
        """
        # Guard clause: at least one source for the butler is required.
        if parsedCmd is None and args is None:
            raise RuntimeError("parsedCmd or args must be specified")
        if parsedCmd is not None:
            # parsedCmd takes precedence when both are given.
            butler = parsedCmd.butler
        else:
            # args is the (dataRef, kwargs) tuple passed to __call__; the
            # kwargs dict is not needed here.
            dataRef, _ = args
            butler = dataRef.butlerSubset.butler
        return self.TaskClass(config=self.config, log=self.log, butler=butler)
class CmdLineTask(Task):
    """Base class for command-line tasks: tasks that may be executed from the
    command-line.

    Notes
    -----
    See :ref:`task-framework-overview` to learn what tasks are and
    :ref:`creating-a-command-line-task` for more information about writing
    command-line tasks.

    Subclasses must specify the following class variables:

    - ``ConfigClass``: configuration class for your task (a subclass of
      `lsst.pex.config.Config`, or if your task needs no configuration, then
      `lsst.pex.config.Config` itself).
    - ``_DefaultName``: default name used for this task (a `str`).

    Subclasses may also specify the following class variables:

    - ``RunnerClass``: a task runner class. The default is ``TaskRunner``,
      which works for any task with a runDataRef method that takes exactly one
      argument: a data reference. If your task does not meet this requirement
      then you must supply a variant of ``TaskRunner``; see ``TaskRunner``
      for more information.
    - ``canMultiprocess``: the default is `True`; set `False` if your task
      does not support multiprocessing.

    Subclasses must specify a method named ``runDataRef``:

    - By default ``runDataRef`` accepts a single butler data reference, but
      you can specify an alternate task runner (subclass of ``TaskRunner``) as
      the value of class variable ``RunnerClass`` if your run method needs
      something else.
    - ``runDataRef`` is expected to return its data in a
      `lsst.pipe.base.Struct`. This provides safety for evolution of the task
      since new values may be added without harming existing code.
    - The data returned by ``runDataRef`` must be picklable if your task is to
      support multiprocessing.
    """

    RunnerClass = TaskRunner
    canMultiprocess = True

    @classmethod
    def applyOverrides(cls, config):
        """A hook to allow a task to change the values of its config *after*
        the camera-specific overrides are loaded but before any command-line
        overrides are applied.

        Parameters
        ----------
        config : instance of task's ``ConfigClass``
            Task configuration.

        Notes
        -----
        This is necessary in some cases because the camera-specific overrides
        may retarget subtasks, wiping out changes made in
        ConfigClass.setDefaults. See LSST Trac ticket #2282 for more
        discussion.

        .. warning::

           This is called by CmdLineTask.parseAndRun; other ways of
           constructing a config will not apply these overrides.
        """
        pass

    @classmethod
    def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False):
        """Parse an argument list and run the command.

        Parameters
        ----------
        args : `list`, optional
            List of command-line arguments; if `None` use `sys.argv`.
        config : `lsst.pex.config.Config`-type, optional
            Config for task. If `None` use `Task.ConfigClass`.
        log : `logging.Logger`-type, optional
            Log. If `None` use the default log.
        doReturnResults : `bool`, optional
            If `True`, return the results of this task. Default is `False`.
            This is only intended for unit tests and similar use. It can
            easily exhaust memory (if the task returns enough data and you
            call it enough times) and it will fail when using multiprocessing
            if the returned data cannot be pickled.

        Returns
        -------
        struct : `lsst.pipe.base.Struct`
            Fields are:

            ``argumentParser``
                the argument parser (`lsst.pipe.base.ArgumentParser`).
            ``parsedCmd``
                the parsed command returned by the argument parser's
                `~lsst.pipe.base.ArgumentParser.parse_args` method
                (`argparse.Namespace`).
            ``taskRunner``
                the task runner used to run the task (an instance of
                `Task.RunnerClass`).
            ``resultList``
                results returned by the task runner's ``run`` method, one entry
                per invocation (`list`). This will typically be a list of
                `Struct`, each containing at least an ``exitStatus`` integer
                (0 or 1); see `Task.RunnerClass` (`TaskRunner` by default) for
                more details.

        Notes
        -----
        Calling this method with no arguments specified is the standard way to
        run a command-line task from the command-line. For an example see
        ``pipe_tasks`` ``bin/makeSkyMap.py`` or almost any other file in that
        directory.

        If one or more of the dataIds fails then this routine will exit (with
        a status giving the number of failed dataIds) rather than returning
        this struct; this behaviour can be overridden by specifying the
        ``--noExit`` command-line option.
        """
        if args is None:
            commandAsStr = " ".join(sys.argv)
            args = sys.argv[1:]
        else:
            # Reconstruct an approximation of the invocation for logging when
            # called programmatically rather than from a shell.
            commandAsStr = "{}{}".format(lsst.utils.introspection.get_caller_name(stacklevel=1), tuple(args))

        argumentParser = cls._makeArgumentParser()
        if config is None:
            config = cls.ConfigClass()
        parsedCmd = argumentParser.parse_args(config=config, args=args, log=log, override=cls.applyOverrides)
        # print this message after parsing the command so the log is fully
        # configured
        parsedCmd.log.info("Running: %s", commandAsStr)

        taskRunner = cls.RunnerClass(TaskClass=cls, parsedCmd=parsedCmd, doReturnResults=doReturnResults)
        resultList = taskRunner.run(parsedCmd)

        try:
            nFailed = sum(((res.exitStatus != 0) for res in resultList))
        except (TypeError, AttributeError) as e:
            # NOTE: TypeError if resultList is None, AttributeError if it
            # doesn't have exitStatus.
            parsedCmd.log.warning("Unable to retrieve exit status (%s); assuming success", e)
            nFailed = 0

        if nFailed > 0:
            if parsedCmd.noExit:
                parsedCmd.log.error("%d dataRefs failed; not exiting as --noExit was set", nFailed)
            else:
                # Exit status conveys the number of failed dataRefs to the
                # shell.
                sys.exit(nFailed)

        return Struct(
            argumentParser=argumentParser,
            parsedCmd=parsedCmd,
            taskRunner=taskRunner,
            resultList=resultList,
        )

    @classmethod
    def _makeArgumentParser(cls):
        """Create and return an argument parser.

        Returns
        -------
        parser : `lsst.pipe.base.ArgumentParser`
            The argument parser for this task.

        Notes
        -----
        By default this returns an `~lsst.pipe.base.ArgumentParser` with one
        ID argument named `--id` of dataset type ``raw``.

        Your task subclass may need to override this method to change the
        dataset type or data ref level, or to add additional data ID arguments.
        If you add additional data ID arguments or your task's runDataRef
        method takes more than a single data reference then you will also have
        to provide a task-specific task runner (see TaskRunner for more
        information).
        """
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument(
            name="--id", datasetType="raw", help="data IDs, e.g. --id visit=12345 ccd=1,2^0,3"
        )
        return parser

    def writeConfig(self, butler, clobber=False, doBackup=True):
        """Write the configuration used for processing the data, or check that
        an existing one is equal to the new one if present.

        Parameters
        ----------
        butler : `lsst.daf.persistence.Butler`
            Data butler used to write the config. The config is written to
            dataset type `CmdLineTask._getConfigName`.
        clobber : `bool`, optional
            A boolean flag that controls what happens if a config already has
            been saved:

            - `True`: overwrite or rename the existing config, depending on
              ``doBackup``.
            - `False`: raise `TaskError` if this config does not match the
              existing config.
        doBackup : `bool`, optional
            Set to `True` to backup the config files if clobbering.
        """
        configName = self._getConfigName()
        if configName is None:
            # This task has opted out of config persistence.
            return
        if clobber:
            butler.put(self.config, configName, doBackup=doBackup)
        elif butler.datasetExists(configName, write=True):
            # this may be subject to a race condition; see #2789
            try:
                oldConfig = butler.get(configName, immediate=True)
            except Exception as exc:
                # Bug fix: the message previously contained the literal text
                # "(exc)" because the braces were missing; interpolate the
                # exception as in the matching message in
                # writePackageVersions.
                raise type(exc)(
                    f"Unable to read stored config file {configName} ({exc}); "
                    "consider using --clobber-config"
                )

            def logConfigMismatch(msg):
                self.log.fatal("Comparing configuration: %s", msg)

            if not self.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
                raise TaskError(
                    f"Config does not match existing task config {configName!r} on disk; "
                    "tasks configurations must be consistent within the same output repo "
                    "(override with --clobber-config)"
                )
        else:
            butler.put(self.config, configName)

    def writeSchemas(self, butler, clobber=False, doBackup=True):
        """Write the schemas returned by
        `lsst.pipe.base.Task.getAllSchemaCatalogs`.

        Parameters
        ----------
        butler : `lsst.daf.persistence.Butler`
            Data butler used to write the schema. Each schema is written to the
            dataset type specified as the key in the dict returned by
            `~lsst.pipe.base.Task.getAllSchemaCatalogs`.
        clobber : `bool`, optional
            A boolean flag that controls what happens if a schema already has
            been saved:

            - `True`: overwrite or rename the existing schema, depending on
              ``doBackup``.
            - `False`: raise `TaskError` if this schema does not match the
              existing schema.
        doBackup : `bool`, optional
            Set to `True` to backup the schema files if clobbering.

        Notes
        -----
        If ``clobber`` is `False` and an existing schema does not match a
        current schema, then some schemas may have been saved successfully
        and others may not, and there is no easy way to tell which is which.
        """
        for dataset, catalog in self.getAllSchemaCatalogs().items():
            schemaDataset = dataset + "_schema"
            if clobber:
                butler.put(catalog, schemaDataset, doBackup=doBackup)
            elif butler.datasetExists(schemaDataset, write=True):
                oldSchema = butler.get(schemaDataset, immediate=True).getSchema()
                if not oldSchema.compare(catalog.getSchema(), afwTable.Schema.IDENTICAL):
                    raise TaskError(
                        f"New schema does not match schema {dataset!r} on disk; "
                        "schemas must be consistent within the same output repo "
                        "(override with --clobber-config)"
                    )
            else:
                butler.put(catalog, schemaDataset)

    def writeMetadata(self, dataRef):
        """Write the metadata produced from processing the data.

        Parameters
        ----------
        dataRef
            Butler data reference used to write the metadata.
            The metadata is written to dataset type
            `CmdLineTask._getMetadataName`.
        """
        try:
            metadataName = self._getMetadataName()
            if metadataName is not None:
                dataRef.put(self.getFullMetadata(), metadataName)
        except Exception as e:
            # Metadata persistence is best-effort: log and continue rather
            # than failing the whole task invocation.
            self.log.warning("Could not persist metadata for dataId=%s: %s", dataRef.dataId, e)

    def writePackageVersions(self, butler, clobber=False, doBackup=True, dataset="packages"):
        """Compare and write package versions.

        Parameters
        ----------
        butler : `lsst.daf.persistence.Butler`
            Data butler used to read/write the package versions.
        clobber : `bool`, optional
            A boolean flag that controls what happens if versions already have
            been saved:

            - `True`: overwrite or rename the existing version info, depending
              on ``doBackup``.
            - `False`: raise `TaskError` if this version info does not match
              the existing.
        doBackup : `bool`, optional
            If `True` and clobbering, old package version files are backed up.
        dataset : `str`, optional
            Name of dataset to read/write.

        Raises
        ------
        TaskError
            Raised if there is a version mismatch with current and persisted
            lists of package versions.

        Notes
        -----
        Note that this operation is subject to a race condition.
        """
        packages = Packages.fromSystem()

        if clobber:
            return butler.put(packages, dataset, doBackup=doBackup)
        if not butler.datasetExists(dataset, write=True):
            return butler.put(packages, dataset)

        try:
            old = butler.get(dataset, immediate=True)
        except Exception as exc:
            raise type(exc)(
                f"Unable to read stored version dataset {dataset} ({exc}); "
                "consider using --clobber-versions or --no-versions"
            )
        # Note that because we can only detect python modules that have been
        # imported, the stored list of products may be more or less complete
        # than what we have now. What's important is that the products that
        # are in common have the same version.
        diff = packages.difference(old)
        if diff:
            versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
            raise TaskError(
                f"Version mismatch ({versions_str}); consider using --clobber-versions or --no-versions"
            )
        # Update the old set of packages in case we have more packages that
        # haven't been persisted.
        extra = packages.extra(old)
        if extra:
            old.update(packages)
            butler.put(old, dataset, doBackup=doBackup)

    def _getConfigName(self):
        """Get the name of the config dataset type, or `None` if config is not
        to be persisted.

        Notes
        -----
        The name may depend on the config; that is why this is not a class
        method.
        """
        return self._DefaultName + "_config"

    def _getMetadataName(self):
        """Get the name of the metadata dataset type, or `None` if metadata is
        not to be persisted.

        Notes
        -----
        The name may depend on the config; that is why this is not a class
        method.
        """
        return self._DefaultName + "_metadata"