Coverage for python/lsst/ctrl/mpexec/singleQuantumExecutor.py: 11%
203 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-10 03:29 -0700
1# This file is part of ctrl_mpexec.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28__all__ = ["SingleQuantumExecutor"]
30# -------------------------------
31# Imports of standard modules --
32# -------------------------------
33import logging
34import sys
35import time
36import warnings
37from collections import defaultdict
38from collections.abc import Callable
39from itertools import chain
40from typing import Any, cast
42from lsst.daf.butler import (
43 Butler,
44 CollectionType,
45 DatasetRef,
46 DatasetType,
47 LimitedButler,
48 NamedKeyDict,
49 Quantum,
50)
51from lsst.daf.butler.registry.wildcards import CollectionWildcard
52from lsst.pipe.base import (
53 AdjustQuantumHelper,
54 ExecutionResources,
55 Instrument,
56 InvalidQuantumError,
57 NoWorkFound,
58 PipelineTask,
59 QuantumContext,
60 RepeatableQuantumError,
61 TaskDef,
62 TaskFactory,
63)
64from lsst.pipe.base.pipeline_graph import PipelineGraph, TaskNode
66# During metadata transition phase, determine metadata class by
67# asking pipe_base
68from lsst.pipe.base.task import _TASK_FULL_METADATA_TYPE, _TASK_METADATA_TYPE
69from lsst.utils.introspection import find_outside_stacklevel
70from lsst.utils.timer import logInfo
72# -----------------------------
73# Imports for other modules --
74# -----------------------------
75from .log_capture import LogCapture
76from .quantumGraphExecutor import QuantumExecutor
77from .reports import QuantumReport
79# ----------------------------------
80# Local non-exported definitions --
81# ----------------------------------
83_LOG = logging.getLogger(__name__)
class SingleQuantumExecutor(QuantumExecutor):
    """Executor class which runs one Quantum at a time.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler` or `None`
        Data butler, `None` means that Quantum-backed butler should be used
        instead.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Instance of a task factory.
    skipExistingIn : `~typing.Any`
        Expressions representing the collections to search for existing
        output datasets. See :ref:`daf_butler_ordered_collection_searches`
        for allowed types. This class only checks for the presence of butler
        output run in the list of collections. If the output run is present
        in the list then the quanta whose complete outputs exist in the output
        run will be skipped. `None` or empty string/sequence disables skipping.
    clobberOutputs : `bool`, optional
        If `True`, then outputs from a quantum that exist in output run
        collection will be removed prior to executing a quantum. If
        ``skipExistingIn`` contains output run, then only partial outputs from
        a quantum will be removed. Only used when ``butler`` is not `None`.
    enableLsstDebug : `bool`, optional
        Enable debugging with ``lsstDebug`` facility for a task.
    exitOnKnownError : `bool`, optional
        If `True`, call `sys.exit` with the appropriate exit code for special
        known exceptions, after printing a traceback, instead of letting the
        exception propagate up to calling. This is always the behavior for
        InvalidQuantumError.
    limited_butler_factory : `Callable`, optional
        A method that creates a `~lsst.daf.butler.LimitedButler` instance
        for a given Quantum. This parameter must be defined if ``butler`` is
        `None`. If ``butler`` is not `None` then this parameter is ignored.
    resources : `~lsst.pipe.base.ExecutionResources`, optional
        The resources available to this quantum when executing.
    skipExisting : `bool`, optional
        If `True`, skip quanta whose metadata datasets are already stored.
        Unlike ``skipExistingIn``, this works with limited butlers as well as
        full butlers. Always set to `True` if ``skipExistingIn`` matches
        ``butler.run``.
    """

    def __init__(
        self,
        butler: Butler | None,
        taskFactory: TaskFactory,
        skipExistingIn: Any = None,
        clobberOutputs: bool = False,
        enableLsstDebug: bool = False,
        exitOnKnownError: bool = False,
        limited_butler_factory: Callable[[Quantum], LimitedButler] | None = None,
        resources: ExecutionResources | None = None,
        skipExisting: bool = False,
    ):
        self.butler = butler
        self.taskFactory = taskFactory
        self.enableLsstDebug = enableLsstDebug
        self.clobberOutputs = clobberOutputs
        self.exitOnKnownError = exitOnKnownError
        self.limited_butler_factory = limited_butler_factory
        self.report: QuantumReport | None = None
        self.resources = resources

        if self.butler is None:
            assert limited_butler_factory is not None, "limited_butler_factory is needed when butler is None"

        # Find whether output run is in skipExistingIn.
        # TODO: This duplicates logic in GraphBuilder, would be nice to have
        # better abstraction for this some day.
        self.skipExisting = skipExisting
        if self.butler is not None and skipExistingIn:
            skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
            # As optimization check in the explicit list of names first
            self.skipExisting = self.butler.run in skip_collections_wildcard.strings
            if not self.skipExisting:
                # need to flatten it and check again
                self.skipExisting = self.butler.run in self.butler.registry.queryCollections(
                    skipExistingIn,
                    collectionTypes=CollectionType.RUN,
                )

    def execute(self, task_node: TaskDef | TaskNode, /, quantum: Quantum) -> Quantum:
        # Docstring inherited from QuantumExecutor.execute
        assert quantum.dataId is not None, "Quantum DataId cannot be None"

        task_node = self._conform_task_def(task_node)
        if self.butler is not None:
            self.butler.registry.refresh()

        # Catch any exception and make a report based on that.
        try:
            result = self._execute(task_node, quantum)
            self.report = QuantumReport(dataId=quantum.dataId, taskLabel=task_node.label)
            return result
        except Exception as exc:
            self.report = QuantumReport.from_exception(
                exception=exc,
                dataId=quantum.dataId,
                taskLabel=task_node.label,
            )
            raise

    def _conform_task_def(self, task_node: TaskDef | TaskNode) -> TaskNode:
        """Convert the given object to a TaskNode and emit a deprecation
        warning if it isn't one already.
        """
        # TODO: remove this function and all call points on DM-40443, and
        # fix annotations and docstrings for those methods as well.
        if isinstance(task_node, TaskDef):
            warnings.warn(
                "Passing TaskDef to SingleQuantumExecutor methods is deprecated "
                "and will not be supported after v27.",
                FutureWarning,
                find_outside_stacklevel("lsst.ctrl.mpexec"),
            )
            # Convert to a real TaskNode to avoid a warnings cascade.
            pipeline_graph = PipelineGraph()
            return pipeline_graph.add_task(
                task_node.label, task_node.taskClass, task_node.config, connections=task_node.connections
            )
        return task_node

    def _execute(self, task_node: TaskNode, /, quantum: Quantum) -> Quantum:
        """Execute the quantum.

        Internal implementation of `execute()`.
        """
        startTime = time.time()

        # Make a limited butler instance if needed (which should be QBB if full
        # butler is not defined).
        limited_butler: LimitedButler
        if self.butler is not None:
            limited_butler = self.butler
        else:
            # We check this in constructor, but mypy needs this check here.
            assert self.limited_butler_factory is not None
            limited_butler = self.limited_butler_factory(quantum)

        if self.butler is not None:
            log_capture = LogCapture.from_full(self.butler)
        else:
            log_capture = LogCapture.from_limited(limited_butler)
        with log_capture.capture_logging(task_node, quantum) as captureLog:
            # Save detailed resource usage before task start to metadata.
            quantumMetadata = _TASK_METADATA_TYPE()
            logInfo(None, "prep", metadata=quantumMetadata)  # type: ignore[arg-type]

            _LOG.info(
                "Preparing execution of quantum for label=%s dataId=%s.", task_node.label, quantum.dataId
            )

            # check whether to skip or delete old outputs, if it returns True
            # or raises an exception do not try to store logs, as they may be
            # already in butler.
            captureLog.store = False
            if self.checkExistingOutputs(quantum, task_node, limited_butler):
                _LOG.info(
                    "Skipping already-successful quantum for label=%s dataId=%s.",
                    task_node.label,
                    quantum.dataId,
                )
                return quantum
            captureLog.store = True

            try:
                quantum = self.updatedQuantumInputs(quantum, task_node, limited_butler)
            except NoWorkFound as exc:
                _LOG.info(
                    "Nothing to do for task '%s' on quantum %s; saving metadata and skipping: %s",
                    task_node.label,
                    quantum.dataId,
                    str(exc),
                )
                # Make empty metadata that looks something like what a
                # do-nothing task would write (but we don't bother with empty
                # nested PropertySets for subtasks).  This is slightly
                # duplicative with logic in pipe_base that we can't easily call
                # from here; we'll fix this on DM-29761.
                logInfo(None, "end", metadata=quantumMetadata)  # type: ignore[arg-type]
                fullMetadata = _TASK_FULL_METADATA_TYPE()
                fullMetadata[task_node.label] = _TASK_METADATA_TYPE()
                fullMetadata["quantum"] = quantumMetadata
                self.writeMetadata(quantum, fullMetadata, task_node, limited_butler)
                return quantum

            # enable lsstDebug debugging
            if self.enableLsstDebug:
                try:
                    _LOG.debug("Will try to import debug.py")
                    import debug  # type: ignore # noqa:F401
                except ImportError:
                    # Logger.warn is deprecated in favor of Logger.warning.
                    _LOG.warning("No 'debug' module found.")

            # initialize global state
            self.initGlobals(quantum)

            # Ensure that we are executing a frozen config
            task_node.config.freeze()
            logInfo(None, "init", metadata=quantumMetadata)  # type: ignore[arg-type]
            init_input_refs = list(quantum.initInputs.values())

            _LOG.info(
                "Constructing task and executing quantum for label=%s dataId=%s.",
                task_node.label,
                quantum.dataId,
            )
            task = self.taskFactory.makeTask(task_node, limited_butler, init_input_refs)
            logInfo(None, "start", metadata=quantumMetadata)  # type: ignore[arg-type]
            try:
                self.runQuantum(task, quantum, task_node, limited_butler)
            except Exception as e:
                _LOG.error(
                    "Execution of task '%s' on quantum %s failed. Exception %s: %s",
                    task_node.label,
                    quantum.dataId,
                    e.__class__.__name__,
                    str(e),
                )
                raise
            logInfo(None, "end", metadata=quantumMetadata)  # type: ignore[arg-type]
            fullMetadata = task.getFullMetadata()
            fullMetadata["quantum"] = quantumMetadata
            self.writeMetadata(quantum, fullMetadata, task_node, limited_butler)
            stopTime = time.time()
            _LOG.info(
                "Execution of task '%s' on quantum %s took %.3f seconds",
                task_node.label,
                quantum.dataId,
                stopTime - startTime,
            )
        return quantum

    def checkExistingOutputs(
        self, quantum: Quantum, task_node: TaskDef | TaskNode, /, limited_butler: LimitedButler
    ) -> bool:
        """Decide whether this quantum needs to be executed.

        If only partial outputs exist then they are removed if
        ``clobberOutputs`` is True, otherwise an exception is raised.

        The ``LimitedButler`` is used for everything, and should be set to
        ``self.butler`` if no separate ``LimitedButler`` is available.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum to check for existing outputs.
        task_node : `~lsst.pipe.base.TaskDef` or \
                `~lsst.pipe.base.pipeline_graph.TaskNode`
            Task definition structure.  `~lsst.pipe.base.TaskDef` support is
            deprecated and will be removed after v27.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler to use for querying and clobbering.

        Returns
        -------
        exist : `bool`
            `True` if ``self.skipExisting`` is defined, and a previous
            execution of this quanta appears to have completed successfully
            (either because metadata was written or all datasets were written).
            `False` otherwise.

        Raises
        ------
        RuntimeError
            Raised if some outputs exist and some not.
        """
        task_node = self._conform_task_def(task_node)

        if not self.butler:
            # Skip/prune logic only works for full butler.
            return False

        if self.skipExisting:
            _LOG.debug(
                "Checking existence of metadata from previous execution of label=%s dataId=%s.",
                task_node.label,
                quantum.dataId,
            )
            # Metadata output exists; this is sufficient to assume the previous
            # run was successful and should be skipped.
            [metadata_ref] = quantum.outputs[task_node.metadata_output.dataset_type_name]
            if metadata_ref is not None:
                if limited_butler.stored(metadata_ref):
                    return True

        # Find and prune (partial) outputs if `self.clobberOutputs` is set.
        _LOG.debug(
            "Looking for existing outputs in the way for label=%s dataId=%s.", task_node.label, quantum.dataId
        )
        ref_dict = limited_butler.stored_many(chain.from_iterable(quantum.outputs.values()))
        existingRefs = [ref for ref, exists in ref_dict.items() if exists]
        missingRefs = [ref for ref, exists in ref_dict.items() if not exists]
        if existingRefs:
            if not missingRefs:
                # Full outputs exist.
                if self.skipExisting:
                    return True
                elif self.clobberOutputs:
                    _LOG.info("Removing complete outputs for quantum %s: %s", quantum, existingRefs)
                    limited_butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True)
                else:
                    raise RuntimeError(
                        f"Complete outputs exists for a quantum {quantum} "
                        "and neither clobberOutputs nor skipExisting is set: "
                        f"existingRefs={existingRefs}"
                    )
            else:
                # Partial outputs from a failed quantum.
                _LOG.debug(
                    "Partial outputs exist for quantum %s existingRefs=%s missingRefs=%s",
                    quantum,
                    existingRefs,
                    missingRefs,
                )
                if self.clobberOutputs:
                    # only prune
                    _LOG.info("Removing partial outputs for task %s: %s", task_node.label, existingRefs)
                    limited_butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True)
                    return False
                else:
                    raise RuntimeError(
                        "Registry inconsistency while checking for existing quantum outputs:"
                        f" quantum={quantum} existingRefs={existingRefs}"
                        f" missingRefs={missingRefs}"
                    )

        # By default always execute.
        return False

    def updatedQuantumInputs(
        self, quantum: Quantum, task_node: TaskDef | TaskNode, /, limited_butler: LimitedButler
    ) -> Quantum:
        """Update quantum with extra information, returns a new updated
        Quantum.

        Some methods may require input DatasetRefs to have non-None
        ``dataset_id``, but in case of intermediate dataset it may not be
        filled during QuantumGraph construction. This method will retrieve
        missing info from registry.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        task_node : `~lsst.pipe.base.TaskDef` or \
                `~lsst.pipe.base.pipeline_graph.TaskNode`
            Task definition structure.  `~lsst.pipe.base.TaskDef` support is
            deprecated and will be removed after v27.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler to use for querying.

        Returns
        -------
        update : `~lsst.daf.butler.Quantum`
            Updated Quantum instance.
        """
        task_node = self._conform_task_def(task_node)

        anyChanges = False
        updatedInputs: defaultdict[DatasetType, list] = defaultdict(list)
        for key, refsForDatasetType in quantum.inputs.items():
            _LOG.debug(
                "Checking existence of input '%s' for label=%s dataId=%s.",
                key.name,
                task_node.label,
                quantum.dataId,
            )
            newRefsForDatasetType = updatedInputs[key]
            stored = limited_butler.stored_many(refsForDatasetType)
            for ref in refsForDatasetType:
                if stored[ref]:
                    newRefsForDatasetType.append(ref)
                else:
                    # This should only happen if a predicted intermediate was
                    # not actually produced upstream, but
                    # datastore misconfigurations can unfortunately also land
                    # us here.
                    _LOG.info("No dataset artifact found for %s", ref)
                    continue
            if len(newRefsForDatasetType) != len(refsForDatasetType):
                anyChanges = True
        # If we removed any input datasets, let the task check if it has enough
        # to proceed and/or prune related datasets that it also doesn't
        # need/produce anymore.  It will raise NoWorkFound if it can't run,
        # which we'll let propagate up.  This is exactly what we run during QG
        # generation, because a task shouldn't care whether an input is missing
        # because some previous task didn't produce it, or because it just
        # wasn't there during QG generation.
        namedUpdatedInputs = NamedKeyDict[DatasetType, list[DatasetRef]](updatedInputs.items())
        helper = AdjustQuantumHelper(namedUpdatedInputs, quantum.outputs)
        if anyChanges:
            _LOG.debug("Running adjustQuantum for label=%s dataId=%s.", task_node.label, quantum.dataId)
            assert quantum.dataId is not None, "Quantum DataId cannot be None"
            helper.adjust_in_place(task_node.get_connections(), label=task_node.label, data_id=quantum.dataId)
        return Quantum(
            taskName=quantum.taskName,
            taskClass=quantum.taskClass,
            dataId=quantum.dataId,
            initInputs=quantum.initInputs,
            inputs=helper.inputs,
            outputs=helper.outputs,
        )

    def runQuantum(
        self,
        task: PipelineTask,
        quantum: Quantum,
        task_node: TaskDef | TaskNode,
        /,
        limited_butler: LimitedButler,
    ) -> None:
        """Execute task on a single quantum.

        Parameters
        ----------
        task : `~lsst.pipe.base.PipelineTask`
            Task object.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        task_node : `~lsst.pipe.base.TaskDef` or \
                `~lsst.pipe.base.pipeline_graph.TaskNode`
            Task definition structure.  `~lsst.pipe.base.TaskDef` support is
            deprecated and will be removed after v27.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler to use for dataset I/O.
        """
        task_node = self._conform_task_def(task_node)

        # Create a butler that operates in the context of a quantum
        butlerQC = QuantumContext(limited_butler, quantum, resources=self.resources)

        # Get the input and output references for the task
        inputRefs, outputRefs = task_node.get_connections().buildDatasetRefs(quantum)
        # Call task runQuantum() method.  Catch a few known failure modes and
        # translate them into specific
        try:
            task.runQuantum(butlerQC, inputRefs, outputRefs)
        except NoWorkFound as err:
            # Not an error, just an early exit.
            _LOG.info("Task '%s' on quantum %s exited early: %s", task_node.label, quantum.dataId, str(err))
        except RepeatableQuantumError as err:
            if self.exitOnKnownError:
                _LOG.warning("Caught repeatable quantum error for %s (%s):", task_node.label, quantum.dataId)
                _LOG.warning(err, exc_info=True)
                sys.exit(err.EXIT_CODE)
            else:
                raise
        except InvalidQuantumError as err:
            # The format string has three placeholders; previously only two
            # arguments were supplied, which made this log call itself raise a
            # formatting error on the fatal path.  Pass the error text as the
            # third argument.
            _LOG.fatal(
                "Invalid quantum error for %s (%s): %s", task_node.label, quantum.dataId, str(err)
            )
            _LOG.fatal(err, exc_info=True)
            sys.exit(err.EXIT_CODE)

    def writeMetadata(
        self, quantum: Quantum, metadata: Any, task_node: TaskDef | TaskNode, /, limited_butler: LimitedButler
    ) -> None:
        """Store task metadata in the butler.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum whose outputs contain the metadata dataset ref.
        metadata : `~typing.Any`
            Metadata object to store.
        task_node : `~lsst.pipe.base.TaskDef` or \
                `~lsst.pipe.base.pipeline_graph.TaskNode`
            Task definition structure.  `~lsst.pipe.base.TaskDef` support is
            deprecated and will be removed after v27.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler to use for dataset I/O.

        Raises
        ------
        InvalidQuantumError
            Raised if the metadata dataset type is missing from the quantum
            outputs.
        """
        # DatasetRef has to be in the Quantum outputs, can lookup by name
        task_node = self._conform_task_def(task_node)
        try:
            [ref] = quantum.outputs[task_node.metadata_output.dataset_type_name]
        except LookupError as exc:
            raise InvalidQuantumError(
                "Quantum outputs is missing metadata dataset type "
                f"{task_node.metadata_output.dataset_type_name};"
                " this could happen due to inconsistent options between QuantumGraph generation"
                " and execution"
            ) from exc
        limited_butler.put(metadata, ref)

    def initGlobals(self, quantum: Quantum) -> None:
        """Initialize global state needed for task execution.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.

        Notes
        -----
        There is an issue with initializing filters singleton which is done
        by instrument, to avoid requiring tasks to do it in runQuantum()
        we do it here when any dataId has an instrument dimension. Also for
        now we only allow single instrument, verify that all instrument
        names in all dataIds are identical.

        This will need revision when filter singleton disappears.
        """
        # can only work for full butler
        if self.butler is None:
            return
        oneInstrument = None
        for datasetRefs in chain(quantum.inputs.values(), quantum.outputs.values()):
            for datasetRef in datasetRefs:
                dataId = datasetRef.dataId
                instrument = cast(str, dataId.get("instrument"))
                if instrument is not None:
                    if oneInstrument is not None:
                        assert (  # type: ignore
                            instrument == oneInstrument
                        ), "Currently require that only one instrument is used per graph"
                    else:
                        oneInstrument = instrument
                        Instrument.fromName(instrument, self.butler.registry)

    def getReport(self) -> QuantumReport | None:
        # Docstring inherited from base class
        if self.report is None:
            raise RuntimeError("getReport() called before execute()")
        return self.report