Coverage for python/lsst/ctrl/mpexec/singleQuantumExecutor.py: 10%
190 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-09 02:48 -0700
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-09 02:48 -0700
1# This file is part of ctrl_mpexec.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22__all__ = ["SingleQuantumExecutor"]
24# -------------------------------
25# Imports of standard modules --
26# -------------------------------
27import logging
28import sys
29import time
30from collections import defaultdict
31from collections.abc import Callable
32from itertools import chain
33from typing import Any, Optional
35from lsst.daf.butler import (
36 Butler,
37 CollectionType,
38 DatasetRef,
39 DatasetType,
40 LimitedButler,
41 NamedKeyDict,
42 Quantum,
43)
44from lsst.daf.butler.registry.wildcards import CollectionWildcard
45from lsst.pipe.base import (
46 AdjustQuantumHelper,
47 ButlerQuantumContext,
48 Instrument,
49 InvalidQuantumError,
50 NoWorkFound,
51 PipelineTask,
52 RepeatableQuantumError,
53 TaskDef,
54 TaskFactory,
55)
57# During metadata transition phase, determine metadata class by
58# asking pipe_base
59from lsst.pipe.base.task import _TASK_FULL_METADATA_TYPE, _TASK_METADATA_TYPE
60from lsst.utils.timer import logInfo
62# -----------------------------
63# Imports for other modules --
64# -----------------------------
65from .log_capture import LogCapture
66from .quantumGraphExecutor import QuantumExecutor
67from .reports import QuantumReport
69# ----------------------------------
70# Local non-exported definitions --
71# ----------------------------------
73_LOG = logging.getLogger(__name__)
class SingleQuantumExecutor(QuantumExecutor):
    """Executor class which runs one Quantum at a time.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler` or `None`
        Data butler, `None` means that Quantum-backed butler should be used
        instead.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Instance of a task factory.
    skipExistingIn
        Expressions representing the collections to search for existing
        output datasets. See :ref:`daf_butler_ordered_collection_searches`
        for allowed types. This class only checks for the presence of butler
        output run in the list of collections. If the output run is present
        in the list then the quanta whose complete outputs exist in the output
        run will be skipped. `None` or empty string/sequence disables skipping.
    clobberOutputs : `bool`, optional
        If `True`, then outputs from a quantum that exist in output run
        collection will be removed prior to executing a quantum. If
        ``skipExistingIn`` contains output run, then only partial outputs from
        a quantum will be removed. Only used when ``butler`` is not `None`.
    enableLsstDebug : `bool`, optional
        Enable debugging with ``lsstDebug`` facility for a task.
    exitOnKnownError : `bool`, optional
        If `True`, call `sys.exit` with the appropriate exit code for special
        known exceptions, after printing a traceback, instead of letting the
        exception propagate up to calling. This is always the behavior for
        InvalidQuantumError.
    limited_butler_factory : `Callable`, optional
        A method that creates a `~lsst.daf.butler.LimitedButler` instance
        for a given Quantum. This parameter must be defined if ``butler`` is
        `None`. If ``butler`` is not `None` then this parameter is ignored.
    """

    def __init__(
        self,
        butler: Butler | None,
        taskFactory: TaskFactory,
        skipExistingIn: Any = None,
        clobberOutputs: bool = False,
        enableLsstDebug: bool = False,
        exitOnKnownError: bool = False,
        limited_butler_factory: Callable[[Quantum], LimitedButler] | None = None,
    ):
        self.butler = butler
        self.taskFactory = taskFactory
        self.enableLsstDebug = enableLsstDebug
        self.clobberOutputs = clobberOutputs
        self.exitOnKnownError = exitOnKnownError
        self.limited_butler_factory = limited_butler_factory
        # Report for the most recent execute() call; stays None until then.
        self.report: QuantumReport | None = None

        if self.butler is None:
            assert limited_butler_factory is not None, "limited_butler_factory is needed when butler is None"

        # Find whether output run is in skipExistingIn.
        # TODO: This duplicates logic in GraphBuilder, would be nice to have
        # better abstraction for this some day.
        self.skipExisting = False
        if self.butler is not None and skipExistingIn:
            skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
            # As optimization check in the explicit list of names first
            self.skipExisting = self.butler.run in skip_collections_wildcard.strings
            if not self.skipExisting:
                # need to flatten it and check again
                self.skipExisting = self.butler.run in self.butler.registry.queryCollections(
                    skipExistingIn,
                    collectionTypes=CollectionType.RUN,
                )
147 def execute(self, taskDef: TaskDef, quantum: Quantum) -> Quantum:
148 # Docstring inherited from QuantumExecutor.execute
149 assert quantum.dataId is not None, "Quantum DataId cannot be None"
151 if self.butler is not None:
152 self.butler.registry.refresh()
154 # Catch any exception and make a report based on that.
155 try:
156 result = self._execute(taskDef, quantum)
157 self.report = QuantumReport(dataId=quantum.dataId, taskLabel=taskDef.label)
158 return result
159 except Exception as exc:
160 self.report = QuantumReport.from_exception(
161 exception=exc,
162 dataId=quantum.dataId,
163 taskLabel=taskDef.label,
164 )
165 raise
167 def _execute(self, taskDef: TaskDef, quantum: Quantum) -> Quantum:
168 """Internal implementation of execute()"""
169 startTime = time.time()
171 # Make a limited butler instance if needed (which should be QBB if full
172 # butler is not defined).
173 limited_butler: LimitedButler
174 if self.butler is not None:
175 limited_butler = self.butler
176 else:
177 # We check this in constructor, but mypy needs this check here.
178 assert self.limited_butler_factory is not None
179 limited_butler = self.limited_butler_factory(quantum)
181 if self.butler is not None:
182 log_capture = LogCapture.from_full(self.butler)
183 else:
184 log_capture = LogCapture.from_limited(limited_butler)
185 with log_capture.capture_logging(taskDef, quantum) as captureLog:
186 # Save detailed resource usage before task start to metadata.
187 quantumMetadata = _TASK_METADATA_TYPE()
188 logInfo(None, "prep", metadata=quantumMetadata) # type: ignore[arg-type]
190 # check whether to skip or delete old outputs, if it returns True
191 # or raises an exception do not try to store logs, as they may be
192 # already in butler.
193 captureLog.store = False
194 if self.checkExistingOutputs(quantum, taskDef, limited_butler):
195 _LOG.info(
196 "Skipping already-successful quantum for label=%s dataId=%s.",
197 taskDef.label,
198 quantum.dataId,
199 )
200 return quantum
201 captureLog.store = True
203 try:
204 quantum = self.updatedQuantumInputs(quantum, taskDef, limited_butler)
205 except NoWorkFound as exc:
206 _LOG.info(
207 "Nothing to do for task '%s' on quantum %s; saving metadata and skipping: %s",
208 taskDef.label,
209 quantum.dataId,
210 str(exc),
211 )
212 # Make empty metadata that looks something like what a
213 # do-nothing task would write (but we don't bother with empty
214 # nested PropertySets for subtasks). This is slightly
215 # duplicative with logic in pipe_base that we can't easily call
216 # from here; we'll fix this on DM-29761.
217 logInfo(None, "end", metadata=quantumMetadata) # type: ignore[arg-type]
218 fullMetadata = _TASK_FULL_METADATA_TYPE()
219 fullMetadata[taskDef.label] = _TASK_METADATA_TYPE()
220 fullMetadata["quantum"] = quantumMetadata
221 self.writeMetadata(quantum, fullMetadata, taskDef, limited_butler)
222 return quantum
224 # enable lsstDebug debugging
225 if self.enableLsstDebug:
226 try:
227 _LOG.debug("Will try to import debug.py")
228 import debug # type: ignore # noqa:F401
229 except ImportError:
230 _LOG.warn("No 'debug' module found.")
232 # initialize global state
233 self.initGlobals(quantum)
235 # Ensure that we are executing a frozen config
236 taskDef.config.freeze()
237 logInfo(None, "init", metadata=quantumMetadata) # type: ignore[arg-type]
238 init_input_refs = list(quantum.initInputs.values())
239 task = self.taskFactory.makeTask(taskDef, limited_butler, init_input_refs)
240 logInfo(None, "start", metadata=quantumMetadata) # type: ignore[arg-type]
241 try:
242 self.runQuantum(task, quantum, taskDef, limited_butler)
243 except Exception as e:
244 _LOG.error(
245 "Execution of task '%s' on quantum %s failed. Exception %s: %s",
246 taskDef.label,
247 quantum.dataId,
248 e.__class__.__name__,
249 str(e),
250 )
251 raise
252 logInfo(None, "end", metadata=quantumMetadata) # type: ignore[arg-type]
253 fullMetadata = task.getFullMetadata()
254 fullMetadata["quantum"] = quantumMetadata
255 self.writeMetadata(quantum, fullMetadata, taskDef, limited_butler)
256 stopTime = time.time()
257 _LOG.info(
258 "Execution of task '%s' on quantum %s took %.3f seconds",
259 taskDef.label,
260 quantum.dataId,
261 stopTime - startTime,
262 )
263 return quantum
    def checkExistingOutputs(self, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler) -> bool:
        """Decide whether this quantum needs to be executed.

        If only partial outputs exist then they are removed if
        ``clobberOutputs`` is True, otherwise an exception is raised.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum to check for existing outputs
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler used to check for stored datasets.

        Returns
        -------
        exist : `bool`
            `True` if ``self.skipExisting`` is defined, and a previous
            execution of this quanta appears to have completed successfully
            (either because metadata was written or all datasets were written).
            `False` otherwise.

        Raises
        ------
        RuntimeError
            Raised if some outputs exist and some not.
        """
        if not self.butler:
            # Skip/prune logic only works for full butler.
            return False

        if self.skipExisting:
            # Metadata output exists; this is sufficient to assume the previous
            # run was successful and should be skipped.
            [metadata_ref] = quantum.outputs[taskDef.metadataDatasetName]
            # NOTE(review): unpacking yields a DatasetRef; the None check below
            # looks defensive — confirm whether refs can actually be None here.
            if metadata_ref is not None:
                if limited_butler.stored(metadata_ref):
                    return True

        # Find and prune (partial) outputs if `self.clobberOutputs` is set.
        # stored_many maps each ref to a bool; split into existing/missing.
        ref_dict = self.butler.stored_many(chain.from_iterable(quantum.outputs.values()))
        existingRefs = [ref for ref, exists in ref_dict.items() if exists]
        missingRefs = [ref for ref, exists in ref_dict.items() if not exists]
        if existingRefs:
            if not missingRefs:
                # Full outputs exist.
                if self.skipExisting:
                    return True
                elif self.clobberOutputs:
                    _LOG.info("Removing complete outputs for quantum %s: %s", quantum, existingRefs)
                    self.butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True)
                else:
                    raise RuntimeError(
                        f"Complete outputs exists for a quantum {quantum} "
                        "and neither clobberOutputs nor skipExisting is set: "
                        f"collection={self.butler.run} existingRefs={existingRefs}"
                    )
            else:
                # Partial outputs from a failed quantum.
                _LOG.debug(
                    "Partial outputs exist for quantum %s collection=%s existingRefs=%s missingRefs=%s",
                    quantum,
                    self.butler.run,
                    existingRefs,
                    missingRefs,
                )
                if self.clobberOutputs:
                    # only prune
                    _LOG.info("Removing partial outputs for task %s: %s", taskDef, existingRefs)
                    self.butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True)
                    return False
                else:
                    raise RuntimeError(
                        "Registry inconsistency while checking for existing quantum outputs:"
                        f" quantum={quantum} collection={self.butler.run} existingRefs={existingRefs}"
                        f" missingRefs={missingRefs}"
                    )

        # By default always execute.
        return False
    def updatedQuantumInputs(
        self, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler
    ) -> Quantum:
        """Update quantum with extra information, returns a new updated
        Quantum.

        Some methods may require input DatasetRefs to have non-None
        ``dataset_id``, but in case of intermediate dataset it may not be
        filled during QuantumGraph construction. This method will retrieve
        missing info from registry.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        limited_butler : `~lsst.daf.butler.LimitedButler`
            Butler used to check for stored datasets.

        Returns
        -------
        update : `~lsst.daf.butler.Quantum`
            Updated Quantum instance
        """
        anyChanges = False
        updatedInputs: defaultdict[DatasetType, list] = defaultdict(list)
        for key, refsForDatasetType in quantum.inputs.items():
            newRefsForDatasetType = updatedInputs[key]
            # Bulk existence check for all refs of this dataset type.
            stored = limited_butler.stored_many(refsForDatasetType)
            for ref in refsForDatasetType:
                # Inputs may already be resolved even if they do not exist, but
                # we have to re-resolve them because IDs are ignored on output.
                # Check datastore for existence first to cover calibration
                # dataset types, as they would need a timespan for findDataset.
                resolvedRef: DatasetRef | None
                if stored[ref]:
                    resolvedRef = ref
                elif self.butler is not None:
                    # This branch is for mock execution only which does not
                    # generate actual outputs, only adds datasets to registry.
                    resolvedRef = self.butler.registry.findDataset(ref.datasetType, ref.dataId)
                    if resolvedRef is None:
                        _LOG.info("No dataset found for %s", ref)
                        continue
                    else:
                        _LOG.debug("Updated dataset ID for %s", ref)
                else:
                    # QBB with missing intermediate
                    _LOG.info("No dataset found for %s", ref)
                    continue

                # ref_stored may be None when resolvedRef differs from the
                # original ref and was never in the bulk-check mapping.
                if (ref_stored := stored.get(resolvedRef)) or (
                    ref_stored is None and limited_butler.stored(resolvedRef)
                ):
                    # We need to ask datastore if the dataset actually exists
                    # because the Registry of a local "execution butler"
                    # cannot know this (because we prepopulate it with all of
                    # the datasets that might be created). Either we have
                    # already checked and know the answer, or the resolved
                    # ref differed from the original and we have to ask
                    # explicitly for that.
                    newRefsForDatasetType.append(resolvedRef)

            if len(newRefsForDatasetType) != len(refsForDatasetType):
                anyChanges = True
        # If we removed any input datasets, let the task check if it has enough
        # to proceed and/or prune related datasets that it also doesn't
        # need/produce anymore. It will raise NoWorkFound if it can't run,
        # which we'll let propagate up. This is exactly what we run during QG
        # generation, because a task shouldn't care whether an input is missing
        # because some previous task didn't produce it, or because it just
        # wasn't there during QG generation.
        namedUpdatedInputs = NamedKeyDict[DatasetType, list[DatasetRef]](updatedInputs.items())
        helper = AdjustQuantumHelper(namedUpdatedInputs, quantum.outputs)
        if anyChanges:
            assert quantum.dataId is not None, "Quantum DataId cannot be None"
            helper.adjust_in_place(taskDef.connections, label=taskDef.label, data_id=quantum.dataId)
        return Quantum(
            taskName=quantum.taskName,
            taskClass=quantum.taskClass,
            dataId=quantum.dataId,
            initInputs=quantum.initInputs,
            inputs=helper.inputs,
            outputs=helper.outputs,
        )
430 def runQuantum(
431 self, task: PipelineTask, quantum: Quantum, taskDef: TaskDef, limited_butler: LimitedButler
432 ) -> None:
433 """Execute task on a single quantum.
435 Parameters
436 ----------
437 task : `~lsst.pipe.base.PipelineTask`
438 Task object.
439 quantum : `~lsst.daf.butler.Quantum`
440 Single Quantum instance.
441 taskDef : `~lsst.pipe.base.TaskDef`
442 Task definition structure.
443 """
444 # Create a butler that operates in the context of a quantum
445 butlerQC = ButlerQuantumContext(limited_butler, quantum)
447 # Get the input and output references for the task
448 inputRefs, outputRefs = taskDef.connections.buildDatasetRefs(quantum)
450 # Call task runQuantum() method. Catch a few known failure modes and
451 # translate them into specific
452 try:
453 task.runQuantum(butlerQC, inputRefs, outputRefs)
454 except NoWorkFound as err:
455 # Not an error, just an early exit.
456 _LOG.info("Task '%s' on quantum %s exited early: %s", taskDef.label, quantum.dataId, str(err))
457 pass
458 except RepeatableQuantumError as err:
459 if self.exitOnKnownError:
460 _LOG.warning("Caught repeatable quantum error for %s (%s):", taskDef, quantum.dataId)
461 _LOG.warning(err, exc_info=True)
462 sys.exit(err.EXIT_CODE)
463 else:
464 raise
465 except InvalidQuantumError as err:
466 _LOG.fatal("Invalid quantum error for %s (%s): %s", taskDef, quantum.dataId)
467 _LOG.fatal(err, exc_info=True)
468 sys.exit(err.EXIT_CODE)
470 def writeMetadata(
471 self, quantum: Quantum, metadata: Any, taskDef: TaskDef, limited_butler: LimitedButler
472 ) -> None:
473 # DatasetRef has to be in the Quantum outputs, can lookup by name
474 try:
475 [ref] = quantum.outputs[taskDef.metadataDatasetName]
476 except LookupError as exc:
477 raise InvalidQuantumError(
478 f"Quantum outputs is missing metadata dataset type {taskDef.metadataDatasetName};"
479 " this could happen due to inconsistent options between QuantumGraph generation"
480 " and execution"
481 ) from exc
482 limited_butler.put(metadata, ref)
484 def initGlobals(self, quantum: Quantum) -> None:
485 """Initialize global state needed for task execution.
487 Parameters
488 ----------
489 quantum : `~lsst.daf.butler.Quantum`
490 Single Quantum instance.
492 Notes
493 -----
494 There is an issue with initializing filters singleton which is done
495 by instrument, to avoid requiring tasks to do it in runQuantum()
496 we do it here when any dataId has an instrument dimension. Also for
497 now we only allow single instrument, verify that all instrument
498 names in all dataIds are identical.
500 This will need revision when filter singleton disappears.
501 """
502 # can only work for full butler
503 if self.butler is None:
504 return
505 oneInstrument = None
506 for datasetRefs in chain(quantum.inputs.values(), quantum.outputs.values()):
507 for datasetRef in datasetRefs:
508 dataId = datasetRef.dataId
509 instrument = dataId.get("instrument")
510 if instrument is not None:
511 if oneInstrument is not None:
512 assert ( # type: ignore
513 instrument == oneInstrument
514 ), "Currently require that only one instrument is used per graph"
515 else:
516 oneInstrument = instrument
517 Instrument.fromName(instrument, self.butler.registry)
519 def getReport(self) -> Optional[QuantumReport]:
520 # Docstring inherited from base class
521 if self.report is None:
522 raise RuntimeError("getReport() called before execute()")
523 return self.report