Coverage for python/lsst/ctrl/mpexec/singleQuantumExecutor.py: 12%
# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ["SingleQuantumExecutor"]

# -------------------------------
# Imports of standard modules --
# -------------------------------
import logging
import os
import shutil
import sys
import tempfile
import time
from collections import defaultdict
from contextlib import contextmanager
from itertools import chain
from logging import FileHandler
from typing import List

from lsst.daf.butler import DatasetRef, DatasetType, FileDataset, NamedKeyDict, Quantum
from lsst.daf.butler.core.logging import ButlerLogRecordHandler, ButlerLogRecords, ButlerMDC, JsonLogFormatter
from lsst.obs.base import Instrument
from lsst.pipe.base import (
    AdjustQuantumHelper,
    ButlerQuantumContext,
    InvalidQuantumError,
    NoWorkFound,
    RepeatableQuantumError,
)
from lsst.pipe.base.configOverrides import ConfigOverrides

# During metadata transition phase, determine metadata class by
# asking pipe_base
from lsst.pipe.base.task import _TASK_FULL_METADATA_TYPE, _TASK_METADATA_TYPE
from lsst.utils.timer import logInfo

# -----------------------------
# Imports for other modules --
# -----------------------------
from .mock_task import MockButlerQuantumContext, MockPipelineTask
from .quantumGraphExecutor import QuantumExecutor

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__)


class _LogCaptureFlag:
    """Simple flag to enable/disable log-to-butler saving."""

    store: bool = True


class SingleQuantumExecutor(QuantumExecutor):
    """Executor class which runs one Quantum at a time.

    Parameters
    ----------
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Instance of a task factory.
    skipExistingIn : `list` [ `str` ], optional
        List of collections; if all of a Quantum's outputs already exist in
        any of these collections, that Quantum will not be rerun.
    clobberOutputs : `bool`, optional
        If `True`, existing outputs in the output run collection will be
        overwritten. If ``skipExistingIn`` is defined, only outputs from
        failed quanta will be overwritten.
    enableLsstDebug : `bool`, optional
        Enable debugging with the ``lsstDebug`` facility for a task.
    exitOnKnownError : `bool`, optional
        If `True`, call `sys.exit` with the appropriate exit code for special
        known exceptions, after printing a traceback, instead of letting the
        exception propagate up to the caller. This is always the behavior for
        `InvalidQuantumError`.
    mock : `bool`, optional
        If `True`, mock task execution.
    mock_configs : `list` [ `_PipelineAction` ], optional
        Optional config overrides for mock tasks.
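
    Notes
    -----
    A minimal usage sketch; ``taskFactory``, ``taskDef``, ``quantum``, and
    ``butler`` stand in for caller-supplied instances and ``"run1"`` for an
    existing output collection:

    .. code-block:: py

       executor = SingleQuantumExecutor(taskFactory, skipExistingIn=["run1"])
       executor.execute(taskDef, quantum, butler)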
102 """

    stream_json_logs = True
    """If `True`, each log record is written to a temporary file and ingested
    when the quantum completes. If `False`, the records are accumulated in
    memory and stored in the butler on quantum completion."""

    def __init__(
        self,
        taskFactory,
        skipExistingIn=None,
        clobberOutputs=False,
        enableLsstDebug=False,
        exitOnKnownError=False,
        mock=False,
        mock_configs=None,
    ):
        self.taskFactory = taskFactory
        self.skipExistingIn = skipExistingIn
        self.enableLsstDebug = enableLsstDebug
        self.clobberOutputs = clobberOutputs
        self.exitOnKnownError = exitOnKnownError
        self.mock = mock
        self.mock_configs = mock_configs
        self.log_handler = None

    def execute(self, taskDef, quantum, butler):
        # Docstring inherited from QuantumExecutor.execute
        startTime = time.time()

        with self.captureLogging(taskDef, quantum, butler) as captureLog:

            # Save detailed resource usage before task start to metadata.
            quantumMetadata = _TASK_METADATA_TYPE()
            logInfo(None, "prep", metadata=quantumMetadata)

            taskClass, label, config = taskDef.taskClass, taskDef.label, taskDef.config

            # Check whether to skip or delete old outputs. If the check
            # returns True or raises an exception, do not try to store logs,
            # as they may already be in the butler.
            captureLog.store = False
            if self.checkExistingOutputs(quantum, butler, taskDef):
                _LOG.info(
                    "Skipping already-successful quantum for label=%s dataId=%s.", label, quantum.dataId
                )
                return
            captureLog.store = True

            try:
                quantum = self.updatedQuantumInputs(quantum, butler, taskDef)
            except NoWorkFound as exc:
                _LOG.info(
                    "Nothing to do for task '%s' on quantum %s; saving metadata and skipping: %s",
                    taskDef.label,
                    quantum.dataId,
                    str(exc),
                )
                # Make empty metadata that looks something like what a
                # do-nothing task would write (but we don't bother with empty
                # nested PropertySets for subtasks). This is slightly
                # duplicative with logic in pipe_base that we can't easily call
                # from here; we'll fix this on DM-29761.
                logInfo(None, "end", metadata=quantumMetadata)
                fullMetadata = _TASK_FULL_METADATA_TYPE()
                fullMetadata[taskDef.label] = _TASK_METADATA_TYPE()
                fullMetadata["quantum"] = quantumMetadata
                self.writeMetadata(quantum, fullMetadata, taskDef, butler)
                return

            # enable lsstDebug debugging
            if self.enableLsstDebug:
                try:
                    _LOG.debug("Will try to import debug.py")
                    import debug  # noqa:F401
                except ImportError:
                    _LOG.warning("No 'debug' module found.")

            # initialize global state
            self.initGlobals(quantum, butler)

            # Ensure that we are executing a frozen config
            config.freeze()
            logInfo(None, "init", metadata=quantumMetadata)
            task = self.makeTask(taskClass, label, config, butler)
            logInfo(None, "start", metadata=quantumMetadata)
            try:
                if self.mock:
                    # Use mock task instance to execute method.
                    runTask = self._makeMockTask(taskDef)
                else:
                    runTask = task
                self.runQuantum(runTask, quantum, taskDef, butler)
            except Exception as e:
                _LOG.error(
                    "Execution of task '%s' on quantum %s failed. Exception %s: %s",
                    taskDef.label,
                    quantum.dataId,
                    e.__class__.__name__,
                    str(e),
                )
                raise
            logInfo(None, "end", metadata=quantumMetadata)
            fullMetadata = task.getFullMetadata()
            fullMetadata["quantum"] = quantumMetadata
            self.writeMetadata(quantum, fullMetadata, taskDef, butler)
            stopTime = time.time()
            _LOG.info(
                "Execution of task '%s' on quantum %s took %.3f seconds",
                taskDef.label,
                quantum.dataId,
                stopTime - startTime,
            )
        return quantum

    def _makeMockTask(self, taskDef):
        """Make an instance of mock task for given TaskDef."""
        # Make config instance and apply overrides
        overrides = ConfigOverrides()
        for action in self.mock_configs:
            if action.label == taskDef.label + "-mock":
                if action.action == "config":
                    key, _, value = action.value.partition("=")
                    overrides.addValueOverride(key, value)
                elif action.action == "configfile":
                    overrides.addFileOverride(os.path.expandvars(action.value))
                else:
                    raise ValueError(f"Unexpected action for mock task config overrides: {action}")
        config = MockPipelineTask.ConfigClass()
        overrides.applyTo(config)

        task = MockPipelineTask(config=config, name=taskDef.label)
        return task

    @contextmanager
    def captureLogging(self, taskDef, quantum, butler):
        """Configure logging system to capture logs for execution of this task.

        Parameters
        ----------
        taskDef : `lsst.pipe.base.TaskDef`
            The task definition.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Butler to write logs to.

        Notes
        -----
        Expected to be used as a context manager to ensure that logging
        records are inserted into the butler once the quantum has been
        executed:

        .. code-block:: py

           with self.captureLogging(taskDef, quantum, butler):
               # Run quantum and capture logs.

        This method can also set up logging to attach task- or
        quantum-specific information to log messages. Potentially this can
        take into account some info from task configuration as well.
        """
        # Add a handler to the root logger to capture execution log output.
        # How does it get removed reliably?
        # Define tmpdir unconditionally so the cleanup in the "finally"
        # below always works.
        tmpdir = None
        if taskDef.logOutputDatasetName is not None:
            # Either accumulate into ButlerLogRecords or stream
            # JSON records to file and ingest that.
            if self.stream_json_logs:
                # Create the log file in a temporary directory rather than
                # creating a temporary file. This is necessary because
                # temporary files are created with restrictive permissions
                # and during file ingest these permissions persist in the
                # datastore. Using a temp directory allows us to create
                # a file with umask default permissions.
                tmpdir = tempfile.mkdtemp(prefix="butler-temp-logs-")

                # Construct a file to receive the log records and "touch" it.
                log_file = os.path.join(tmpdir, f"butler-log-{taskDef.label}.json")
                with open(log_file, "w"):
                    pass
                self.log_handler = FileHandler(log_file)
                self.log_handler.setFormatter(JsonLogFormatter())
            else:
                self.log_handler = ButlerLogRecordHandler()

            logging.getLogger().addHandler(self.log_handler)

        # Include the quantum dataId and task label in the MDC.
        label = taskDef.label
        if quantum.dataId:
            label += f":{quantum.dataId}"

        ctx = _LogCaptureFlag()
        try:
            with ButlerMDC.set_mdc({"LABEL": label, "RUN": butler.run}):
                yield ctx
        finally:
            # Ensure that the logs are stored in butler.
            self.writeLogRecords(quantum, taskDef, butler, ctx.store)
            if tmpdir:
                shutil.rmtree(tmpdir, ignore_errors=True)

    def checkExistingOutputs(self, quantum, butler, taskDef):
        """Decide whether this quantum needs to be executed.

        If only partial outputs exist then they are removed if
        ``clobberOutputs`` is `True`, otherwise an exception is raised.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum to check for existing outputs.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.

        Returns
        -------
        exist : `bool`
            `True` if ``self.skipExistingIn`` is defined, and a previous
            execution of this quantum appears to have completed successfully
            (either because metadata was written or all datasets were
            written). `False` otherwise.

        Raises
        ------
        RuntimeError
            Raised if some outputs exist and some do not.
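
        Notes
        -----
        Typical use, as in ``execute`` above (a sketch):

        .. code-block:: py

           if self.checkExistingOutputs(quantum, butler, taskDef):
               # All outputs already exist; skip this quantum.
               return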
332 """
        if self.skipExistingIn and taskDef.metadataDatasetName is not None:
            # Metadata output exists; this is sufficient to assume the
            # previous run was successful and should be skipped.
            ref = butler.registry.findDataset(
                taskDef.metadataDatasetName, quantum.dataId, collections=self.skipExistingIn
            )
            if ref is not None:
                if butler.datastore.exists(ref):
                    return True

        # Previously we always checked for existing outputs in `butler.run`;
        # now the logic is more complicated, as we only want to skip a
        # quantum whose outputs exist in `self.skipExistingIn`, but pruning
        # should only be done for outputs existing in `butler.run`.

        def findOutputs(collections):
            """Find quantum outputs in specified collections."""
            existingRefs = []
            missingRefs = []
            for datasetRefs in quantum.outputs.values():
                for datasetRef in datasetRefs:
                    ref = butler.registry.findDataset(
                        datasetRef.datasetType, datasetRef.dataId, collections=collections
                    )
                    if ref is not None and butler.datastore.exists(ref):
                        existingRefs.append(ref)
                    else:
                        missingRefs.append(datasetRef)
            return existingRefs, missingRefs

        existingRefs, missingRefs = findOutputs(self.skipExistingIn)
        if self.skipExistingIn:
            if existingRefs and not missingRefs:
                # everything is already there
                return True

        # If we are to re-run the quantum then prune datasets that exist in
        # the output run collection, but only if `self.clobberOutputs` is set.
        if existingRefs:
            existingRefs, missingRefs = findOutputs(butler.run)
            if existingRefs and missingRefs:
                _LOG.debug(
                    "Partial outputs exist for task %s dataId=%s collection=%s "
                    "existingRefs=%s missingRefs=%s",
                    taskDef,
                    quantum.dataId,
                    butler.run,
                    existingRefs,
                    missingRefs,
                )
                if self.clobberOutputs:
                    # only prune
                    _LOG.info("Removing partial outputs for task %s: %s", taskDef, existingRefs)
                    # Do not purge registry records if this looks like
                    # an execution butler. This ensures that the UUID
                    # of the dataset doesn't change.
                    if butler._allow_put_of_predefined_dataset:
                        purge = False
                        disassociate = False
                    else:
                        purge = True
                        disassociate = True
                    butler.pruneDatasets(existingRefs, disassociate=disassociate, unstore=True, purge=purge)
                    return False
                else:
                    raise RuntimeError(
                        f"Registry inconsistency while checking for existing outputs:"
                        f" collection={butler.run} existingRefs={existingRefs}"
                        f" missingRefs={missingRefs}"
                    )

        # need to re-run
        return False

    def makeTask(self, taskClass, name, config, butler):
        """Make new task instance.

        Parameters
        ----------
        taskClass : `type`
            Sub-class of `~lsst.pipe.base.PipelineTask`.
        name : `str`
            Name for this task.
        config : `~lsst.pipe.base.PipelineTaskConfig`
            Configuration object for this task.
        butler : `~lsst.daf.butler.Butler`
            Data butler.

        Returns
        -------
        task : `~lsst.pipe.base.PipelineTask`
            Instance of ``taskClass`` type.
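
        Notes
        -----
        A sketch of the typical call, as made from ``execute`` above:

        .. code-block:: py

           task = self.makeTask(taskDef.taskClass, taskDef.label, taskDef.config, butler)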
425 """
426 # call task factory for that
427 return self.taskFactory.makeTask(taskClass, name, config, None, butler)

    def updatedQuantumInputs(self, quantum, butler, taskDef):
        """Update quantum with extra information; returns a new, updated
        Quantum.

        Some methods may require input DatasetRefs to have a non-None
        ``dataset_id``, but in the case of an intermediate dataset it may not
        be filled in during QuantumGraph construction. This method retrieves
        the missing info from the registry.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.

        Returns
        -------
        update : `~lsst.daf.butler.Quantum`
            Updated Quantum instance.
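
        Notes
        -----
        The core of the update is resolving any ref that lacks a dataset ID,
        using the same registry call as the implementation below:

        .. code-block:: py

           if ref.id is None:
               resolvedRef = butler.registry.findDataset(
                   ref.datasetType, ref.dataId, collections=butler.collections
               )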
451 """
        anyChanges = False
        updatedInputs = defaultdict(list)
        for key, refsForDatasetType in quantum.inputs.items():
            newRefsForDatasetType = updatedInputs[key]
            for ref in refsForDatasetType:
                if ref.id is None:
                    resolvedRef = butler.registry.findDataset(
                        ref.datasetType, ref.dataId, collections=butler.collections
                    )
                    if resolvedRef is None:
                        _LOG.info("No dataset found for %s", ref)
                        continue
                    else:
                        _LOG.debug("Updated dataset ID for %s", ref)
                else:
                    resolvedRef = ref
                # We need to ask datastore if the dataset actually exists
                # because the Registry of a local "execution butler" cannot
                # know this (because we prepopulate it with all of the datasets
                # that might be created). In case of mock execution we check
                # that mock dataset exists instead.
                if self.mock:
                    try:
                        typeName, component = ref.datasetType.nameAndComponent()
                        if component is not None:
                            mockDatasetTypeName = MockButlerQuantumContext.mockDatasetTypeName(typeName)
                        else:
                            mockDatasetTypeName = MockButlerQuantumContext.mockDatasetTypeName(
                                ref.datasetType.name
                            )

                        mockDatasetType = butler.registry.getDatasetType(mockDatasetTypeName)
                    except KeyError:
                        # means that mock dataset type is not there and this
                        # should be a pre-existing dataset
                        _LOG.debug("No mock dataset type for %s", ref)
                        if butler.datastore.exists(resolvedRef):
                            newRefsForDatasetType.append(resolvedRef)
                    else:
                        mockRef = DatasetRef(mockDatasetType, ref.dataId)
                        resolvedMockRef = butler.registry.findDataset(
                            mockRef.datasetType, mockRef.dataId, collections=butler.collections
                        )
                        _LOG.debug("mockRef=%s resolvedMockRef=%s", mockRef, resolvedMockRef)
                        if resolvedMockRef is not None and butler.datastore.exists(resolvedMockRef):
                            _LOG.debug("resolvedMockRef dataset exists")
                            newRefsForDatasetType.append(resolvedRef)
                elif butler.datastore.exists(resolvedRef):
                    newRefsForDatasetType.append(resolvedRef)

            if len(newRefsForDatasetType) != len(refsForDatasetType):
                anyChanges = True
        # If we removed any input datasets, let the task check if it has enough
        # to proceed and/or prune related datasets that it also doesn't
        # need/produce anymore. It will raise NoWorkFound if it can't run,
        # which we'll let propagate up. This is exactly what we run during QG
        # generation, because a task shouldn't care whether an input is missing
        # because some previous task didn't produce it, or because it just
        # wasn't there during QG generation.
        updatedInputs = NamedKeyDict[DatasetType, List[DatasetRef]](updatedInputs.items())
        helper = AdjustQuantumHelper(updatedInputs, quantum.outputs)
        if anyChanges:
            helper.adjust_in_place(taskDef.connections, label=taskDef.label, data_id=quantum.dataId)
        return Quantum(
            taskName=quantum.taskName,
            taskClass=quantum.taskClass,
            dataId=quantum.dataId,
            initInputs=quantum.initInputs,
            inputs=helper.inputs,
            outputs=helper.outputs,
        )

    def runQuantum(self, task, quantum, taskDef, butler):
        """Execute task on a single quantum.

        Parameters
        ----------
        task : `~lsst.pipe.base.PipelineTask`
            Task object.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
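
        Notes
        -----
        The task sees only a quantum-scoped butler; a minimal sketch of the
        call sequence used below:

        .. code-block:: py

           butlerQC = ButlerQuantumContext(butler, quantum)
           inputRefs, outputRefs = taskDef.connections.buildDatasetRefs(quantum)
           task.runQuantum(butlerQC, inputRefs, outputRefs)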
537 """
        # Create a butler that operates in the context of a quantum
        if self.mock:
            butlerQC = MockButlerQuantumContext(butler, quantum)
        else:
            butlerQC = ButlerQuantumContext(butler, quantum)

        # Get the input and output references for the task
        inputRefs, outputRefs = taskDef.connections.buildDatasetRefs(quantum)

        # Call task runQuantum() method. Catch a few known failure modes and
        # translate them into specific exit codes.
        try:
            task.runQuantum(butlerQC, inputRefs, outputRefs)
        except NoWorkFound as err:
            # Not an error, just an early exit.
            _LOG.info("Task '%s' on quantum %s exited early: %s", taskDef.label, quantum.dataId, str(err))
        except RepeatableQuantumError as err:
            if self.exitOnKnownError:
                _LOG.warning("Caught repeatable quantum error for %s (%s):", taskDef, quantum.dataId)
                _LOG.warning(err, exc_info=True)
                sys.exit(err.EXIT_CODE)
            else:
                raise
        except InvalidQuantumError as err:
            _LOG.fatal("Invalid quantum error for %s (%s):", taskDef, quantum.dataId)
            _LOG.fatal(err, exc_info=True)
            sys.exit(err.EXIT_CODE)

    def writeMetadata(self, quantum, metadata, taskDef, butler):
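        """Store full task metadata in the butler.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Executed Quantum; its outputs must include the metadata dataset.
        metadata : `_TASK_FULL_METADATA_TYPE`
            Full task metadata produced by executing the quantum.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        """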
        if taskDef.metadataDatasetName is not None:
            # DatasetRef has to be in the Quantum outputs, can lookup by name
            try:
                ref = quantum.outputs[taskDef.metadataDatasetName]
            except LookupError as exc:
                raise InvalidQuantumError(
                    f"Quantum outputs is missing metadata dataset type {taskDef.metadataDatasetName};"
                    f" this could happen due to inconsistent options between QuantumGraph generation"
                    f" and execution"
                ) from exc
            butler.put(metadata, ref[0])

    def writeLogRecords(self, quantum, taskDef, butler, store):
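        """Store captured log records in the butler and remove the log
        handler installed by `captureLogging`.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Executed Quantum; its outputs must include the log dataset.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        store : `bool`
            If `True`, store the accumulated records; if `False`, only close
            and remove the handler and discard any temporary log file.
        """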
        # If we are logging to an external file we must always try to
        # close it.
        filename = None
        if isinstance(self.log_handler, FileHandler):
            filename = self.log_handler.stream.name
            self.log_handler.close()

        if self.log_handler is not None:
            # Remove the handler so we stop accumulating log messages.
            logging.getLogger().removeHandler(self.log_handler)

        try:
            if store and taskDef.logOutputDatasetName is not None and self.log_handler is not None:
                # DatasetRef has to be in the Quantum outputs, can lookup by
                # name
                try:
                    ref = quantum.outputs[taskDef.logOutputDatasetName]
                except LookupError as exc:
                    raise InvalidQuantumError(
                        f"Quantum outputs is missing log output dataset type {taskDef.logOutputDatasetName};"
                        f" this could happen due to inconsistent options between QuantumGraph generation"
                        f" and execution"
                    ) from exc

                if isinstance(self.log_handler, ButlerLogRecordHandler):
                    butler.put(self.log_handler.records, ref[0])

                    # Clear the records in case the handler is reused.
                    self.log_handler.records.clear()
                else:
                    assert filename is not None, "Somehow unable to extract filename from file handler"

                    # Need to ingest this file directly into butler.
                    dataset = FileDataset(path=filename, refs=ref[0])
                    try:
                        butler.ingest(dataset, transfer="move")
                        filename = None
                    except NotImplementedError:
                        # Some datastores can't receive files (e.g. in-memory
                        # datastore when testing), we store empty list for
                        # those just to have a dataset. Alternative is to read
                        # the file as a ButlerLogRecords object and put it.
                        _LOG.info(
                            "Log records could not be stored in this butler because the"
                            " datastore can not ingest files, empty record list is stored instead."
                        )
                        records = ButlerLogRecords.from_records([])
                        butler.put(records, ref[0])
        finally:
            # remove file if it is not ingested
            if filename is not None:
                try:
                    os.remove(filename)
                except OSError:
                    pass

    def initGlobals(self, quantum, butler):
        """Initialize global state needed for task execution.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Data butler.

        Notes
        -----
        There is an issue with initializing the filters singleton, which is
        done by instrument; to avoid requiring tasks to do it in
        ``runQuantum()`` we do it here when any dataId has an instrument
        dimension. Also, for now we only allow a single instrument: we verify
        that all instrument names in all dataIds are identical.

        This will need revision when the filter singleton disappears.
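
        The per-instrument initialization itself is a single call, sketched
        here with a hypothetical instrument name:

        .. code-block:: py

           Instrument.fromName("HSC", butler.registry)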
656 """
        oneInstrument = None
        for datasetRefs in chain(quantum.inputs.values(), quantum.outputs.values()):
            for datasetRef in datasetRefs:
                dataId = datasetRef.dataId
                instrument = dataId.get("instrument")
                if instrument is not None:
                    if oneInstrument is not None:
                        assert (
                            instrument == oneInstrument
                        ), "Currently require that only one instrument is used per graph"
                    else:
                        oneInstrument = instrument
                        Instrument.fromName(instrument, butler.registry)