# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
__all__ = ['SingleQuantumExecutor']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import logging
import os
import sys
import tempfile
import time
from contextlib import contextmanager
from collections import defaultdict
from itertools import chain
from logging import FileHandler
from typing import List

# -----------------------------
# Imports for other modules --
# -----------------------------
from .quantumGraphExecutor import QuantumExecutor
from lsst.daf.base import PropertyList, PropertySet
from lsst.obs.base import Instrument
from lsst.pipe.base import (
    AdjustQuantumHelper,
    ButlerQuantumContext,
    InvalidQuantumError,
    NoWorkFound,
    RepeatableQuantumError,
    logInfo,
)
from lsst.daf.butler import (
    DatasetRef,
    DatasetType,
    FileDataset,
    NamedKeyDict,
    Quantum,
)
from lsst.daf.butler.core.logging import (
    ButlerLogRecordHandler,
    ButlerLogRecords,
    ButlerMDC,
    JsonLogFormatter,
)

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


class _LogCaptureFlag:
    """Simple flag to enable/disable log-to-butler saving."""
    store: bool = True


class SingleQuantumExecutor(QuantumExecutor):
    """Executor class which runs one Quantum at a time.

    Parameters
    ----------
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Instance of a task factory.
    skipExistingIn : `list` [ `str` ], optional
        Accepts a list of collections; if all Quantum outputs already exist
        in the specified list of collections then that Quantum will not be
        rerun.
    clobberOutputs : `bool`, optional
        If `True`, then existing outputs in the output run collection will
        be overwritten.  If ``skipExistingIn`` is defined, only outputs from
        failed quanta will be overwritten.
    enableLsstDebug : `bool`, optional
        Enable debugging with the ``lsstDebug`` facility for a task.
    exitOnKnownError : `bool`, optional
        If `True`, call `sys.exit` with the appropriate exit code for special
        known exceptions, after printing a traceback, instead of letting the
        exception propagate up to the calling code.  This is always the
        behavior for InvalidQuantumError.
    """

    stream_json_logs = True
    """If True, each log record is written to a temporary file and ingested
    when the quantum completes.  If False, the records are accumulated in
    memory and stored in the butler on quantum completion."""
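
    # For example, a test harness that prefers in-memory accumulation over
    # streaming to a temporary JSON file could flip this class attribute
    # before executing any quanta (a sketch, not a documented API):
    #
    #     SingleQuantumExecutor.stream_json_logs = False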

    def __init__(self, taskFactory, skipExistingIn=None, clobberOutputs=False, enableLsstDebug=False,
                 exitOnKnownError=False):
        self.taskFactory = taskFactory
        self.skipExistingIn = skipExistingIn
        self.enableLsstDebug = enableLsstDebug
        self.clobberOutputs = clobberOutputs
        self.exitOnKnownError = exitOnKnownError
        self.log_handler = None

    def execute(self, taskDef, quantum, butler):
        # Docstring inherited from QuantumExecutor.execute
        startTime = time.time()

        with self.captureLogging(taskDef, quantum, butler) as captureLog:

            # Save detailed resource usage before task start to metadata.
            quantumMetadata = PropertyList()
            logInfo(None, "prep", metadata=quantumMetadata)

            taskClass, label, config = taskDef.taskClass, taskDef.label, taskDef.config

            # Check whether to skip or delete old outputs.  If this returns
            # True or raises an exception, do not try to store logs, as they
            # may already be in the butler.
            captureLog.store = False
            if self.checkExistingOutputs(quantum, butler, taskDef):
                _LOG.info("Skipping already-successful quantum for label=%s dataId=%s.", label,
                          quantum.dataId)
                return
            captureLog.store = True

            try:
                quantum = self.updatedQuantumInputs(quantum, butler, taskDef)
            except NoWorkFound as exc:
                _LOG.info("Nothing to do for task '%s' on quantum %s; saving metadata and skipping: %s",
                          taskDef.label, quantum.dataId, str(exc))
                # Make empty metadata that looks something like what a
                # do-nothing task would write (but we don't bother with empty
                # nested PropertySets for subtasks).  This is slightly
                # duplicative with logic in pipe_base that we can't easily
                # call from here; we'll fix this on DM-29761.
                logInfo(None, "end", metadata=quantumMetadata)
                fullMetadata = PropertySet()
                fullMetadata[taskDef.label] = PropertyList()
                fullMetadata["quantum"] = quantumMetadata
                self.writeMetadata(quantum, fullMetadata, taskDef, butler)
                return

            # Enable lsstDebug debugging.
            if self.enableLsstDebug:
                try:
                    _LOG.debug("Will try to import debug.py")
                    import debug  # noqa:F401
                except ImportError:
                    _LOG.warning("No 'debug' module found.")

            # Initialize global state.
            self.initGlobals(quantum, butler)

            # Ensure that we are executing a frozen config.
            config.freeze()
            logInfo(None, "init", metadata=quantumMetadata)
            task = self.makeTask(taskClass, label, config, butler)
            logInfo(None, "start", metadata=quantumMetadata)
            try:
                self.runQuantum(task, quantum, taskDef, butler)
            except Exception:
                _LOG.exception("Execution of task '%s' on quantum %s failed",
                               taskDef.label, quantum.dataId)
                raise
            logInfo(None, "end", metadata=quantumMetadata)
            fullMetadata = task.getFullMetadata()
            fullMetadata["quantum"] = quantumMetadata
            self.writeMetadata(quantum, fullMetadata, taskDef, butler)
        stopTime = time.time()
        _LOG.info("Execution of task '%s' on quantum %s took %.3f seconds",
                  taskDef.label, quantum.dataId, stopTime - startTime)

    @contextmanager
    def captureLogging(self, taskDef, quantum, butler):
        """Configure the logging system to capture logs for execution of
        this task.

        Parameters
        ----------
        taskDef : `lsst.pipe.base.TaskDef`
            The task definition.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Butler to write logs to.

        Notes
        -----
        Expected to be used as a context manager to ensure that logging
        records are inserted into the butler once the quantum has been
        executed:

        .. code-block:: py

           with self.captureLogging(taskDef, quantum, butler):
               # Run quantum and capture logs.

        This method can also set up logging to attach task- or
        quantum-specific information to log messages.  Potentially this can
        take into account some info from the task configuration as well.
        """
        # Add a handler to the root logger to capture execution log output.
        # How does it get removed reliably?
        if taskDef.logOutputDatasetName is not None:
            # Either accumulate into ButlerLogRecords or stream
            # JSON records to a file and ingest that.
            if self.stream_json_logs:
                tmp = tempfile.NamedTemporaryFile(mode="w",
                                                  suffix=".json",
                                                  prefix=f"butler-log-{taskDef.label}-",
                                                  delete=False)
                self.log_handler = FileHandler(tmp.name)
                tmp.close()
                self.log_handler.setFormatter(JsonLogFormatter())
            else:
                self.log_handler = ButlerLogRecordHandler()

            logging.getLogger().addHandler(self.log_handler)

        # Include the quantum dataId and task label in the MDC.
        label = taskDef.label
        if quantum.dataId:
            label += f":{quantum.dataId}"

        ctx = _LogCaptureFlag()
        try:
            with ButlerMDC.set_mdc({"LABEL": label}):
                yield ctx
        finally:
            # Ensure that the logs are stored in butler.
            self.writeLogRecords(quantum, taskDef, butler, ctx.store)

    def checkExistingOutputs(self, quantum, butler, taskDef):
        """Decide whether this quantum needs to be executed.

        If only partial outputs exist then they are removed if
        ``clobberOutputs`` is `True`, otherwise an exception is raised.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum to check for existing outputs.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.

        Returns
        -------
        exist : `bool`
            `True` if ``self.skipExistingIn`` is defined, and a previous
            execution of this quantum appears to have completed successfully
            (either because metadata was written or all datasets were
            written).  `False` otherwise.

        Raises
        ------
        RuntimeError
            Raised if some outputs exist and some do not.
        """
        if self.skipExistingIn and taskDef.metadataDatasetName is not None:
            # Metadata output exists; this is sufficient to assume the
            # previous run was successful and should be skipped.
            ref = butler.registry.findDataset(taskDef.metadataDatasetName, quantum.dataId,
                                              collections=self.skipExistingIn)
            if ref is not None:
                if butler.datastore.exists(ref):
                    return True

        # Previously we always checked for existing outputs in `butler.run`;
        # now the logic is more complicated, as we only want to skip a
        # quantum whose outputs exist in `self.skipExistingIn`, but pruning
        # should only be done for outputs existing in `butler.run`.

        def findOutputs(collections):
            """Find quantum outputs in the specified collections."""
            existingRefs = []
            missingRefs = []
            for datasetRefs in quantum.outputs.values():
                for datasetRef in datasetRefs:
                    ref = butler.registry.findDataset(datasetRef.datasetType, datasetRef.dataId,
                                                      collections=collections)
                    if ref is not None and butler.datastore.exists(ref):
                        existingRefs.append(ref)
                    else:
                        missingRefs.append(datasetRef)
            return existingRefs, missingRefs

        existingRefs, missingRefs = findOutputs(self.skipExistingIn)
        if self.skipExistingIn:
            if existingRefs and not missingRefs:
                # Everything is already there.
                return True

        # If we are to re-run the quantum then prune datasets that exist in
        # the output run collection, but only if `self.clobberOutputs` is
        # set.
        if existingRefs:
            existingRefs, missingRefs = findOutputs(butler.run)
            if existingRefs and missingRefs:
                _LOG.debug("Partial outputs exist for task %s dataId=%s collection=%s "
                           "existingRefs=%s missingRefs=%s",
                           taskDef, quantum.dataId, butler.run, existingRefs, missingRefs)
                if self.clobberOutputs:
                    # Only prune, do not skip.
                    _LOG.info("Removing partial outputs for task %s: %s", taskDef, existingRefs)
                    butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True)
                    return False
                else:
                    raise RuntimeError(f"Registry inconsistency while checking for existing outputs:"
                                       f" collection={butler.run} existingRefs={existingRefs}"
                                       f" missingRefs={missingRefs}")

        # Need to re-run.
        return False

    def makeTask(self, taskClass, name, config, butler):
        """Make a new task instance.

        Parameters
        ----------
        taskClass : `type`
            Sub-class of `~lsst.pipe.base.PipelineTask`.
        name : `str`
            Name for this task.
        config : `~lsst.pipe.base.PipelineTaskConfig`
            Configuration object for this task.
        butler : `~lsst.daf.butler.Butler`
            Data butler.

        Returns
        -------
        task : `~lsst.pipe.base.PipelineTask`
            Instance of ``taskClass`` type.
        """
        # Call the task factory for that.
        return self.taskFactory.makeTask(taskClass, name, config, None, butler)

    def updatedQuantumInputs(self, quantum, butler, taskDef):
        """Update quantum with extra information; returns a new, updated
        Quantum.

        Some methods may require input DatasetRefs to have a non-None
        ``dataset_id``, but in the case of intermediate datasets it may not
        be filled in during QuantumGraph construction.  This method
        retrieves the missing info from the registry.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.

        Returns
        -------
        update : `~lsst.daf.butler.Quantum`
            Updated Quantum instance.
        """
        anyChanges = False
        updatedInputs = defaultdict(list)
        for key, refsForDatasetType in quantum.inputs.items():
            newRefsForDatasetType = updatedInputs[key]
            for ref in refsForDatasetType:
                if ref.id is None:
                    resolvedRef = butler.registry.findDataset(ref.datasetType, ref.dataId,
                                                              collections=butler.collections)
                    if resolvedRef is None:
                        _LOG.info("No dataset found for %s", ref)
                        continue
                    else:
                        _LOG.debug("Updated dataset ID for %s", ref)
                else:
                    resolvedRef = ref
                # We need to ask datastore if the dataset actually exists
                # because the Registry of a local "execution butler" cannot
                # know this (because we prepopulate it with all of the
                # datasets that might be created).
                if butler.datastore.exists(resolvedRef):
                    newRefsForDatasetType.append(resolvedRef)
            if len(newRefsForDatasetType) != len(refsForDatasetType):
                anyChanges = True
        # If we removed any input datasets, let the task check if it has
        # enough to proceed and/or prune related datasets that it also
        # doesn't need/produce anymore.  It will raise NoWorkFound if it
        # can't run, which we'll let propagate up.  This is exactly what we
        # run during QG generation, because a task shouldn't care whether an
        # input is missing because some previous task didn't produce it, or
        # because it just wasn't there during QG generation.
        updatedInputs = NamedKeyDict[DatasetType, List[DatasetRef]](updatedInputs.items())
        helper = AdjustQuantumHelper(updatedInputs, quantum.outputs)
        if anyChanges:
            helper.adjust_in_place(taskDef.connections, label=taskDef.label, data_id=quantum.dataId)
        return Quantum(taskName=quantum.taskName,
                       taskClass=quantum.taskClass,
                       dataId=quantum.dataId,
                       initInputs=quantum.initInputs,
                       inputs=helper.inputs,
                       outputs=helper.outputs)

    def runQuantum(self, task, quantum, taskDef, butler):
        """Execute task on a single quantum.

        Parameters
        ----------
        task : `~lsst.pipe.base.PipelineTask`
            Task object.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        """
        # Create a butler that operates in the context of a quantum.
        butlerQC = ButlerQuantumContext(butler, quantum)

        # Get the input and output references for the task.
        inputRefs, outputRefs = taskDef.connections.buildDatasetRefs(quantum)

        # Call the task runQuantum() method.  Catch a few known failure
        # modes and translate them into specific exit codes.
        try:
            task.runQuantum(butlerQC, inputRefs, outputRefs)
        except NoWorkFound as err:
            # Not an error, just an early exit.
            _LOG.info("Task '%s' on quantum %s exited early: %s",
                      taskDef.label, quantum.dataId, str(err))
        except RepeatableQuantumError as err:
            if self.exitOnKnownError:
                _LOG.warning("Caught repeatable quantum error for %s (%s):", taskDef, quantum.dataId)
                _LOG.warning(err, exc_info=True)
                sys.exit(err.EXIT_CODE)
            else:
                raise
        except InvalidQuantumError as err:
            _LOG.fatal("Invalid quantum error for %s (%s):", taskDef, quantum.dataId)
            _LOG.fatal(err, exc_info=True)
            sys.exit(err.EXIT_CODE)

    def writeMetadata(self, quantum, metadata, taskDef, butler):
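        """Store task metadata in the butler.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum whose outputs define where the metadata is written.
        metadata : `~lsst.daf.base.PropertySet`
            Full task metadata to store, including the ``quantum`` section.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        """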
        if taskDef.metadataDatasetName is not None:
            # The DatasetRef has to be in the Quantum outputs; we can look
            # it up by name.
            try:
                ref = quantum.outputs[taskDef.metadataDatasetName]
            except LookupError as exc:
                raise InvalidQuantumError(
                    f"Quantum outputs is missing metadata dataset type {taskDef.metadataDatasetName};"
                    " this could happen due to inconsistent options between QuantumGraph generation"
                    " and execution") from exc
            butler.put(metadata, ref[0])

    def writeLogRecords(self, quantum, taskDef, butler, store):
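        """Store captured log records in the butler and detach the handler.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum whose outputs define where the log records are written.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        store : `bool`
            If `False`, the handler is removed and any temporary file is
            cleaned up, but no records are stored in the butler.
        """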
        # If we are logging to an external file we must always try to
        # close it.
        filename = None
        if isinstance(self.log_handler, FileHandler):
            filename = self.log_handler.stream.name
            self.log_handler.close()

        if self.log_handler is not None:
            # Remove the handler so we stop accumulating log messages.
            logging.getLogger().removeHandler(self.log_handler)

        try:
            if store and taskDef.logOutputDatasetName is not None and self.log_handler is not None:
                # The DatasetRef has to be in the Quantum outputs; we can
                # look it up by name.
                try:
                    ref = quantum.outputs[taskDef.logOutputDatasetName]
                except LookupError as exc:
                    raise InvalidQuantumError(
                        f"Quantum outputs is missing log output dataset type"
                        f" {taskDef.logOutputDatasetName}; this could happen due to inconsistent"
                        " options between QuantumGraph generation and execution") from exc

                if isinstance(self.log_handler, ButlerLogRecordHandler):
                    butler.put(self.log_handler.records, ref[0])

                    # Clear the records in case the handler is reused.
                    self.log_handler.records.clear()
                else:
                    assert filename is not None, "Somehow unable to extract filename from file handler"

                    # Need to ingest this file directly into butler.
                    dataset = FileDataset(path=filename, refs=ref[0])
                    try:
                        butler.ingest(dataset, transfer="move")
                        filename = None
                    except NotImplementedError:
                        # Some datastores can't receive files (e.g. the
                        # in-memory datastore used in testing); for those we
                        # store an empty record list just to have a dataset.
                        # An alternative would be to read the file back as a
                        # ButlerLogRecords object and put that.
                        _LOG.info("Log records could not be stored in this butler because the"
                                  " datastore can not ingest files; an empty record list is stored"
                                  " instead.")
                        records = ButlerLogRecords.from_records([])
                        butler.put(records, ref[0])
        finally:
            # Remove the file if it was not ingested.
            if filename is not None:
                try:
                    os.remove(filename)
                except OSError:
                    pass

    def initGlobals(self, quantum, butler):
        """Initialize global state needed for task execution.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Data butler.

        Notes
        -----
        There is an issue with initializing the filters singleton, which is
        done by the instrument; to avoid requiring tasks to do it in
        ``runQuantum()`` we do it here when any dataId has an instrument
        dimension.  Also, for now we only allow a single instrument: we
        verify that all instrument names in all dataIds are identical.

        This will need revision when the filter singleton disappears.
        """
        oneInstrument = None
        for datasetRefs in chain(quantum.inputs.values(), quantum.outputs.values()):
            for datasetRef in datasetRefs:
                dataId = datasetRef.dataId
                instrument = dataId.get("instrument")
                if instrument is not None:
                    if oneInstrument is not None:
                        assert instrument == oneInstrument, \
                            "Currently require that only one instrument is used per graph"
                    else:
                        oneInstrument = instrument
                        Instrument.fromName(instrument, butler.registry)