# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

__all__ = ["SingleQuantumExecutor"]

# -------------------------------
#  Imports of standard modules --
# -------------------------------
import logging
import os
import shutil
import sys
import tempfile
import time
from contextlib import contextmanager
from collections import defaultdict
from itertools import chain
from logging import FileHandler
from typing import List

# -----------------------------
#  Imports for other modules --
# -----------------------------
from .quantumGraphExecutor import QuantumExecutor
from lsst.daf.base import PropertyList, PropertySet
from lsst.obs.base import Instrument
from lsst.pipe.base import (
    AdjustQuantumHelper,
    ButlerQuantumContext,
    InvalidQuantumError,
    NoWorkFound,
    RepeatableQuantumError,
    logInfo,
)
from lsst.daf.butler import (
    DatasetRef,
    DatasetType,
    FileDataset,
    NamedKeyDict,
    Quantum,
)
from lsst.daf.butler.core.logging import (
    ButlerLogRecordHandler,
    ButlerLogRecords,
    ButlerMDC,
    JsonLogFormatter,
)

# ----------------------------------
#  Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


class _LogCaptureFlag:
    """Simple flag to enable/disable log-to-butler saving.
    """
    store: bool = True


class SingleQuantumExecutor(QuantumExecutor):
    """Executor class which runs one Quantum at a time.

    Parameters
    ----------
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Instance of a task factory.
    skipExistingIn : `list` [ `str` ], optional
        Accepts a list of collections; if all Quantum outputs already exist
        in the specified list of collections then that Quantum will not be
        rerun.
    clobberOutputs : `bool`, optional
        If `True`, then existing outputs in the output run collection will be
        overwritten.  If ``skipExistingIn`` is defined, only outputs from
        failed quanta will be overwritten.
    enableLsstDebug : `bool`, optional
        Enable debugging with the ``lsstDebug`` facility for a task.
    exitOnKnownError : `bool`, optional
        If `True`, call `sys.exit` with the appropriate exit code for special
        known exceptions, after printing a traceback, instead of letting the
        exception propagate up to the caller.  This is always the behavior
        for `InvalidQuantumError`.
    """

    stream_json_logs = True
    """If `True`, each log record is written to a temporary file and ingested
    when the quantum completes.  If `False`, the records are accumulated in
    memory and stored in butler on quantum completion."""
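
    # Construction sketch (illustrative, not part of the original module):
    # how the flags documented above combine.  ``taskFactory`` is supplied by
    # the caller and the collection name is hypothetical.
    #
    #     executor = SingleQuantumExecutor(taskFactory,
    #                                      skipExistingIn=["u/demo/run"],
    #                                      clobberOutputs=True,
    #                                      exitOnKnownError=True)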

    def __init__(self, taskFactory, skipExistingIn=None, clobberOutputs=False, enableLsstDebug=False,
                 exitOnKnownError=False):
        self.taskFactory = taskFactory
        self.skipExistingIn = skipExistingIn
        self.enableLsstDebug = enableLsstDebug
        self.clobberOutputs = clobberOutputs
        self.exitOnKnownError = exitOnKnownError
        self.log_handler = None

    def execute(self, taskDef, quantum, butler):
        # Docstring inherited from QuantumExecutor.execute
        startTime = time.time()

        with self.captureLogging(taskDef, quantum, butler) as captureLog:

            # Save detailed resource usage before task start to metadata.
            quantumMetadata = PropertyList()
            logInfo(None, "prep", metadata=quantumMetadata)

            taskClass, label, config = taskDef.taskClass, taskDef.label, taskDef.config

            # Check whether to skip or delete old outputs.  If this returns
            # True or raises an exception, do not try to store logs, as they
            # may already be in butler.
            captureLog.store = False
            if self.checkExistingOutputs(quantum, butler, taskDef):
                _LOG.info("Skipping already-successful quantum for label=%s dataId=%s.", label,
                          quantum.dataId)
                return
            captureLog.store = True

            try:
                quantum = self.updatedQuantumInputs(quantum, butler, taskDef)
            except NoWorkFound as exc:
                _LOG.info("Nothing to do for task '%s' on quantum %s; saving metadata and skipping: %s",
                          taskDef.label, quantum.dataId, str(exc))
                # Make empty metadata that looks something like what a
                # do-nothing task would write (but we don't bother with empty
                # nested PropertySets for subtasks).  This is slightly
                # duplicative with logic in pipe_base that we can't easily
                # call from here; we'll fix this on DM-29761.
                logInfo(None, "end", metadata=quantumMetadata)
                fullMetadata = PropertySet()
                fullMetadata[taskDef.label] = PropertyList()
                fullMetadata["quantum"] = quantumMetadata
                self.writeMetadata(quantum, fullMetadata, taskDef, butler)
                return

            # Enable lsstDebug debugging.
            if self.enableLsstDebug:
                try:
                    _LOG.debug("Will try to import debug.py")
                    import debug  # noqa:F401
                except ImportError:
                    _LOG.warning("No 'debug' module found.")

            # Initialize global state.
            self.initGlobals(quantum, butler)

            # Ensure that we are executing a frozen config.
            config.freeze()
            logInfo(None, "init", metadata=quantumMetadata)
            task = self.makeTask(taskClass, label, config, butler)
            logInfo(None, "start", metadata=quantumMetadata)
            try:
                self.runQuantum(task, quantum, taskDef, butler)
            except Exception:
                _LOG.exception("Execution of task '%s' on quantum %s failed",
                               taskDef.label, quantum.dataId)
                raise
            logInfo(None, "end", metadata=quantumMetadata)
            fullMetadata = task.getFullMetadata()
            fullMetadata["quantum"] = quantumMetadata
            self.writeMetadata(quantum, fullMetadata, taskDef, butler)
        stopTime = time.time()
        _LOG.info("Execution of task '%s' on quantum %s took %.3f seconds",
                  taskDef.label, quantum.dataId, stopTime - startTime)

    @contextmanager
    def captureLogging(self, taskDef, quantum, butler):
        """Configure logging system to capture logs for execution of this
        task.

        Parameters
        ----------
        taskDef : `lsst.pipe.base.TaskDef`
            The task definition.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Butler to write logs to.

        Notes
        -----
        Expected to be used as a context manager to ensure that logging
        records are inserted into the butler once the quantum has been
        executed:

        .. code-block:: py

           with self.captureLogging(taskDef, quantum, butler):
               # Run quantum and capture logs.

        This method can also set up logging to attach task- or
        quantum-specific information to log messages.  Potentially this can
        take into account some info from task configuration as well.
        """
        # Add a handler to the root logger to capture execution log output.
        # How does it get removed reliably?
        tmpdir = None
        if taskDef.logOutputDatasetName is not None:
            # Either accumulate into ButlerLogRecords or stream
            # JSON records to file and ingest that.
            if self.stream_json_logs:
                # Create the log file in a temporary directory rather than
                # creating a temporary file.  This is necessary because
                # temporary files are created with restrictive permissions
                # and during file ingest these permissions persist in the
                # datastore.  Using a temp directory allows us to create
                # a file with umask default permissions.
                tmpdir = tempfile.mkdtemp(prefix="butler-temp-logs-")

                # Construct a file to receive the log records and "touch" it.
                log_file = os.path.join(tmpdir, f"butler-log-{taskDef.label}.json")
                with open(log_file, "w"):
                    pass
                self.log_handler = FileHandler(log_file)
                self.log_handler.setFormatter(JsonLogFormatter())
            else:
                self.log_handler = ButlerLogRecordHandler()

            logging.getLogger().addHandler(self.log_handler)

        # Include the quantum dataId and task label in the MDC.
        label = taskDef.label
        if quantum.dataId:
            label += f":{quantum.dataId}"

        ctx = _LogCaptureFlag()
        try:
            with ButlerMDC.set_mdc({"LABEL": label, "RUN": butler.run}):
                yield ctx
        finally:
            # Ensure that the logs are stored in butler.
            self.writeLogRecords(quantum, taskDef, butler, ctx.store)
            if tmpdir:
                shutil.rmtree(tmpdir, ignore_errors=True)
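
    # Capture-mode sketch (illustrative): the in-memory path above keeps
    # records in a ``ButlerLogRecords`` collection on the handler instead of
    # writing a JSON file.
    #
    #     handler = ButlerLogRecordHandler()
    #     logging.getLogger().addHandler(handler)
    #     logging.getLogger("lsst.someTask").info("captured")  # hypothetical logger name
    #     assert len(handler.records) == 1  # record captured, not printed
    #     logging.getLogger().removeHandler(handler)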

    def checkExistingOutputs(self, quantum, butler, taskDef):
        """Decide whether this quantum needs to be executed.

        If only partial outputs exist then they are removed if
        ``clobberOutputs`` is `True`, otherwise an exception is raised.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Quantum to check for existing outputs.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.

        Returns
        -------
        exist : `bool`
            `True` if ``self.skipExistingIn`` is defined, and a previous
            execution of this quantum appears to have completed successfully
            (either because metadata was written or all datasets were
            written).  `False` otherwise.

        Raises
        ------
        RuntimeError
            Raised if some outputs exist while others do not.
        """
        if self.skipExistingIn and taskDef.metadataDatasetName is not None:
            # Metadata output exists; this is sufficient to assume that the
            # previous run was successful and should be skipped.
            ref = butler.registry.findDataset(taskDef.metadataDatasetName, quantum.dataId,
                                              collections=self.skipExistingIn)
            if ref is not None:
                if butler.datastore.exists(ref):
                    return True

        # Previously we always checked for existing outputs in `butler.run`;
        # now the logic is more complicated as we only want to skip a quantum
        # whose outputs exist in `self.skipExistingIn`, but pruning should
        # only be done for outputs existing in `butler.run`.

        def findOutputs(collections):
            """Find quantum outputs in specified collections.
            """
            existingRefs = []
            missingRefs = []
            for datasetRefs in quantum.outputs.values():
                for datasetRef in datasetRefs:
                    ref = butler.registry.findDataset(datasetRef.datasetType, datasetRef.dataId,
                                                      collections=collections)
                    if ref is not None and butler.datastore.exists(ref):
                        existingRefs.append(ref)
                    else:
                        missingRefs.append(datasetRef)
            return existingRefs, missingRefs

        existingRefs, missingRefs = findOutputs(self.skipExistingIn)
        if self.skipExistingIn:
            if existingRefs and not missingRefs:
                # Everything is already there.
                return True

        # If we are to re-run the quantum then prune datasets that exist in
        # the output run collection, but only if `self.clobberOutputs` is set.
        if existingRefs:
            existingRefs, missingRefs = findOutputs(butler.run)
            if existingRefs and missingRefs:
                _LOG.debug("Partial outputs exist for task %s dataId=%s collection=%s "
                           "existingRefs=%s missingRefs=%s",
                           taskDef, quantum.dataId, butler.run, existingRefs, missingRefs)
                if self.clobberOutputs:
                    # Prune the partial outputs so the quantum can be re-run.
                    _LOG.info("Removing partial outputs for task %s: %s", taskDef, existingRefs)
                    butler.pruneDatasets(existingRefs, disassociate=True, unstore=True, purge=True)
                    return False
                else:
                    raise RuntimeError(f"Registry inconsistency while checking for existing outputs:"
                                       f" collection={butler.run} existingRefs={existingRefs}"
                                       f" missingRefs={missingRefs}")

        # Need to re-run.
        return False
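
    # Outcome summary for checkExistingOutputs (illustrative recap of the
    # logic above):
    #
    #     metadata found in skipExistingIn ................ True (skip quantum)
    #     all outputs found in skipExistingIn ............. True (skip quantum)
    #     partial outputs in butler.run, clobberOutputs ... prune them, False
    #     partial outputs in butler.run, no clobber ....... RuntimeError
    #     otherwise ....................................... False (execute)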

    def makeTask(self, taskClass, name, config, butler):
        """Make new task instance.

        Parameters
        ----------
        taskClass : `type`
            Sub-class of `~lsst.pipe.base.PipelineTask`.
        name : `str`
            Name for this task.
        config : `~lsst.pipe.base.PipelineTaskConfig`
            Configuration object for this task.
        butler : `~lsst.daf.butler.Butler`
            Data butler.

        Returns
        -------
        task : `~lsst.pipe.base.PipelineTask`
            Instance of ``taskClass`` type.
        """
        # Delegate to the task factory.
        return self.taskFactory.makeTask(taskClass, name, config, None, butler)

    def updatedQuantumInputs(self, quantum, butler, taskDef):
        """Update quantum with extra information and return a new, updated
        Quantum.

        Some methods may require input DatasetRefs to have a non-None
        ``dataset_id``, but for intermediate datasets it may not be filled
        in during QuantumGraph construction.  This method retrieves the
        missing info from registry.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.

        Returns
        -------
        update : `~lsst.daf.butler.Quantum`
            Updated Quantum instance.
        """
        anyChanges = False
        updatedInputs = defaultdict(list)
        for key, refsForDatasetType in quantum.inputs.items():
            newRefsForDatasetType = updatedInputs[key]
            for ref in refsForDatasetType:
                if ref.id is None:
                    resolvedRef = butler.registry.findDataset(ref.datasetType, ref.dataId,
                                                              collections=butler.collections)
                    if resolvedRef is None:
                        _LOG.info("No dataset found for %s", ref)
                        continue
                    else:
                        _LOG.debug("Updated dataset ID for %s", ref)
                else:
                    resolvedRef = ref
                # We need to ask datastore if the dataset actually exists
                # because the Registry of a local "execution butler" cannot
                # know this (because we prepopulate it with all of the
                # datasets that might be created).
                if butler.datastore.exists(resolvedRef):
                    newRefsForDatasetType.append(resolvedRef)
            if len(newRefsForDatasetType) != len(refsForDatasetType):
                anyChanges = True
        # If we removed any input datasets, let the task check if it has
        # enough to proceed and/or prune related datasets that it also
        # doesn't need/produce anymore.  It will raise NoWorkFound if it
        # can't run, which we'll let propagate up.  This is exactly what we
        # run during QG generation, because a task shouldn't care whether an
        # input is missing because some previous task didn't produce it, or
        # because it just wasn't there during QG generation.
        updatedInputs = NamedKeyDict[DatasetType, List[DatasetRef]](updatedInputs.items())
        helper = AdjustQuantumHelper(updatedInputs, quantum.outputs)
        if anyChanges:
            helper.adjust_in_place(taskDef.connections, label=taskDef.label, data_id=quantum.dataId)
        return Quantum(taskName=quantum.taskName,
                       taskClass=quantum.taskClass,
                       dataId=quantum.dataId,
                       initInputs=quantum.initInputs,
                       inputs=helper.inputs,
                       outputs=helper.outputs,
                       )
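
    # Resolution sketch (illustrative): this mirrors the loop above for a
    # single unresolved intermediate ref; the variable names are hypothetical.
    #
    #     ref = someQuantum.inputs[someDatasetType][0]
    #     if ref.id is None:
    #         ref = butler.registry.findDataset(ref.datasetType, ref.dataId,
    #                                           collections=butler.collections)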

    def runQuantum(self, task, quantum, taskDef, butler):
        """Execute task on a single quantum.

        Parameters
        ----------
        task : `~lsst.pipe.base.PipelineTask`
            Task object.
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        """
        # Create a butler that operates in the context of a quantum.
        butlerQC = ButlerQuantumContext(butler, quantum)

        # Get the input and output references for the task.
        inputRefs, outputRefs = taskDef.connections.buildDatasetRefs(quantum)

        # Call task runQuantum() method.  Catch a few known failure modes
        # and translate them into specific exit codes.
        try:
            task.runQuantum(butlerQC, inputRefs, outputRefs)
        except NoWorkFound as err:
            # Not an error, just an early exit.
            _LOG.info("Task '%s' on quantum %s exited early: %s",
                      taskDef.label, quantum.dataId, str(err))
        except RepeatableQuantumError as err:
            if self.exitOnKnownError:
                _LOG.warning("Caught repeatable quantum error for %s (%s):", taskDef, quantum.dataId)
                _LOG.warning(err, exc_info=True)
                sys.exit(err.EXIT_CODE)
            else:
                raise
        except InvalidQuantumError as err:
            _LOG.fatal("Invalid quantum error for %s (%s):", taskDef, quantum.dataId)
            _LOG.fatal(err, exc_info=True)
            sys.exit(err.EXIT_CODE)
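
    # Failure-mode sketch (illustrative): a PipelineTask signals the outcomes
    # handled above from its own runQuantum(); ``MyTask`` is hypothetical.
    #
    #     class MyTask(PipelineTask):
    #         def runQuantum(self, butlerQC, inputRefs, outputRefs):
    #             inputs = butlerQC.get(inputRefs)
    #             if not inputs:
    #                 raise NoWorkFound("Nothing to process.")  # early exit, not an error
    #             ...  # do work; raise RepeatableQuantumError for known transient failures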

    def writeMetadata(self, quantum, metadata, taskDef, butler):
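        """Store task metadata in the butler.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Executed Quantum; its outputs must include the metadata dataset.
        metadata : `lsst.daf.base.PropertySet`
            Full task metadata, including the ``quantum`` section.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        """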
        if taskDef.metadataDatasetName is not None:
            # DatasetRef has to be in the Quantum outputs; look it up by name.
            try:
                ref = quantum.outputs[taskDef.metadataDatasetName]
            except LookupError as exc:
                raise InvalidQuantumError(
                    f"Quantum outputs are missing metadata dataset type {taskDef.metadataDatasetName};"
                    f" this could happen due to inconsistent options between QuantumGraph generation"
                    f" and execution") from exc
            butler.put(metadata, ref[0])

    def writeLogRecords(self, quantum, taskDef, butler, store):
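        """Store captured log records in the butler, if requested.

        The log handler is always closed and removed from the root logger;
        records are stored only when ``store`` is `True` and the task defines
        a log output dataset.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Executed Quantum; its outputs must include the log dataset.
        taskDef : `~lsst.pipe.base.TaskDef`
            Task definition structure.
        butler : `~lsst.daf.butler.Butler`
            Data butler.
        store : `bool`
            If `True`, store the records in butler.
        """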
        # If we are logging to an external file we must always try to
        # close it.
        filename = None
        if isinstance(self.log_handler, FileHandler):
            filename = self.log_handler.stream.name
            self.log_handler.close()

        if self.log_handler is not None:
            # Remove the handler so we stop accumulating log messages.
            logging.getLogger().removeHandler(self.log_handler)

        try:
            if store and taskDef.logOutputDatasetName is not None and self.log_handler is not None:
                # DatasetRef has to be in the Quantum outputs; look it up
                # by name.
                try:
                    ref = quantum.outputs[taskDef.logOutputDatasetName]
                except LookupError as exc:
                    raise InvalidQuantumError(
                        f"Quantum outputs are missing log output dataset type"
                        f" {taskDef.logOutputDatasetName}; this could happen due to inconsistent"
                        f" options between QuantumGraph generation and execution") from exc

                if isinstance(self.log_handler, ButlerLogRecordHandler):
                    butler.put(self.log_handler.records, ref[0])

                    # Clear the records in case the handler is reused.
                    self.log_handler.records.clear()
                else:
                    assert filename is not None, "Somehow unable to extract filename from file handler"

                    # Need to ingest this file directly into butler.
                    dataset = FileDataset(path=filename, refs=ref[0])
                    try:
                        butler.ingest(dataset, transfer="move")
                        filename = None
                    except NotImplementedError:
                        # Some datastores can't receive files (e.g. in-memory
                        # datastore when testing); we store an empty list for
                        # those just to have a dataset.  The alternative is to
                        # read the file as a ButlerLogRecords object and put
                        # it.
                        _LOG.info("Log records could not be stored in this butler because the"
                                  " datastore cannot ingest files; an empty record list is stored"
                                  " instead.")
                        records = ButlerLogRecords.from_records([])
                        butler.put(records, ref[0])
        finally:
            # Remove the file if it was not ingested.
            if filename is not None:
                try:
                    os.remove(filename)
                except OSError:
                    pass

    def initGlobals(self, quantum, butler):
        """Initialize global state needed for task execution.

        Parameters
        ----------
        quantum : `~lsst.daf.butler.Quantum`
            Single Quantum instance.
        butler : `~lsst.daf.butler.Butler`
            Data butler.

        Notes
        -----
        There is an issue with initializing the filters singleton, which is
        done by instrument; to avoid requiring tasks to do it in runQuantum()
        we do it here when any dataId has an instrument dimension.  Also, for
        now we only allow a single instrument: we verify that the instrument
        names in all dataIds are identical.

        This will need revision when the filter singleton disappears.
        """
        oneInstrument = None
        for datasetRefs in chain(quantum.inputs.values(), quantum.outputs.values()):
            for datasetRef in datasetRefs:
                dataId = datasetRef.dataId
                instrument = dataId.get("instrument")
                if instrument is not None:
                    if oneInstrument is not None:
                        assert instrument == oneInstrument, \
                            "Currently require that only one instrument is used per graph"
                    else:
                        oneInstrument = instrument
                        Instrument.fromName(instrument, butler.registry)
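

# Illustrative driver sketch (not part of the original module): wiring the
# executor into a loop over QuantumGraph nodes.  ``task_factory`` and
# ``qgraph`` are hypothetical, caller-supplied objects; QuantumGraph nodes
# are assumed to carry ``taskDef`` and ``quantum`` attributes.
def _example_run_graph(task_factory, qgraph, butler):
    executor = SingleQuantumExecutor(task_factory, skipExistingIn=[butler.run],
                                     clobberOutputs=True)
    for node in qgraph:
        # Each node pairs a TaskDef with a Quantum, as execute() expects.
        executor.execute(node.taskDef, node.quantum, butler)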