from __future__ import annotations
23 """Module defining GraphBuilder class and related methods.
__all__ = ['GraphBuilder']
import itertools
import logging
from collections import ChainMap
from dataclasses import dataclass
from typing import Dict, Iterable, Iterator, List
from .connections import iterConnections
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)
_LOG = logging.getLogger(__name__.partition(".")[2])


class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionGraph):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
78 """Construct a dictionary from a flat iterable of `DatasetType` keys.
82 datasetTypes : `iterable` of `DatasetType`
83 DatasetTypes to use as keys for the dict. Values will be empty
85 universe : `DimensionUniverse`
86 Universe of all possible dimensions.
90 dictionary : `_DatasetDict`
91 A new `_DatasetDict` instance.
93 return cls({datasetType: {}
for datasetType
in datasetTypes}, universe=universe)
    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
                   ) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.
        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)
122 """The union of all dimensions used by all dataset types in this
123 dictionary, including implied dependencies (`DimensionGraph`).
128 return base.union(*[datasetType.dimensions
for datasetType
in self.keys()])
131 """Unpack nested single-element `DatasetRef` dicts into a new
132 mapping with `DatasetType` keys and `DatasetRef` values.
134 This method assumes that each nest contains exactly one item, as is the
135 case for all "init" datasets.
139 dictionary : `NamedKeyDict`
140 Dictionary mapping `DatasetType` to `DatasetRef`, with both
141 `DatasetType` instances and string names usable as keys.
143 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
146 return NamedKeyDict({datasetType: getOne(refs)
for datasetType, refs
in self.items()})
149 """Unpack nested multi-element `DatasetRef` dicts into a new
150 mapping with `DatasetType` keys and `set` of `DatasetRef` values.
154 dictionary : `NamedKeyDict`
155 Dictionary mapping `DatasetType` to `DatasetRef`, with both
156 `DatasetType` instances and string names usable as keys.
158 return NamedKeyDict({datasetType: list(refs.values())
for datasetType, refs
in self.items()})
    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)
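

# Illustrative sketch (not part of the original module) of how the scaffolding
# code uses _DatasetDict: keys are DatasetType instances, each value is a plain
# dict mapping data IDs to DatasetRefs, so ``extract`` is a nested lookup.  The
# dataset type, ``universe``, and ``dataId`` below are hypothetical:
#
#     bias = DatasetType("bias", dimensions=universe.extract(["instrument", "detector"]),
#                        storageClass="ExposureF")
#     refsByType = _DatasetDict.fromDatasetTypes([bias], universe=universe)
#     refsByType[bias][dataId] = DatasetRef(bias, dataId)
#     onlyRef, = refsByType.extract(bias, [dataId])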
183 """Helper class aggregating information about a `Quantum`, used when
184 constructing a `QuantumGraph`.
186 See `_PipelineScaffolding` for a top-down description of the full
187 scaffolding data structure.
191 task : _TaskScaffolding
192 Back-reference to the helper object for the `PipelineTask` this quantum
193 represents an execution of.
194 dataId : `DataCoordinate`
195 Data ID for this quantum.
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)
    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction.  Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """
238 """Transform the scaffolding object into a true `Quantum` instance.
243 An actual `Quantum` instance.
245 allInputs = self.
inputs.unpackMultiRefs()
249 config = self.
task.taskDef.config
250 connections = config.connections.ConnectionsClass(config=config)
253 allInputs = connections.adjustQuantum(allInputs)
255 taskName=self.
task.taskDef.taskName,
256 taskClass=self.
task.taskDef.taskClass,
258 initInputs=self.
task.initInputs.unpackSingleRefs(),
259 predictedInputs=allInputs,
260 outputs=self.
outputs.unpackMultiRefs(),
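
    # Illustrative sketch (not part of the original module): ``adjustQuantum``
    # above is a hook on the task's connections class, so a PipelineTask can
    # inspect, prune, or reject inputs before the Quantum is built.  The class
    # name and dimensions below are hypothetical:
    #
    #     class ExampleConnections(PipelineTaskConnections,
    #                              dimensions=("instrument", "visit", "detector")):
    #         def adjustQuantum(self, datasetRefMap):
    #             # Inspect or prune ``datasetRefMap`` here, then delegate.
    #             return super().adjustQuantum(datasetRefMap)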
266 """Helper class aggregating information about a `PipelineTask`, used when
267 constructing a `QuantumGraph`.
269 See `_PipelineScaffolding` for a top-down description of the full
270 scaffolding data structure.
275 Data structure that identifies the task class and its config.
276 parent : `_PipelineScaffolding`
277 The parent data structure that will hold the instance being
279 datasetTypes : `TaskDatasetTypes`
280 Data structure that categorizes the dataset types used by this task.
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize each _DatasetDict as a subset of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.quanta = {}
    def __repr__(self):
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
305 """Data structure that identifies the task class and its config
    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """
345 """Create a `QuantumGraphTaskNodes` instance from the information in
350 nodes : `QuantumGraphTaskNodes`
351 The `QuantumGraph` elements corresponding to this task.
355 quanta=[q.makeQuantum()
for q
in self.
quanta.values()],
356 initInputs=self.
initInputs.unpackSingleRefs(),
363 """A helper data structure that organizes the information involved in
364 constructing a `QuantumGraph` for a `Pipeline`.
368 pipeline : `Pipeline`
369 Sequence of tasks from which a graph is to be constructed. Must
370 have nested task classes already imported.
371 universe : `DimensionUniverse`
372 Universe of all possible dimensions.
376 The scaffolding data structure contains nested data structures for both
377 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
378 data structures are shared between the pipeline-level structure (which
379 aggregates all datasets and categorizes them from the perspective of the
380 complete pipeline) and the individual tasks that use them as inputs and
383 `QuantumGraph` construction proceeds in four steps, with each corresponding
384 to a different `_PipelineScaffolding` method:
386 1. When `_PipelineScaffolding` is constructed, we extract and categorize
387 the DatasetTypes used by the pipeline (delegating to
388 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
389 nested `_TaskScaffolding` and `_DatasetDict` objects.
391 2. In `connectDataIds`, we construct and run the "Big Join Query", which
392 returns related tuples of all dimensions used to identify any regular
393 input, output, and intermediate datasets (not prerequisites). We then
394 iterate over these tuples of related dimensions, identifying the subsets
395 that correspond to distinct data IDs for each task and dataset type,
396 and then create `_QuantumScaffolding` objects.
398 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
399 dataset data IDs previously identified, transforming unresolved
400 DatasetRefs into resolved DatasetRefs where appropriate. We then look
401 up prerequisite datasets for all quanta.
403 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
404 per-task `_QuantumScaffolding` objects.
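
    A rough sketch of how these four steps are driven in sequence (this mirrors
    `GraphBuilder.makeGraph`; the ``registry``, ``collections``, ``run``, and
    ``userQuery`` values here are placeholders) is::

        scaffolding = _PipelineScaffolding(pipeline, registry=registry)
        scaffolding.connectDataIds(registry, collections, userQuery)
        scaffolding.resolveDatasetRefs(registry, collections, run, skipExisting=True)
        graph = scaffolding.makeQuantumGraph()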
    """

    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes; these define the "Big Join Query".
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each task, sharing the _DatasetDict
        # entries with the pipeline-level dictionaries built above.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]
    def __repr__(self):
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the tasks
    in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """
485 """Query for the data IDs that connect nodes in the `QuantumGraph`.
487 This method populates `_TaskScaffolding.dataIds` and
488 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
492 registry : `lsst.daf.butler.Registry`
493 Registry for the data repository; used for all data ID queries.
494 collections : `lsst.daf.butler.CollectionSearch`
495 Object representing the collections to search for input datasets.
496 userQuery : `str`, optional
497 User-provided expression to limit the data IDs processed.
499 _LOG.debug(
"Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        _LOG.debug("Submitting data ID query and processing results.")
        resultIter = registry.queryDimensions(
            self.dimensions,
            datasets=list(self.inputs),
            collections=collections,
            where=userQuery,
        )
        n = -1
        for n, commonDataId in enumerate(resultIter):
            # Create DatasetRefs for all DatasetTypes from this result row,
            # noting that we might have created some already.
            refsForRow = {}
            for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(),
                                                     self.outputs.items()):
                datasetDataId = commonDataId.subset(datasetType.dimensions)
                ref = refs.get(datasetDataId)
                if ref is None:
                    ref = DatasetRef(datasetType, datasetDataId)
                    refs[datasetDataId] = ref
                refsForRow[datasetType.name] = ref
            # Create _QuantumScaffolding objects for all tasks from this
            # result row, noting that we might have created some already.
            for task in self.tasks:
                quantumDataId = commonDataId.subset(task.dimensions)
                quantum = task.quanta.get(quantumDataId)
                if quantum is None:
                    quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                    task.quanta[quantumDataId] = quantum
                # Whether this is a new quantum or an existing one, associate
                # the DatasetRefs for this row with it.
                for datasetType in task.inputs:
                    ref = refsForRow[datasetType.name]
                    quantum.inputs[datasetType.name][ref.dataId] = ref
                for datasetType in task.outputs:
                    ref = refsForRow[datasetType.name]
                    quantum.outputs[datasetType.name][ref.dataId] = ref
        if n >= 0:
            _LOG.debug("Finished processing %d rows from data ID query.", n + 1)
        else:
            _LOG.debug("Received no rows from data ID query.")
562 """Perform follow up queries for each dataset data ID produced in
565 This method populates `_DatasetScaffolding.refs` (except for those in
570 registry : `lsst.daf.butler.Registry`
571 Registry for the data repository; used for all data ID queries.
572 collections : `lsst.daf.butler.CollectionSearch`
573 Object representing the collections to search for input datasets.
574 run : `str`, optional
575 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
576 output datasets, if it already exists.
577 skipExisting : `bool`, optional
578 If `True` (default), a Quantum is not created if all its outputs
579 already exist in ``run``. Ignored if ``run`` is `None`.
584 Raised if an output dataset already exists in the output run
585 and ``skipExisting`` is `False`. The case where some but not all
586 of a quantum's outputs are present and ``skipExisting`` is `True`
587 cannot be identified at this stage, and is handled by `fillQuanta`
        # Look up (init) intermediate and output datasets in the output RUN
        # collection, if one was provided.
        if run is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                for dataId, unresolvedRef in refs.items():
                    ref = registry.findDataset(datasetType=datasetType, dataId=dataId, collections=run)
                    if ref is not None:
                        if skipExisting:
                            refs[dataId] = ref
                        else:
                            raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                    f"output RUN collection '{run}' with data ID {dataId}.")
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            for dataId in refs:
                refs[dataId] = registry.findDataset(datasetType, dataId=dataId, collections=collections)
            if any(ref is None for ref in refs.values()):
                raise RuntimeError(
                    f"One or more dataset of type '{datasetType.name}' was "
                    "present in a previous query, but could not be found now. "
                    "This is either a logic bug in QuantumGraph generation, "
                    "or the input collections have been modified since "
                    "QuantumGraph generation began."
                )
        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta), task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsToSkip = []
            for quantum in task.quanta.values():
                # Process outputs to see whether this quantum should be skipped
                # because all of its outputs already exist.
                if run is not None and skipExisting:
                    resolvedRefs = []
                    unresolvedRefs = []
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if unresolvedRefs:
                            raise OutputExistsError(
                                f"Quantum {quantum.dataId} of task with label "
                                f"'{task.taskDef.label}' has some outputs that exist ({resolvedRefs}) "
                                f"and others that don't ({unresolvedRefs})."
                            )
                        # All outputs already exist; mark this quantum to be
                        # pruned below.
                        dataIdsToSkip.append(quantum.dataId)
                        continue
                # Update the input DatasetRefs to the resolved ones we already
                # have.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we queried
                # for originally, because we want to permit those data IDs to
                # differ across quanta and dataset types.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        # The task's connections define a custom lookup
                        # function for this prerequisite input.
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    else:
                        refs = list(
                            registry.queryDatasets(
                                datasetType,
                                collections=collections,
                                dataId=quantum.dataId,
                            )
                        )
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsToSkip:
                _LOG.debug("Pruning %d quanta for task with label '%s' because all of their "
                           "outputs exist.", len(dataIdsToSkip), task.taskDef.label)
                for dataId in dataIdsToSkip:
                    del task.quanta[dataId]
704 """Create a `QuantumGraph` from the quanta already present in
705 the scaffolding data structure.
709 graph : `QuantumGraph`
710 The full `QuantumGraph`.
713 graph.initInputs = self.initInputs.unpackSingleRefs()
714 graph.initOutputs = self.initOutputs.unpackSingleRefs()
715 graph.initIntermediates = self.initIntermediates.unpackSingleRefs()
725 """Base class for exceptions generated by graph builder.
730 class OutputExistsError(GraphBuilderError):
731 """Exception generated when output datasets already exist.
737 """Exception generated when a prerequisite dataset does not exist.
743 """GraphBuilder class is responsible for building task execution graph from
748 registry : `~lsst.daf.butler.Registry`
749 Data butler instance.
750 skipExisting : `bool`, optional
751 If `True` (default), a Quantum is not created if all its outputs
    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`, optional
            String which defines user-defined selection for the registry;
            should be empty or `None` if there are no restrictions on data
            selection.

        Returns
        -------
        graph : `QuantumGraph`

        Raises
        ------
        UserExpressionError
            Raised when user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)

        scaffolding.connectDataIds(self.registry, collections, userQuery)
        scaffolding.resolveDatasetRefs(self.registry, collections, run,
                                       skipExisting=self.skipExisting)
        return scaffolding.makeQuantumGraph()
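

# A minimal, illustrative usage sketch (not part of the original module).  The
# repository path, collection names, run name, and query expression below are
# placeholders, and ``pipeline`` is assumed to be a `Pipeline` whose task
# classes can be imported:
#
#     from lsst.daf.butler import Butler, CollectionSearch
#
#     butler = Butler("/path/to/repo", writeable=False)
#     builder = GraphBuilder(registry=butler.registry, skipExisting=True)
#     graph = builder.makeGraph(
#         pipeline,
#         collections=CollectionSearch.fromExpression(["HSC/defaults"]),
#         run="u/someone/processing-run",
#         userQuery="visit = 903334 AND detector = 22",
#     )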