from __future__ import annotations
23 """Module defining GraphBuilder class and related methods.

__all__ = ['GraphBuilder']

import itertools
import logging

from collections import ChainMap
from dataclasses import dataclass
from typing import Set, List, Dict, Optional, Iterable

from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    ExpandedDataCoordinate,
    Quantum,
)
from lsst.daf.butler.core.utils import NamedKeyDict

_LOG = logging.getLogger(__name__.partition(".")[2])
61 """Helper class aggregating information about a `DatasetType`, used when
62 constructing a `QuantumGraph`.
64 `_DatasetScaffolding` does not hold the `DatasetType` instance itself
65 because it is usually used as the value type in `_DatasetScaffoldingDict`,
66 which uses `DatasetType` instances as keys.
68 See `_PipelineScaffolding` for a top-down description of the full
69 scaffolding data structure.
73 dimensions : `DimensionGraph`
74 Dimensions of the `DatasetType`.
    def __init__(self, dimensions: DimensionGraph):
        self.dimensions = dimensions
        self.producer = None
        self.consumers = {}
        self.dataIds = set()
        self.refs = []

    __slots__ = ("dimensions", "producer", "consumers", "dataIds", "refs")

    def __repr__(self):
        return f"_DatasetScaffolding(dimensions={self.dimensions}, ...)"

    dimensions: DimensionGraph
    """The dimensions of the dataset type (`DimensionGraph`).

    Set during `_PipelineScaffolding` construction.
    """

    producer: Optional[_TaskScaffolding]
    """The scaffolding object for the Task that produces this dataset.

    Set during `_PipelineScaffolding` construction.
    """

    consumers: Dict[str, _TaskScaffolding]
    """The scaffolding objects for the Tasks that consume this dataset,
    keyed by their label in the `Pipeline`.

    Set during `_PipelineScaffolding` construction.
    """

    dataIds: Set[ExpandedDataCoordinate]
    """Data IDs for all instances of this dataset type in the graph.

    Populated after construction by `_PipelineScaffolding.fillDataIds`.
    """

    refs: List[DatasetRef]
    """References for all instances of this dataset type in the graph.

    Populated after construction by `_PipelineScaffolding.fillDatasetRefs`.
    """
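
# Rough shape of the scaffolding, for orientation (the names below are purely
# illustrative): there is exactly one _DatasetScaffolding per DatasetType,
# shared between the pipeline-level dictionaries and the per-task dictionaries
# that reference it, so e.g.
#
#     scaffolding.outputs[someDatasetType] is task.outputs[someDatasetType]
#
# holds whenever ``task`` produces ``someDatasetType``.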
123 """Custom dictionary that maps `DatasetType` to `_DatasetScaffolding`.
125 See `_PipelineScaffolding` for a top-down description of the full
126 scaffolding data structure.
131 Positional arguments are forwarded to the `dict` constructor.
132 universe : `DimensionUniverse`
133 Universe of all possible dimensions.
    def __init__(self, *args, universe: DimensionUniverse):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType],
                         universe: DimensionUniverse) -> _DatasetScaffoldingDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict. Values will be
            constructed from the dimensions of the keys.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetScaffoldingDict`
            A new dictionary instance.
        """
        return cls(((datasetType, _DatasetScaffolding(datasetType.dimensions))
                    for datasetType in datasetTypes),
                   universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetScaffoldingDict,
                   *rest) -> _DatasetScaffoldingDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict. Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetScaffoldingDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetScaffoldingDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls(((datasetType, combined[datasetType]) for datasetType in datasetTypes),
                   universe=first.universe)
188 """The union of all dimensions used by all dataset types in this
189 dictionary, including implied dependencies (`DimensionGraph`).
194 return base.union(*[scaffolding.dimensions
for scaffolding
in self.values()])
197 """Unpack nested single-element `DatasetRef` lists into a new
200 This method assumes that each `_DatasetScaffolding.refs` list contains
201 exactly one `DatasetRef`, as is the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
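
        Examples
        --------
        A minimal sketch of the intended use (``holder`` stands in for an
        instance whose values each hold exactly one ref; the dataset type
        name is illustrative)::

            refsByType = holder.unpackRefs()
            ref = refsByType["someInitDatasetType"]  # a DatasetRef, not a list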
        """
        return NamedKeyDict((datasetType, scaffolding.refs[0])
                            for datasetType, scaffolding in self.items())
214 """Helper class aggregating information about a `PipelineTask`, used when
215 constructing a `QuantumGraph`.
217 See `_PipelineScaffolding` for a top-down description of the full
218 scaffolding data structure.
223 Data structure that identifies the task class and its config.
224 parent : `_PipelineScaffolding`
225 The parent data structure that will hold the instance being
227 datasetTypes : `TaskDatasetTypes`
228 Data structure that categorizes the dataset types used by this task.
233 Raised if the task's dimensions are not a subset of the union of the
234 pipeline's dataset dimensions.
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        if not self.dimensions.issubset(parent.dimensions):
            raise GraphBuilderError(f"Task with label '{taskDef.label}' has dimensions "
                                    f"{self.dimensions} that are not a subset of "
                                    f"the pipeline dimensions {parent.dimensions}.")
        # Set up _DatasetScaffoldingDict objects as subsets of the pipeline's,
        # so each dataset type's scaffolding node is shared with the pipeline.
        self.initInputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initInputs,
                                                             parent.initInputs, parent.initIntermediates)
        self.initOutputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initOutputs,
                                                              parent.initIntermediates, parent.initOutputs)
        self.inputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.inputs,
                                                         parent.inputs, parent.intermediates)
        self.outputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.outputs,
                                                          parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetScaffoldingDict.fromSubset(datasetTypes.prerequisites,
                                                                parent.prerequisites)
        self.dataIds = set()
        self.quanta = []
        # Add backreferences to this task from the datasets it uses.
        for dataset in itertools.chain(self.initInputs.values(), self.inputs.values(),
                                       self.prerequisites.values()):
            dataset.consumers[self.taskDef.label] = self
        for dataset in itertools.chain(self.initOutputs.values(), self.outputs.values()):
            assert dataset.producer is None
            dataset.producer = self

    def __repr__(self):
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
274 """Data structure that identifies the task class and its config

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetScaffoldingDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetScaffoldingDict`).
    """

    initOutputs: _DatasetScaffoldingDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetScaffoldingDict`).
    """

    inputs: _DatasetScaffoldingDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetScaffoldingDict`).
    """

    outputs: _DatasetScaffoldingDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetScaffoldingDict`).
    """

    prerequisites: _DatasetScaffoldingDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetScaffoldingDict`).
    """

    dataIds: Set[ExpandedDataCoordinate]
    """Data IDs for all quanta for this task in the graph (`set` of
    `ExpandedDataCoordinate`).

    Populated after construction by `_PipelineScaffolding.fillDataIds`.
    """

    quanta: List[Quantum]
    """All quanta for this task in the graph (`list` of `Quantum`).

    Populated after construction by `_PipelineScaffolding.fillQuanta`.
    """

    def addQuantum(self, quantum: Quantum):
        config = self.taskDef.config
        connectionClass = config.connections.ConnectionsClass
        connectionInstance = connectionClass(config=config)
        # This will raise if one of the check conditions is not met, which is
        # the intended behavior.
        result = connectionInstance.adjustQuantum(quantum.predictedInputs)
        quantum._predictedInputs = NamedKeyDict(result)
        # If we get this far, the quantum is valid; record it.
        self.quanta.append(quantum)
334 """Create a `QuantumGraphTaskNodes` instance from the information in
339 nodes : `QuantumGraphTaskNodes`
340 The `QuantumGraph` elements corresponding to this task.
352 """A helper data structure that organizes the information involved in
353 constructing a `QuantumGraph` for a `Pipeline`.
357 pipeline : `Pipeline`
358 Sequence of tasks from which a graph is to be constructed. Must
359 have nested task classes already imported.
360 universe : `DimensionUniverse`
361 Universe of all possible dimensions.
366 Raised if the task's dimensions are not a subset of the union of the
367 pipeline's dataset dimensions.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetScaffolding`), with the
    latter held by `_DatasetScaffoldingDict`.  The dataset data structures are
    shared between the pipeline-level structure (which aggregates all datasets
    and categorizes them from the perspective of the complete pipeline) and the
    individual tasks that use them as inputs and outputs.

    `QuantumGraph` construction proceeds in five steps, with each corresponding
    to a different `_PipelineScaffolding` method:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetScaffolding` objects.

    2. In `fillDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites).  We then
       iterate over these tuples of related dimensions, identifying the
       subsets that correspond to distinct data IDs for each task and dataset
       type.

    3. In `fillDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, populating the
       `_DatasetScaffolding.refs` lists - except for those for prerequisite
       datasets, which cannot be resolved until distinct quanta are
       identified.

    4. In `fillQuanta`, we extract subsets from the lists of `DatasetRef` into
       the inputs and outputs for each `Quantum` and search for prerequisite
       datasets, populating `_TaskScaffolding.quanta`.

    5. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task quanta identified in the previous step.  A typical driver
       sequence is sketched below.
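
    Putting the five steps together (a sketch of what `GraphBuilder.makeGraph`
    does; ``registry``, ``collections``, ``run``, and ``userQuery`` are
    assumed to be supplied by the caller)::

        scaffolding = _PipelineScaffolding(pipeline, registry=registry)
        scaffolding.fillDataIds(registry, collections, userQuery)
        scaffolding.fillDatasetRefs(registry, collections, run, skipExisting=True)
        scaffolding.fillQuanta(registry, collections, skipExisting=True)
        graph = scaffolding.makeQuantumGraph()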
    """

    def __init__(self, pipeline, *, registry):
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetScaffoldingDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                                         universe=registry.dimensions))
        # Aggregate all dimensions for all regular (non-init, non-prerequisite)
        # dataset types; these are the dimensions of the "Big Join Query".
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task; each _DatasetScaffolding
        # node is shared by the pipeline and every task that references it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetScaffoldingDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetScaffoldingDict`).
    """

    initIntermediates: _DatasetScaffoldingDict
    """Datasets that are both consumed and produced when constructing the tasks
    in this pipeline (`_DatasetScaffoldingDict`).
    """

    initOutputs: _DatasetScaffoldingDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetScaffoldingDict`).
    """

    inputs: _DatasetScaffoldingDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetScaffoldingDict`).
    """

    intermediates: _DatasetScaffoldingDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetScaffoldingDict`).
    """

    outputs: _DatasetScaffoldingDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetScaffoldingDict`).
    """

    prerequisites: _DatasetScaffoldingDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetScaffoldingDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """
483 """Query for the data IDs that connect nodes in the `QuantumGraph`.
485 This method populates `_TaskScaffolding.dataIds` and
486 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
490 registry : `lsst.daf.butler.Registry`
491 Registry for the data repository; used for all data ID queries.
492 collections : `lsst.daf.butler.CollectionSearch`
493 Object representing the collections to search for input datasets.
494 userQuery : `str`, optional
495 User-provided expression to limit the data IDs processed.
        # Initialization datasets always have empty data IDs.
        emptyDataId = ExpandedDataCoordinate(registry.dimensions.empty, (), records={})
        for scaffolding in itertools.chain(self.initInputs.values(),
                                           self.initIntermediates.values(),
                                           self.initOutputs.values()):
            scaffolding.dataIds.add(emptyDataId)
        # Run one big query for the data IDs of the task dimensions and all
        # regular inputs, intermediates, and outputs.
        resultIter = registry.queryDimensions(
            self.dimensions,
            datasets=list(self.inputs),
            collections=collections,
            where=userQuery,
        )
        # Iterate over query results, populating the data IDs for datasets and
        # quanta and connecting them to each other.
        for commonDataId in resultIter:
            for taskScaffolding in self.tasks:
                taskScaffolding.dataIds.add(commonDataId.subset(taskScaffolding.dimensions))
            for datasetType, scaffolding in itertools.chain(self.inputs.items(),
                                                            self.intermediates.items(),
                                                            self.outputs.items()):
                scaffolding.dataIds.add(commonDataId.subset(scaffolding.dimensions))
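        # Net effect, illustratively: a commonDataId spanning, say,
        # {instrument, visit, detector, tract, patch} is subset down to
        # {instrument, visit, detector} for a per-detector task and to
        # {tract, patch} for a coadd-level dataset type, with the set
        # semantics of ``dataIds`` removing duplicate subsets.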
535 """Perform follow up queries for each dataset data ID produced in
538 This method populates `_DatasetScaffolding.refs` (except for those in
543 registry : `lsst.daf.butler.Registry`
544 Registry for the data repository; used for all data ID queries.
545 collections : `lsst.daf.butler.CollectionSearch`
546 Object representing the collections to search for input datasets.
547 run : `str`, optional
548 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
549 output datasets, if it already exists.
550 skipExisting : `bool`, optional
551 If `True` (default), a Quantum is not created if all its outputs
552 already exist in ``run``. Ignored if ``run`` is `None`.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExisting`` is `False`.  The case where some but not all
            of a quantum's outputs are present and ``skipExisting`` is `True`
            cannot be identified at this stage, and is handled by `fillQuanta`
            instead.
        """
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, scaffolding in itertools.chain(self.initInputs.items(), self.inputs.items()):
            for dataId in scaffolding.dataIds:
                refs = list(
                    registry.queryDatasets(
                        datasetType,
                        collections=collections,
                        dataId=dataId,
                        deduplicate=True,
                    )
                )
                if len(refs) != 1:
                    raise RuntimeError(f"Expected exactly one instance of input {datasetType} "
                                       f"for data ID {dataId}; got {refs}.")
                scaffolding.refs.extend(refs)
        # Look up init intermediates, init outputs, intermediates, and outputs
        # in the output RUN collection, if there is one.
        for datasetType, scaffolding in itertools.chain(self.initIntermediates.items(),
                                                        self.initOutputs.items(),
                                                        self.intermediates.items(),
                                                        self.outputs.items()):
            for dataId in scaffolding.dataIds:
                ref = None
                if run is not None:
                    ref = registry.findDataset(datasetType=datasetType, dataId=dataId, collections=run)
                if ref is None:
                    ref = DatasetRef(datasetType, dataId)
                elif not skipExisting:
                    raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                            f"output RUN collection '{run}' with data ID {dataId}.")
                scaffolding.refs.append(ref)
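        # At this point every output entry in scaffolding.refs is either a
        # resolved ref (the dataset already exists in ``run``) or an
        # unresolved DatasetRef still to be produced; fillQuanta uses that
        # distinction (via ``ref.id``) to decide whether a quantum's outputs
        # are already complete.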

    def fillQuanta(self, registry, collections, *, skipExisting=True):
        """Define quanta for each task by splitting up the datasets associated
        with each task data ID.

        This method populates `_TaskScaffolding.quanta`.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist.
        """
        for task in self.tasks:
            for quantumDataId in task.dataIds:
                # Identify the (regular) inputs that correspond to the Quantum
                # with this data ID: those whose data IDs are consistent with
                # the quantum's.
                inputs = NamedKeyDict()
                for datasetType, scaffolding in task.inputs.items():
                    inputs[datasetType] = [ref for ref, dataId in zip(scaffolding.refs,
                                                                      scaffolding.dataIds)
                                           if registry.relateDataIds(quantumDataId, dataId)]
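                # Illustrative note: relateDataIds reports whether two data
                # IDs agree on all dimensions they have in common, so a
                # per-visit quantum keeps, e.g., only the input refs from its
                # own visit here.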
                _LOG.debug("%s dataId %s has inputs: %s",
                           task.taskDef.taskName, quantumDataId, list(inputs.names))
                # Same for outputs, but also check for already-existing
                # outputs.
                outputs = NamedKeyDict()
                allOutputsPresent = True
                for datasetType, scaffolding in task.outputs.items():
                    outputs[datasetType] = []
                    for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds):
                        if registry.relateDataIds(quantumDataId, dataId):
                            if ref.id is None:
                                allOutputsPresent = False
                            else:
                                assert skipExisting, "Existing outputs should have already been identified."
                                if not allOutputsPresent:
                                    raise OutputExistsError(f"Output {datasetType.name} with data ID "
                                                            f"{dataId} already exists, but other outputs "
                                                            f"for task with label {task.taskDef.label} "
                                                            f"and data ID {quantumDataId} do not.")
                            outputs[datasetType].append(ref)
                if allOutputsPresent and skipExisting:
                    continue
                _LOG.debug("%s dataID %s has outputs: %s",
                           task.taskDef.taskName, quantumDataId, list(outputs.names))
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we
                # queried for originally, because we want to permit those data
                # IDs to differ across quanta and dataset types.
                connections = task.taskDef.connections
                for con_name in connections.prerequisiteInputs:
                    con = getattr(connections, con_name)
                    for datasetType in task.prerequisites:
                        if datasetType.name == con.name:
                            # Use the connection's lookup function if it has
                            # one; otherwise query the registry directly.
                            if con.lookupFunction is not None:
                                refs = list(con.lookupFunction(datasetType, registry,
                                                               quantumDataId, collections))
                            else:
                                refs = list(
                                    registry.queryDatasets(
                                        datasetType,
                                        collections=collections,
                                        dataId=quantumDataId,
                                        deduplicate=True,
                                    )
                                )
                            inputs[datasetType] = refs
                _LOG.debug("%s dataID %s has inputs+prereqs: %s",
                           task.taskDef.taskName, quantumDataId, list(inputs.names))
                task.addQuantum(
                    Quantum(
                        taskName=task.taskDef.taskName,
                        taskClass=task.taskDef.taskClass,
                        dataId=quantumDataId,
                        initInputs=task.initInputs.unpackRefs(),
                        predictedInputs=inputs,
                        outputs=outputs,
                    )
                )
703 """Create a `QuantumGraph` from the quanta already present in
704 the scaffolding data structure.
707 graph.initInputs = self.initInputs.unpackRefs()
708 graph.initOutputs = self.initOutputs.unpackRefs()
709 graph.initIntermediates = self.initIntermediates.unpackRefs()
719 """Base class for exceptions generated by graph builder.


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass
731 """Exception generated when a prerequisite dataset does not exist.
737 """GraphBuilder class is responsible for building task execution graph from
742 registry : `~lsst.daf.butler.Registry`
743 Data butler instance.
744 skipExisting : `bool`, optional
745 If `True` (default), a Quantum is not created if all its outputs

    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String which defines the user's selection for the registry;
            should be empty or `None` if there are no restrictions on the
            data selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed execution graph.

        Raises
        ------
        UserExpressionError
            Raised when the user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
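
        Examples
        --------
        A minimal sketch of typical use (the registry, collection search,
        pipeline, run name, and query string must all be supplied by the
        caller; the values here are purely illustrative)::

            builder = GraphBuilder(registry, skipExisting=True)
            graph = builder.makeGraph(pipeline, collections, run="my_run",
                                      userQuery="instrument = 'HSC'")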
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)

        scaffolding.fillDataIds(self.registry, collections, userQuery)
        scaffolding.fillDatasetRefs(self.registry, collections, run,
                                    skipExisting=self.skipExisting)
        scaffolding.fillQuanta(self.registry, collections,
                               skipExisting=self.skipExisting)
        return scaffolding.makeQuantumGraph()