from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

import copy
import itertools
import logging

from collections import ChainMap
from dataclasses import dataclass
from typing import Set, List, Dict, Optional, Iterable

from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, Pipeline, TaskDef
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    ExpandedDataCoordinate,
    Quantum,
)
from lsst.daf.butler.core.utils import NamedKeyDict

_LOG = logging.getLogger(__name__.partition(".")[2])
62 """Helper class aggregating information about a `DatasetType`, used when 63 constructing a `QuantumGraph`. 65 `_DatasetScaffolding` does not hold the `DatasetType` instance itself 66 because it is usually used as the value type in `_DatasetScaffoldingDict`, 67 which uses `DatasetType` instances as keys. 69 See `_PipelineScaffolding` for a top-down description of the full 70 scaffolding data structure. 74 dimensions : `DimensionGraph` 75 Dimensions of the `DatasetType`. 77 def __init__(self, dimensions: DimensionGraph):
84 __slots__ = (
"dimensions",
"producer",
"consumers",
"dataIds",
"refs")
    dimensions: DimensionGraph
    """The dimensions of the dataset type (`DimensionGraph`).

    Set during `_PipelineScaffolding` construction.
    """

    producer: Optional[_TaskScaffolding]
    """The scaffolding object for the Task that produces this dataset.

    Set during `_PipelineScaffolding` construction.
    """

    consumers: Dict[str, _TaskScaffolding]
    """The scaffolding objects for the Tasks that consume this dataset,
    keyed by their label in the `Pipeline`.

    Set during `_PipelineScaffolding` construction.
    """

    dataIds: Set[ExpandedDataCoordinate]
    """Data IDs for all instances of this dataset type in the graph.

    Populated after construction by `_PipelineScaffolding.fillDataIds`.
    """

    refs: List[DatasetRef]
    """References for all instances of this dataset type in the graph.

    Populated after construction by `_PipelineScaffolding.fillDatasetRefs`.
    """


class _DatasetScaffoldingDict(NamedKeyDict):
    """Custom dictionary that maps `DatasetType` to `_DatasetScaffolding`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionUniverse):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetScaffoldingDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be
            constructed from the dimensions of the keys.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetScaffoldingDict`
            A new dictionary instance.
        """
        return cls(((datasetType, _DatasetScaffolding(datasetType.dimensions))
                    for datasetType in datasetTypes),
                   universe=universe)
    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetScaffoldingDict,
                   *rest) -> _DatasetScaffoldingDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetScaffoldingDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetScaffoldingDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls(((datasetType, combined[datasetType]) for datasetType in datasetTypes),
                   universe=first.universe)
184 """The union of all dimensions used by all dataset types in this 185 dictionary, including implied dependencies (`DimensionGraph`). 190 return base.union(*[scaffolding.dimensions
for scaffolding
in self.values()])
193 """Unpack nested single-element `DatasetRef` lists into a new 196 This method assumes that each `_DatasetScaffolding.refs` list contains 197 exactly one `DatasetRef`, as is the case for all "init" datasets. 201 dictionary : `NamedKeyDict` 202 Dictionary mapping `DatasetType` to `DatasetRef`, with both 203 `DatasetType` instances and string names usable as keys. 205 return NamedKeyDict((datasetType, scaffolding.refs[0])
for datasetType, scaffolding
in self.items())
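
# Illustrative sketch (not part of the original module): `fromSubset` builds a
# `ChainMap` over the parent dictionaries, so per-task subsets share the *same*
# value objects with the pipeline-level dictionaries.  The plain dicts, keys,
# and `object()` stand-ins below are hypothetical.
def _demoFromSubsetSharing():
    inputs = {"raw": object()}
    intermediates = {"calexp": object()}
    combined = ChainMap(inputs, intermediates)
    subset = {name: combined[name] for name in ("raw", "calexp")}
    # The extracted values are the very same objects, so state added later
    # (data IDs, refs) is visible from both levels of the scaffolding.
    assert subset["raw"] is inputs["raw"]
    assert subset["calexp"] is intermediates["calexp"]
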
210 """Helper class aggregating information about a `PipelineTask`, used when 211 constructing a `QuantumGraph`. 213 See `_PipelineScaffolding` for a top-down description of the full 214 scaffolding data structure. 219 Data structure that identifies the task class and its config. 220 parent : `_PipelineScaffolding` 221 The parent data structure that will hold the instance being 223 datasetTypes : `TaskDatasetTypes` 224 Data structure that categorizes the dataset types used by this task. 229 Raised if the task's dimensions are not a subset of the union of the 230 pipeline's dataset dimensions. 232 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
233 universe = parent.dimensions.universe
235 self.
dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
236 if not self.
dimensions.issubset(parent.dimensions):
238 f
"{self.dimensions} that are not a subset of " 239 f
"the pipeline dimensions {parent.dimensions}.")
242 self.
initInputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initInputs,
243 parent.initInputs, parent.initIntermediates)
244 self.
initOutputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initOutputs,
245 parent.initIntermediates, parent.initOutputs)
246 self.
inputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.inputs,
247 parent.inputs, parent.intermediates)
248 self.
outputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.outputs,
249 parent.intermediates, parent.outputs)
250 self.
prerequisites = _DatasetScaffoldingDict.fromSubset(datasetTypes.prerequisites,
251 parent.prerequisites)
254 for dataset
in itertools.chain(self.
initInputs.values(), self.
inputs.values(),
256 dataset.consumers[self.
taskDef.label] = self
258 assert dataset.producer
is None 259 dataset.producer = self
264 """Data structure that identifies the task class and its config 268 dimensions: DimensionGraph
269 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 272 initInputs: _DatasetScaffoldingDict
273 """Dictionary containing information about datasets used to construct this 274 task (`_DatasetScaffoldingDict`). 277 initOutputs: _DatasetScaffoldingDict
278 """Dictionary containing information about datasets produced as a 279 side-effect of constructing this task (`_DatasetScaffoldingDict`). 282 inputs: _DatasetScaffoldingDict
283 """Dictionary containing information about datasets used as regular, 284 graph-constraining inputs to this task (`_DatasetScaffoldingDict`). 287 outputs: _DatasetScaffoldingDict
288 """Dictionary containing information about datasets produced by this task 289 (`_DatasetScaffoldingDict`). 292 prerequisites: _DatasetScaffoldingDict
293 """Dictionary containing information about input datasets that must be 294 present in the repository before any Pipeline containing this task is run 295 (`_DatasetScaffoldingDict`). 298 dataIds: Set[ExpandedDataCoordinate]
299 """Data IDs for all quanta for this task in the graph (`set` of 300 `ExpandedDataCoordinate`). 302 Populated after construction by `_PipelineScaffolding.fillDataIds`. 305 quanta: List[Quantum]
306 """All quanta for this task in the graph (`list` of `Quantum`). 308 Populated after construction by `_PipelineScaffolding.fillQuanta`. 313 connectionClass = config.connections.ConnectionsClass
314 connectionInstance = connectionClass(config=config)
317 result = connectionInstance.adjustQuantum(quantum.predictedInputs)
318 quantum._predictedInputs = NamedKeyDict(result)
321 self.
quanta.append(quantum)
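
    # Illustrative sketch (not part of the original class): `adjustQuantum`
    # receives the mapping of predicted inputs and returns a possibly-trimmed
    # mapping, raising when a check fails; `addQuantum` above then replaces
    # quantum._predictedInputs with the adjusted result.  The logic below is a
    # hypothetical stand-in, not the real PipelineTaskConnections contract.
    @staticmethod
    def _demoAdjustQuantum(predictedInputs, required=()):
        # Fail if a (hypothetically) required input has no datasets at all;
        # otherwise drop empty entries so the quantum only records real refs.
        for name in required:
            if not predictedInputs.get(name):
                raise ValueError(f"no datasets for required input {name!r}")
        return {name: refs for name, refs in predictedInputs.items() if refs}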
324 """Create a `QuantumGraphTaskNodes` instance from the information in 329 nodes : `QuantumGraphTaskNodes` 330 The `QuantumGraph` elements corresponding to this task. 342 """A helper data structure that organizes the information involved in 343 constructing a `QuantumGraph` for a `Pipeline`. 347 pipeline : `Pipeline` 348 Sequence of tasks from which a graph is to be constructed. Must 349 have nested task classes already imported. 350 universe : `DimensionUniverse` 351 Universe of all possible dimensions. 356 Raised if the task's dimensions are not a subset of the union of the 357 pipeline's dataset dimensions. 361 The scaffolding data structure contains nested data structures for both 362 tasks (`_TaskScaffolding`) and datasets (`_DatasetScaffolding`), with the 363 latter held by `_DatasetScaffoldingDict`. The dataset data structures are 364 shared between the pipeline-level structure (which aggregates all datasets 365 and categorizes them from the perspective of the complete pipeline) and the 366 individual tasks that use them as inputs and outputs. 368 `QuantumGraph` construction proceeds in five steps, with each corresponding 369 to a different `_PipelineScaffolding` method: 371 1. When `_PipelineScaffolding` is constructed, we extract and categorize 372 the DatasetTypes used by the pipeline (delegating to 373 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 374 nested `_TaskScaffolding` and `_DatasetScaffolding` objects. 376 2. In `fillDataIds`, we construct and run the "Big Join Query", which 377 returns related tuples of all dimensions used to identify any regular 378 input, output, and intermediate datasets (not prerequisites). We then 379 iterate over these tuples of related dimensions, identifying the subsets 380 that correspond to distinct data IDs for each task and dataset type. 382 3. In `fillDatasetRefs`, we run follow-up queries against all of the 383 dataset data IDs previously identified, populating the 384 `_DatasetScaffolding.refs` lists - except for those for prerequisite 385 datasets, which cannot be resolved until distinct quanta are 388 4. In `fillQuanta`, we extract subsets from the lists of `DatasetRef` into 389 the inputs and outputs for each `Quantum` and search for prerequisite 390 datasets, populating `_TaskScaffolding.quanta`. 392 5. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 393 per-task quanta identified in the previous step. 398 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetScaffoldingDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                                         universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite dataset
        # types; these are the dimensions the "Big Join Query" is run over.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task; this also adds
        # backreferences from the shared _DatasetScaffolding objects.
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())]
    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetScaffoldingDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetScaffoldingDict`).
    """

    initIntermediates: _DatasetScaffoldingDict
    """Datasets that are both consumed and produced when constructing the
    tasks in this pipeline (`_DatasetScaffoldingDict`).
    """

    initOutputs: _DatasetScaffoldingDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetScaffoldingDict`).
    """

    inputs: _DatasetScaffoldingDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetScaffoldingDict`).
    """

    intermediates: _DatasetScaffoldingDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetScaffoldingDict`).
    """

    outputs: _DatasetScaffoldingDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetScaffoldingDict`).
    """

    prerequisites: _DatasetScaffoldingDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetScaffoldingDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    def fillDataIds(self, registry, inputCollections, userQuery):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.dataIds` and
        `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        inputCollections : `~collections.abc.Mapping`
            Mapping from dataset type name to an ordered sequence of
            collections to search for that dataset.  A `defaultdict` is
            recommended for the case where the same collections should be
            used for most datasets.
        userQuery : `str`, optional
            User-provided expression to limit the data IDs processed.
        """
        # Initialization datasets always have empty data IDs.
        emptyDataId = ExpandedDataCoordinate(registry.dimensions.empty, (), records={})
        for scaffolding in itertools.chain(self.initInputs.values(),
                                           self.initIntermediates.values(),
                                           self.initOutputs.values()):
            scaffolding.dataIds.add(emptyDataId)
        # Run one big query for the data IDs of all regular inputs,
        # intermediates, and outputs, over the union of their dimensions.
        resultIter = registry.queryDimensions(
            self.dimensions,
            datasets={
                datasetType: inputCollections[datasetType.name]
                for datasetType in self.inputs
            },
            where=userQuery,
        )
        # Iterate over query results, extracting the subsets of the common
        # data ID that correspond to the dimensions of each task and each
        # regular dataset type.
        for commonDataId in resultIter:
            for taskScaffolding in self.tasks:
                taskScaffolding.dataIds.add(commonDataId.subset(taskScaffolding.dimensions))
            for datasetType, scaffolding in itertools.chain(self.inputs.items(),
                                                            self.intermediates.items(),
                                                            self.outputs.items()):
                scaffolding.dataIds.add(commonDataId.subset(scaffolding.dimensions))
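
    # Illustrative sketch (not part of the original class): each row returned
    # by the Big Join Query is decomposed by taking the subset of its
    # key-value pairs corresponding to each task's or dataset type's
    # dimensions.  Plain dicts and sets stand in for ExpandedDataCoordinate
    # and DimensionGraph here; the helper and values are hypothetical.
    @staticmethod
    def _demoCommonDataIdSubsetting():
        def subset(dataId, dimensions):
            return frozenset((k, v) for k, v in dataId.items() if k in dimensions)

        commonDataId = {"instrument": "HSC", "visit": 42, "detector": 1}
        taskDims = {"instrument", "visit"}
        datasetDims = {"instrument", "visit", "detector"}
        assert subset(commonDataId, taskDims) == frozenset({("instrument", "HSC"), ("visit", 42)})
        # Distinct rows that agree on a subset's dimensions collapse to the
        # same data ID when added to a set, exactly as in fillDataIds above.
        dataIds = {subset(commonDataId, taskDims), subset(commonDataId, datasetDims)}
        assert len(dataIds) == 2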
    def fillDatasetRefs(self, registry, inputCollections, outputCollection, *,
                        skipExisting=True, clobberExisting=False):
        """Perform follow-up queries for each dataset data ID produced in
        `fillDataIds`.

        This method populates `_DatasetScaffolding.refs` (except for those in
        `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        inputCollections : `~collections.abc.Mapping`
            Mapping from dataset type name to an ordered sequence of
            collections to search for that dataset.  A `defaultdict` is
            recommended for the case where the same collections should be
            used for most datasets.
        outputCollection : `str`
            Collection for all output datasets.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist.
        clobberExisting : `bool`, optional
            If `True`, overwrite any outputs that already exist.  Cannot be
            `True` if ``skipExisting`` is.

        Raises
        ------
        ValueError
            Raised if both ``skipExisting`` and ``clobberExisting`` are `True`.
        OutputExistsError
            Raised if an output dataset already exists in the output collection
            and both ``skipExisting`` and ``clobberExisting`` are `False`.  The
            case where some but not all of a quantum's outputs are present and
            ``skipExisting`` is `True` cannot be identified at this stage, and
            is handled by `fillQuanta` instead.
        """
        if clobberExisting and skipExisting:
            raise ValueError("clobberExisting and skipExisting cannot both be true.")
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, scaffolding in itertools.chain(self.initInputs.items(), self.inputs.items()):
            for dataId in scaffolding.dataIds:
                refs = list(
                    registry.queryDatasets(
                        datasetType,
                        collections=inputCollections[datasetType.name],
                        dataId=dataId,
                        deduplicate=True,
                        expand=True,
                    )
                )
                assert len(refs) == 1, "BJQ guarantees exactly one input for each data ID."
                scaffolding.refs.extend(refs)
        # Look up [init] intermediate and output datasets in the output
        # collection, unless clobberExisting is True (in which case we do not
        # care whether they already exist).
        for datasetType, scaffolding in itertools.chain(self.initIntermediates.items(),
                                                        self.initOutputs.items(),
                                                        self.intermediates.items(),
                                                        self.outputs.items()):
            for dataId in scaffolding.dataIds:
                if clobberExisting:
                    ref = None
                else:
                    ref = registry.find(collection=outputCollection, datasetType=datasetType, dataId=dataId)
                if ref is None:
                    ref = DatasetRef(datasetType, dataId)
                elif not skipExisting:
                    raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                            f"output collection {outputCollection} with data ID {dataId}.")
                scaffolding.refs.append(ref)
        # Prerequisite dataset lookups are deferred until fillQuanta.
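
    # Illustrative sketch (not part of the original class): the interplay of
    # ``skipExisting`` and ``clobberExisting`` above, with a plain value
    # standing in for the result of the registry lookup.  The return strings
    # are hypothetical labels for the two non-raising outcomes.
    @staticmethod
    def _demoOutputRefPolicy(existing, skipExisting, clobberExisting):
        if clobberExisting and skipExisting:
            raise ValueError("clobberExisting and skipExisting cannot both be true.")
        found = None if clobberExisting else existing
        if found is None:
            return "new ref"       # nothing (that we care about) exists yet
        if not skipExisting:
            raise OutputExistsError("output dataset already exists")
        return "existing ref"      # reuse; fillQuanta may skip the quantum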
    def fillQuanta(self, registry, inputCollections, *, skipExisting=True):
        """Define quanta for each task by splitting up the datasets associated
        with each task data ID.

        This method populates `_TaskScaffolding.quanta`.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        inputCollections : `~collections.abc.Mapping`
            Mapping from dataset type name to an ordered sequence of
            collections to search for that dataset.  A `defaultdict` is
            recommended for the case where the same collections should be
            used for most datasets.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist.
        """
        for task in self.tasks:
            for quantumDataId in task.dataIds:
                # Identify the regular inputs for the Quantum with this data
                # ID: those whose data IDs match it on all dimensions they
                # have in common.
                inputs = NamedKeyDict()
                for datasetType, scaffolding in task.inputs.items():
                    inputs[datasetType] = [ref for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds)
                                           if quantumDataId.matches(dataId)]
                # Same for outputs, also checking whether they already exist.
                outputs = NamedKeyDict()
                allOutputsPresent = True
                for datasetType, scaffolding in task.outputs.items():
                    outputs[datasetType] = []
                    for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds):
                        if quantumDataId.matches(dataId):
                            if ref.id is None:
                                allOutputsPresent = False
                            else:
                                assert skipExisting, "Existing outputs should have already been identified."
                                if not allOutputsPresent:
                                    raise OutputExistsError(f"Output {datasetType.name} with data ID "
                                                            f"{dataId} already exists, but other outputs "
                                                            f"for task with label {task.taskDef.label} "
                                                            f"and data ID {quantumDataId} do not.")
                            outputs[datasetType].append(ref)
                if allOutputsPresent and skipExisting:
                    continue
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those of the
                # quantum, so they are resolved per-quantum rather than in
                # the Big Join Query.
                for datasetType, scaffolding in task.prerequisites.items():
                    refs = list(
                        registry.queryDatasets(
                            datasetType,
                            collections=inputCollections[datasetType.name],
                            dataId=quantumDataId,
                            deduplicate=True,
                            expand=True,
                        )
                    )
                    inputs[datasetType] = refs
                task.addQuantum(
                    Quantum(
                        taskName=task.taskDef.taskName,
                        taskClass=task.taskDef.taskClass,
                        dataId=quantumDataId,
                        initInputs=task.initInputs.unpackRefs(),
                        predictedInputs=inputs,
                        outputs=outputs,
                    )
                )

    def makeQuantumGraph(self):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.
        """
        graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
        graph.initInputs = self.initInputs.unpackRefs()
        graph.initOutputs = self.initOutputs.unpackRefs()
        graph.initIntermediates = self.initIntermediates.unpackRefs()
        return graph
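
# Illustrative sketch (not part of the original module): fillQuanta selects
# the refs whose data IDs agree with the quantum data ID on every dimension
# they share.  Plain dicts stand in for data coordinates; the `matches`
# helper below is a hypothetical stand-in for ExpandedDataCoordinate.matches.
def _demoDataIdMatching():
    def matches(quantumDataId, dataId):
        # Agree on every key the two data IDs have in common; disjoint data
        # IDs trivially match.
        return all(quantumDataId[k] == dataId[k] for k in quantumDataId.keys() & dataId.keys())

    quantumDataId = {"visit": 42, "detector": 1}
    refs = [({"visit": 42, "detector": 1}, "ref-a"),
            ({"visit": 42, "detector": 2}, "ref-b"),
            ({"skymap": "m", "tract": 0}, "ref-c")]
    selected = [ref for dataId, ref in refs if matches(quantumDataId, dataId)]
    assert selected == ["ref-a", "ref-c"]
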
697 """Base class for exceptions generated by graph builder. 702 class OutputExistsError(GraphBuilderError):
703 """Exception generated when output datasets already exist. 709 """Exception generated when a prerequisite dataset does not exist. 715 """GraphBuilder class is responsible for building task execution graph from 720 taskFactory : `TaskFactory` 721 Factory object used to load/instantiate PipelineTasks 722 registry : `~lsst.daf.butler.Registry` 723 Data butler instance. 724 skipExisting : `bool`, optional 725 If `True` (default), a Quantum is not created if all its outputs 727 clobberExisting : `bool`, optional 728 If `True`, overwrite any outputs that already exist. Cannot be 729 `True` if ``skipExisting`` is. 732 def __init__(self, taskFactory, registry, skipExisting=True, clobberExisting=False):
    def _loadTaskClass(self, taskDef):
        """Make sure task class is loaded.

        Load task class and update the task name to make sure it is
        fully qualified, but do not modify the original taskDef in a Pipeline.

        Parameters
        ----------
        taskDef : `TaskDef`
            Task definition whose class may need to be loaded.

        Returns
        -------
        taskDef : `TaskDef`
            `TaskDef` instance, may be the same as parameter if task class is
            already loaded.
        """
        if taskDef.taskClass is None:
            tClass, tName = self.taskFactory.loadTaskClass(taskDef.taskName)
            taskDef = copy.copy(taskDef)
            taskDef.taskClass = tClass
            taskDef.taskName = tName
        return taskDef
    def makeGraph(self, pipeline, inputCollections, outputCollection, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        inputCollections : `~collections.abc.Mapping`
            Mapping from dataset type name to an ordered sequence of
            collections to search for that dataset.  A `defaultdict` is
            recommended for the case where the same collections should be
            used for most datasets.
        outputCollection : `str`
            Collection for all output datasets.
        userQuery : `str`
            String which defines user-defined selection for registry; should
            be empty or `None` if there are no restrictions on data selection.

        Returns
        -------
        graph : `QuantumGraph`

        Raises
        ------
        UserExpressionError
            Raised when user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        # Make sure all task classes are loaded, creating a new Pipeline
        # so the input one is not modified.
        pipeline = Pipeline([self._loadTaskClass(taskDef) for taskDef in pipeline])

        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)

        scaffolding.fillDataIds(self.registry, inputCollections, userQuery)
        scaffolding.fillDatasetRefs(self.registry, inputCollections, outputCollection,
                                    skipExisting=self.skipExisting,
                                    clobberExisting=self.clobberExisting)
        scaffolding.fillQuanta(self.registry, inputCollections,
                               skipExisting=self.skipExisting)

        return scaffolding.makeQuantumGraph()
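
# Illustrative usage sketch (not part of the original module).  The registry,
# task factory, pipeline, collection names, and query string below are all
# hypothetical; the docstrings above recommend a defaultdict so one set of
# input collections covers most dataset types.
def _demoMakeGraph(taskFactory, registry, pipeline):
    from collections import defaultdict
    inputCollections = defaultdict(lambda: ["shared/raw", "shared/calib"])
    builder = GraphBuilder(taskFactory, registry, skipExisting=True)
    return builder.makeGraph(pipeline, inputCollections,
                             outputCollection="u/demo/run1",
                             userQuery="visit = 42")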