from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

import copy
import itertools
import logging

from collections import ChainMap
from dataclasses import dataclass
from typing import Set, List, Dict, Optional, Iterable

from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, Pipeline, TaskDef
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import Quantum, DatasetRef, DimensionGraph, DataId, DimensionUniverse, DatasetType
from lsst.daf.butler.core.utils import NamedKeyDict
from lsst.daf.butler.sql import DataIdQueryBuilder, SingleDatasetQueryBuilder

_LOG = logging.getLogger(__name__.partition(".")[2])
56 """Helper class aggregating information about a `DatasetType`, used when 57 constructing a `QuantumGraph`. 59 `_DatasetScaffolding` does not hold the `DatasetType` instance itself 60 because it is usually used as the value type in `_DatasetScaffoldingDict`, 61 which uses `DatasetType` instances as keys. 63 See `_PipelineScaffolding` for a top-down description of the full 64 scaffolding data structure. 68 dimensions : `DimensionGraph` 69 Dimensions of the `DatasetType`, expanded to include implied 72 def __init__(self, dimensions: DimensionGraph):
79 __slots__ = (
"dimensions",
"producer",
"consumers",
"dataIds",
"refs")
81 dimensions: DimensionGraph
82 """The dimensions of the dataset type, expanded to included implied 85 Set during `_PipelineScaffolding` construction. 88 producer: Optional[_TaskScaffolding]
89 """The scaffolding objects for the Task that produces this dataset. 91 Set during `_PipelineScaffolding` construction. 94 consumers: Dict[str, _TaskScaffolding]
95 """The scaffolding objects for the Tasks that consume this dataset, 96 keyed by their label in the `Pipeline`. 98 Set during `_PipelineScaffolding` construction. 102 """Data IDs for all instances of this dataset type in the graph. 104 These data IDs cover the full set of implied-expanded dimensions (i.e. 105 the `dimensions` attribute of this instance), which is a supserset of the 106 dimensions used in `DatasetRef` instances (e.g. in ``refs``). 108 Populated after construction by `_PipelineScaffolding.fillDataIds`. 111 refs: List[DatasetRef]
112 """References for all instances of this dataset type in the graph. 114 Populated after construction by `_PipelineScaffolding.fillDatasetRefs`. 119 """Custom dictionary that maps `DatasetType` to `_DatasetScaffolding`. 121 See `_PipelineScaffolding` for a top-down description of the full 122 scaffolding data structure. 127 Positional arguments are forwarded to the `dict` constructor. 128 universe : `DimensionUniverse` 129 Universe of all possible dimensions. 131 def __init__(self, *args, universe: DimensionGraph):
137 universe: DimensionUniverse) -> _DatasetScaffoldingDict:
138 """Construct a a dictionary from a flat iterable of `DatasetType` keys. 142 datasetTypes : `iterable` of `DatasetType` 143 DatasetTypes to use as keys for the dict. Values will be 144 constructed from the dimensions of the keys. 145 universe : `DimensionUniverse` 146 Universe of all possible dimensions. 150 dictionary : `_DatasetScaffoldingDict` 151 A new dictionary instance. 154 for datasetType
in datasetTypes),
158 def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetScaffoldingDict,
159 *rest) -> _DatasetScaffoldingDict:
160 """Return a new dictionary by extracting items corresponding to the 161 given keys from one or more existing dictionaries. 165 datasetTypes : `iterable` of `DatasetType` 166 DatasetTypes to use as keys for the dict. Values will be obtained 167 by lookups against ``first`` and ``rest``. 168 first : `_DatasetScaffoldingDict` 169 Another dictionary from which to extract values. 171 Additional dictionaries from which to extract values. 175 dictionary : `_DatasetScaffoldingDict` 176 A new dictionary instance. 178 combined = ChainMap(first, *rest)
179 return cls(((datasetType, combined[datasetType])
for datasetType
in datasetTypes),
180 universe=first.universe)
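    # Illustrative doctest for the ChainMap lookup order used above: when a
    # dataset type appears in more than one source dictionary, the value from
    # the earliest one wins, so callers list the preferred dictionary first.
    #
    #     >>> from collections import ChainMap
    #     >>> combined = ChainMap({"bias": 1}, {"bias": 2, "dark": 3})
    #     >>> combined["bias"], combined["dark"]
    #     (1, 3)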
184 """The union of all dimensions used by all dataset types in this 185 dictionary, including implied dependencies (`DimensionGraph`). 190 return base.union(*(scaffolding.dimensions
for scaffolding
in self.values()), implied=
True)
193 """Unpack nested single-element `DatasetRef` lists into a new 196 This method assumes that each `_DatasetScaffolding.refs` list contains 197 exactly one `DatasetRef`, as is the case for all "init" datasets. 201 dictionary : `NamedKeyDict` 202 Dictionary mapping `DatasetType` to `DatasetRef`, with both 203 `DatasetType` instances and string names usable as keys. 205 return NamedKeyDict((datasetType, scaffolding.refs[0])
for datasetType, scaffolding
in self.items())
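    # The single-element assumption holds because every "init" dataset is
    # associated with exactly one (empty) data ID in fillDataIds, and
    # fillDatasetRefs appends exactly one ref per data ID.  Illustration with
    # plain dicts standing in for the real types (hypothetical names):
    #
    #     >>> refsByType = {"calexp_schema": ["ref0"]}
    #     >>> {name: refs[0] for name, refs in refsByType.items()}
    #     {'calexp_schema': 'ref0'}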
210 """Helper class aggregating information about a `PipelineTask`, used when 211 constructing a `QuantumGraph`. 213 See `_PipelineScaffolding` for a top-down description of the full 214 scaffolding data structure. 219 Data structure that identifies the task class and its config. 220 parent : `_PipelineScaffolding` 221 The parent data structure that will hold the instance being 223 datasetTypes : `TaskDatasetTypes` 224 Data structure that categorizes the dataset types used by this task. 229 Raised if the task's dimensions are not a subset of the union of the 230 pipeline's dataset dimensions. 232 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
233 universe = parent.dimensions.universe
235 self.
dimensions = universe.extract(taskDef.config.quantum.dimensions, implied=
True)
236 if not self.
dimensions.issubset(parent.dimensions):
238 f
"{self.dimensions.toSet()} that are not a subset of " 239 f
"the pipeline dimensions {parent.dimensions.toSet()}.")
242 self.
initInputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initInputs,
243 parent.initInputs, parent.initIntermediates)
244 self.
initOutputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initOutputs,
245 parent.initIntermediates, parent.initOutputs)
246 self.
inputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.inputs,
247 parent.inputs, parent.intermediates)
248 self.
outputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.outputs,
249 parent.intermediates, parent.outputs)
250 self.
prerequisites = _DatasetScaffoldingDict.fromSubset(datasetTypes.prerequisites,
251 parent.prerequisites)
254 for dataset
in itertools.chain(self.
initInputs.values(), self.
inputs.values(),
256 dataset.consumers[self.
taskDef.label] = self
258 assert dataset.producer
is None 259 dataset.producer = self
264 """Data structure that identifies the task class and its config 268 dimensions: DimensionGraph
269 """The dimensions of a single `Quantum` of this task, expanded to include 270 implied dependencies (`DimensionGraph`). 273 initInputs: _DatasetScaffoldingDict
274 """Dictionary containing information about datasets used to construct this 275 task (`_DatasetScaffoldingDict`). 278 initOutputs: _DatasetScaffoldingDict
279 """Dictionary containing information about datasets produced as a 280 side-effect of constructing this task (`_DatasetScaffoldingDict`). 283 inputs: _DatasetScaffoldingDict
284 """Dictionary containing information about datasets used as regular, 285 graph-constraining inputs to this task (`_DatasetScaffoldingDict`). 288 outputs: _DatasetScaffoldingDict
289 """Dictionary containing information about datasets produced by this task 290 (`_DatasetScaffoldingDict`). 293 prerequisites: _DatasetScaffoldingDict
294 """Dictionary containing information about input datasets that must be 295 present in the repository before any Pipeline containing this task is run 296 (`_DatasetScaffoldingDict`). 300 """Data IDs for all quanta for this task in the graph (`set` of `DataId`). 302 Populated after construction by `_PipelineScaffolding.fillDataIds`. 305 quanta: List[Quantum]
306 """All quanta for this task in the graph (`list` of `Quantum`). 308 Populated after construction by `_PipelineScaffolding.fillQuanta`. 312 """Create a `QuantumGraphTaskNodes` instance from the information in 317 nodes : `QuantumGraphTaskNodes` 318 The `QuantumGraph` elements corresponding to this task. 330 """A helper data structure that organizes the information involved in 331 constructing a `QuantumGraph` for a `Pipeline`. 335 pipeline : `Pipeline` 336 Sequence of tasks from which a graph is to be constructed. Must 337 have nested task classes already imported. 338 universe : `DimensionUniverse` 339 Universe of all possible dimensions. 344 Raised if the task's dimensions are not a subset of the union of the 345 pipeline's dataset dimensions. 349 The scaffolding data structure contains nested data structures for both 350 tasks (`_TaskScaffolding`) and datasets (`_DatasetScaffolding`), with the 351 latter held by `_DatasetScaffoldingDict`. The dataset data structures are 352 shared between the pipeline-level structure (which aggregates all datasets 353 and categorizes them from the perspective of the complete pipeline) and the 354 individual tasks that use them as inputs and outputs. 356 `QuantumGraph` construction proceeds in five steps, with each corresponding 357 to a different `_PipelineScaffolding` method: 359 1. When `_PipelineScaffolding` is constructed, we extract and categorize 360 the DatasetTypes used by the pipeline (delegating to 361 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 362 nested `_TaskScaffolding` and `_DatasetScaffolding` objects. 364 2. In `fillDataIds`, we construct and run the "Big Join Query", which 365 returns related tuples of all dimensions used to identify any regular 366 input, output, and intermediate datasets (not prerequisites). We then 367 iterate over these tuples of related dimensions, identifying the subsets 368 that correspond to distinct data IDs for each task and dataset type. 370 3. In `fillDatasetRefs`, we run follow-up queries against all of the 371 dataset data IDs previously identified, populating the 372 `_DatasetScaffolding.refs` lists - except for those for prerequisite 373 datasets, which cannot be resolved until distinct quanta are 376 4. In `fillQuanta`, we extract subsets from the lists of `DatasetRef` into 377 the inputs and outputs for each `Quantum` and search for prerequisite 378 datasets, populating `_TaskScaffolding.quanta`. 380 5. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 381 per-task quanta identified in the previous step. 386 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, universe=universe)
        # Construct dictionaries of _DatasetScaffolding objects for each
        # categorized attribute of PipelineDatasetTypes.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetScaffoldingDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                                         universe=universe))
        # Aggregate all dimensions for all non-init, non-prerequisite dataset
        # types; these define the "Big Join Query".
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions, implied=True)
        # Construct scaffolding nodes for each Task, which also adds
        # backreferences to the tasks from the _DatasetScaffolding objects
        # they use.
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())]
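    # Illustrative sketch of the five-step driver sequence described in the
    # class docstring, assuming a butler ``registry``, an ``originInfo``
    # describing collections, and an optional ``userQuery`` expression string
    # (`GraphBuilder.makeGraph` below performs exactly this sequence):
    #
    #     scaffolding = _PipelineScaffolding(pipeline, universe=registry.dimensions)
    #     scaffolding.fillDataIds(registry, originInfo, userQuery)
    #     scaffolding.fillDatasetRefs(registry, originInfo, skipExisting=True)
    #     scaffolding.fillQuanta(registry, originInfo, skipExisting=True)
    #     graph = scaffolding.makeQuantumGraph()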
    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetScaffoldingDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetScaffoldingDict`).
    """

    initIntermediates: _DatasetScaffoldingDict
    """Datasets that are both consumed and produced when constructing the
    tasks in this pipeline (`_DatasetScaffoldingDict`).
    """

    initOutputs: _DatasetScaffoldingDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetScaffoldingDict`).
    """

    inputs: _DatasetScaffoldingDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetScaffoldingDict`).
    """

    intermediates: _DatasetScaffoldingDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetScaffoldingDict`).
    """

    outputs: _DatasetScaffoldingDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetScaffoldingDict`).
    """

    prerequisites: _DatasetScaffoldingDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetScaffoldingDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    def fillDataIds(self, registry, originInfo, userQuery):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.dataIds` and
        `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        originInfo : `lsst.daf.butler.DatasetOriginInfo`
            Object holding the input and output collections for each
            dataset type.
        userQuery : `str`, optional
            User-provided expression to limit the data IDs processed.
        """
        # Add the empty data ID to all "init" datasets, which are not
        # identified by any dimensions.
        emptyDataId = DataId(dimensions=registry.dimensions.empty)
        for scaffolding in itertools.chain(self.initInputs.values(),
                                           self.initIntermediates.values(),
                                           self.initOutputs.values()):
            scaffolding.dataIds.add(emptyDataId)
        # Run one big query for the data IDs covering the task dimensions and
        # all regular inputs, intermediates, and outputs.
        query = DataIdQueryBuilder.fromDimensions(registry, self.dimensions)
        # Limit the query to data IDs for which all regular inputs exist in
        # the given input collections.
        for datasetType in self.inputs:
            query.requireDataset(datasetType, originInfo.getInputCollections(datasetType.name))
        # Add the user-provided expression, if any.
        if userQuery:
            query.whereParsedExpression(userQuery)
        # Iterate over the query results, projecting each row onto the
        # dimensions of each task and each (non-prerequisite) dataset type.
        for commonDataId in query.execute():
            for taskScaffolding in self.tasks:
                dataId = DataId(commonDataId, dimensions=taskScaffolding.dimensions)
                taskScaffolding.dataIds.add(dataId)
            for datasetType, scaffolding in itertools.chain(self.inputs.items(),
                                                            self.intermediates.items(),
                                                            self.outputs.items()):
                dataId = DataId(commonDataId, dimensions=scaffolding.dimensions)
                scaffolding.dataIds.add(dataId)
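        # Illustrative example (hypothetical dimension names): if the query
        # yields a row with {instrument, visit, detector, physical_filter},
        # a task whose quanta are identified by {instrument, visit} gets the
        # projected data ID {instrument, visit}, while a per-detector dataset
        # type gets {instrument, visit, detector}.  The `set` containers then
        # de-duplicate these projections across rows.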
    def fillDatasetRefs(self, registry, originInfo, *, skipExisting=True, clobberExisting=False):
        """Perform follow-up queries for each dataset data ID produced in
        `fillDataIds`.

        This method populates `_DatasetScaffolding.refs` (except for those in
        `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        originInfo : `lsst.daf.butler.DatasetOriginInfo`
            Object holding the input and output collections for each
            dataset type.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist.
        clobberExisting : `bool`, optional
            If `True`, overwrite any outputs that already exist.  Cannot be
            `True` if ``skipExisting`` is.

        Raises
        ------
        ValueError
            Raised if both ``skipExisting`` and ``clobberExisting`` are
            `True`.
        OutputExistsError
            Raised if an output dataset already exists in the output
            collection and both ``skipExisting`` and ``clobberExisting`` are
            `False`.  The case where some but not all of a quantum's outputs
            are present and ``skipExisting`` is `True` cannot be identified at
            this stage, and is handled by `fillQuanta` instead.
        """
        if clobberExisting and skipExisting:
            raise ValueError("clobberExisting and skipExisting cannot both be true.")
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, scaffolding in itertools.chain(self.initInputs.items(), self.inputs.items()):
            for dataId in scaffolding.dataIds:
                builder = SingleDatasetQueryBuilder.fromCollections(
                    registry, datasetType,
                    collections=originInfo.getInputCollections(datasetType.name)
                )
                builder.whereDataId(dataId)
                ref = builder.executeOne(expandDataId=True)
                if ref is None:
                    # No existing dataset was found; create an unresolved ref
                    # with the data ID trimmed to the dataset type's own
                    # dimensions (i.e. without implied dependencies).
                    ref = DatasetRef(datasetType, DataId(dataId, dimensions=datasetType.dimensions))
                scaffolding.refs.append(ref)
        # Look up [init]intermediate and output datasets in the output
        # collection, unless clobberExisting is True (in which case we do not
        # care whether they already exist).
        for datasetType, scaffolding in itertools.chain(self.initIntermediates.items(),
                                                        self.initOutputs.items(),
                                                        self.intermediates.items(),
                                                        self.outputs.items()):
            collection = originInfo.getOutputCollection(datasetType.name)
            for dataId in scaffolding.dataIds:
                if clobberExisting:
                    ref = None
                else:
                    ref = registry.find(collection=collection, datasetType=datasetType, dataId=dataId)
                if ref is None:
                    ref = DatasetRef(datasetType, DataId(dataId, dimensions=datasetType.dimensions))
                elif not skipExisting:
                    raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                            f"output collection {collection} with data ID {dataId}.")
                scaffolding.refs.append(ref)
        # Prerequisite dataset lookups are deferred until fillQuanta.
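        # Summary of the existing-output handling above (illustrative):
        #
        #   clobberExisting=True  -> never search the output collection;
        #                            always make a new, unresolved DatasetRef.
        #   skipExisting=True     -> reuse a pre-existing ref if found; the
        #                            quantum may be skipped later in
        #                            fillQuanta.
        #   both False            -> any pre-existing output raises
        #                            OutputExistsError immediately.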
    def fillQuanta(self, registry, originInfo, *, skipExisting=True):
        """Define quanta for each task by splitting up the datasets associated
        with each task data ID.

        This method populates `_TaskScaffolding.quanta`.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        originInfo : `lsst.daf.butler.DatasetOriginInfo`
            Object holding the input and output collections for each
            dataset type.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist.
        """
        for task in self.tasks:
            for quantumDataId in task.dataIds:
                # Identify the regular inputs for the quantum with this data
                # ID: those whose data IDs match it on all dimensions they
                # have in common.
                inputs = NamedKeyDict()
                for datasetType, scaffolding in task.inputs.items():
                    inputs[datasetType] = [ref for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds)
                                           if quantumDataId.matches(dataId)]
                # Same for outputs, while also checking whether any of them
                # already exist (i.e. were resolved by fillDatasetRefs).
                outputs = NamedKeyDict()
                allOutputsPresent = True
                for datasetType, scaffolding in task.outputs.items():
                    outputs[datasetType] = []
                    for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds):
                        if quantumDataId.matches(dataId):
                            if ref.id is None:
                                allOutputsPresent = False
                            else:
                                assert skipExisting, "Existing outputs should have already been identified."
                                if not allOutputsPresent:
                                    raise OutputExistsError(f"Output {datasetType.name} with data ID "
                                                            f"{dataId} already exists, but other outputs "
                                                            f"for task with label {task.taskDef.label} "
                                                            f"and data ID {quantumDataId} do not.")
                            outputs[datasetType].append(ref)
                if allOutputsPresent and skipExisting:
                    # Every output already exists; skip this quantum entirely.
                    continue
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those of the
                # quantum data ID, which is why they cannot be resolved by the
                # Big Join Query.
                for datasetType, scaffolding in task.prerequisites.items():
                    builder = SingleDatasetQueryBuilder.fromCollections(
                        registry, datasetType,
                        collections=originInfo.getInputCollections(datasetType.name)
                    )
                    if not datasetType.dimensions.issubset(quantumDataId.dimensions()):
                        builder.relateDimensions(quantumDataId.dimensions(), addResultColumns=False)
                    builder.whereDataId(quantumDataId)
                    refs = list(builder.execute(expandDataId=True))
                    if not refs:
                        raise PrerequisiteMissingError(
                            f"No instances of prerequisite dataset {datasetType.name} found for task "
                            f"with label {task.taskDef.label} and quantum data ID {quantumDataId}."
                        )
                    inputs[datasetType] = refs
                task.quanta.append(Quantum(
                    taskName=task.taskDef.taskName,
                    taskClass=task.taskDef.taskClass,
                    dataId=quantumDataId,
                    initInputs=task.initInputs.unpackRefs(),
                    predictedInputs=inputs,
                    outputs=outputs,
                ))
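        # Illustrative note on the matching above (hypothetical values): with
        # a quantum data ID of {visit: 42, detector: 9}, an input whose data
        # ID is {visit: 42} matches (all shared dimensions agree), while one
        # with {visit: 43} does not; this is the contract assumed for
        # `DataId.matches`.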
674 """Create a `QuantumGraph` from the quanta already present in 675 the scaffolding data structure. 678 graph.initInputs = self.initInputs.unpackRefs()
679 graph.initOutputs = self.initOutputs.unpackRefs()
680 graph.initIntermediates = self.initIntermediates.unpackRefs()
690 """Base class for exceptions generated by graph builder. 695 class OutputExistsError(GraphBuilderError):
696 """Exception generated when output datasets already exist. 702 """Exception generated when a prerequisite dataset does not exist. 708 """GraphBuilder class is responsible for building task execution graph from 713 taskFactory : `TaskFactory` 714 Factory object used to load/instantiate PipelineTasks 715 registry : `~lsst.daf.butler.Registry` 716 Data butler instance. 717 skipExisting : `bool`, optional 718 If `True` (default), a Quantum is not created if all its outputs 720 clobberExisting : `bool`, optional 721 If `True`, overwrite any outputs that already exist. Cannot be 722 `True` if ``skipExisting`` is. 725 def __init__(self, taskFactory, registry, skipExisting=True, clobberExisting=False):
732 def _loadTaskClass(self, taskDef):
733 """Make sure task class is loaded. 735 Load task class, update task name to make sure it is fully-qualified, 736 do not update original taskDef in a Pipeline though. 744 `TaskDef` instance, may be the same as parameter if task class is 747 if taskDef.taskClass
is None:
748 tClass, tName = self.
taskFactory.loadTaskClass(taskDef.taskName)
749 taskDef = copy.copy(taskDef)
750 taskDef.taskClass = tClass
751 taskDef.taskName = tName
755 """Create execution graph for a pipeline. 759 pipeline : `Pipeline` 760 Pipeline definition, task names/classes and their configs. 761 originInfo : `~lsst.daf.butler.DatasetOriginInfo` 762 Object which provides names of the input/output collections. 764 String which defunes user-defined selection for registry, should be 765 empty or `None` if there is no restrictions on data selection. 769 graph : `QuantumGraph` 774 Raised when user expression cannot be parsed. 776 Raised when output datasets already exist. 778 Other exceptions types may be raised by underlying registry 790 scaffolding.fillDataIds(self.
registry, originInfo, userQuery)
791 scaffolding.fillDatasetRefs(self.
registry, originInfo,
794 scaffolding.fillQuanta(self.
registry, originInfo,
797 return scaffolding.makeQuantumGraph()
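    # Illustrative usage sketch (hypothetical names; the expression syntax is
    # registry-dependent):
    #
    #     builder = GraphBuilder(taskFactory, butler.registry, skipExisting=True)
    #     quantumGraph = builder.makeGraph(pipeline, originInfo,
    #                                      userQuery="visit.visit = 42")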