from __future__ import annotations
23 """Module defining Pipeline class and related methods.
__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes",
           "LabelSpecifier"]
from dataclasses import dataclass
from types import MappingProxyType
from typing import Mapping, Set, Union, Generator, TYPE_CHECKING, Optional

import copy
import os
from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension
from lsst.utils import doImport
from .configOverrides import ConfigOverrides
from .connections import iterConnections
from .pipelineTask import PipelineTask

from . import pipelineIR
from . import pipeTools
if TYPE_CHECKING:  # Imports needed only for type annotations
    from lsst.obs.base.instrument import Instrument
63 """A structure to specify a subset of labels to load
65 This structure may contain a set of labels to be used in subsetting a
66 pipeline, or a beginning and end point. Beginning or end may be empty,
67 in which case the range will be a half open interval. Unlike python
68 iteration bounds, end bounds are *INCLUDED*. Note that range based
69 selection is not well defined for pipelines that are not linear in nature,
70 and correct behavior is not guaranteed, or may vary from run to run.
72 labels: Optional[Set[str]] =
None
73 begin: Optional[str] =
None
74 end: Optional[str] =
None
77 if self.labels
is not None and (self.begin
or self.end):
78 raise ValueError(
"This struct can only be initialized with a labels set or "
79 "a begin (and/or) end specifier")
83 """TaskDef is a collection of information about task needed by Pipeline.
85 The information includes task name, configuration object and optional
86 task class. This class is just a collection of attributes and it exposes
87 all of them so that attributes could potentially be modified in place
88 (e.g. if configuration needs extra overrides).
93 `PipelineTask` class name, currently it is not specified whether this
94 is a fully-qualified name or partial name (e.g. ``module.TaskClass``).
95 Framework should be prepared to handle all cases.
96 config : `lsst.pex.config.Config`
97 Instance of the configuration class corresponding to this task class,
98 usually with all overrides applied. This config will be frozen.
99 taskClass : `type` or ``None``
100 `PipelineTask` class object, can be ``None``. If ``None`` then
101 framework will have to locate and load class.
102 label : `str`, optional
103 Task label, usually a short string unique in a pipeline.
105 def __init__(self, taskName, config, taskClass=None, label=""):
111 self.
connections = config.connections.ConnectionsClass(config=config)
115 """Name of a dataset type for configuration of this task (`str`)
117 return self.
label +
"_config"
121 """Name of a dataset type for metadata of this task, `None` if
122 metadata is not to be saved (`str`)
124 if self.
config.saveMetadata:
125 return self.
label +
"_metadata"
    def __str__(self):
        rep = "TaskDef(" + self.taskName
        if self.label:
            rep += ", label=" + self.label
        rep += ")"
        return rep
    def __eq__(self, other):
        if not isinstance(other, TaskDef):
            return False
        return self.taskClass == other.taskClass and self.label == other.label
149 """A `Pipeline` is a representation of a series of tasks to run, and the
150 configuration for those tasks.
155 A description of that this pipeline does.
158 pipeline_dict = {
"description": description,
"tasks": {}}
163 """Load a pipeline defined in a pipeline yaml file.
168 A path that points to a pipeline defined in yaml format. This
169 filename may also supply additional labels to be used in
170 subsetting the loaded Pipeline. These labels are separated from
171 the path by a colon, and may be specified as a comma separated
172 list, or a range denoted as beginning..end. Beginning or end may
173 be empty, in which case the range will be a half open interval.
174 Unlike python iteration bounds, end bounds are *INCLUDED*. Note
175 that range based selection is not well defined for pipelines that
176 are not linear in nature, and correct behavior is not guaranteed,
177 or may vary from run to run.
182 The pipeline loaded from specified location with appropriate (if
187 This method attempts to prune any contracts that contain labels which
188 are not in the declared subset of labels. This pruning is done using a
189 string based matching due to the nature of contracts and may prune more
194 pipeline: Pipeline = cls.
fromIR(pipelineIR.PipelineIR.from_file(filename))
197 if labelSpecifier
is not None:
198 pipeline = pipeline.subsetFromLabels(labelSpecifier)
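    # A sketch of the accepted file specifiers (paths and labels here are
    # hypothetical):
    #
    #     Pipeline.fromFile("pipeline.yaml")                 # whole pipeline
    #     Pipeline.fromFile("pipeline.yaml:isr,calibrate")   # explicit labels
    #     Pipeline.fromFile("pipeline.yaml:isr..calibrate")  # inclusive range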
202 """Subset a pipeline to contain only labels specified in labelSpecifier
206 labelSpecifier : `labelSpecifier`
207 Object containing labels that describes how to subset a pipeline.
211 pipeline : `Pipeline`
212 A new pipeline object that is a subset of the old pipeline
217 Raised if there is an issue with specified labels
221 This method attempts to prune any contracts that contain labels which
222 are not in the declared subset of labels. This pruning is done using a
223 string based matching due to the nature of contracts and may prune more
227 if labelSpecifier.labels:
228 labelSet = labelSpecifier.labels
239 pipeline = copy.deepcopy(self)
240 pipeline._pipelineIR.contracts = []
241 labels = {taskdef.label:
True for taskdef
in pipeline.toExpandedPipeline()}
244 if labelSpecifier.begin
is not None:
245 if labelSpecifier.begin
not in labels:
246 raise ValueError(f
"Beginning of range subset, {labelSpecifier.begin}, not found in "
247 "pipeline definition")
248 if labelSpecifier.end
is not None:
249 if labelSpecifier.end
not in labels:
250 raise ValueError(f
"End of range subset, {labelSpecifier.end}, not found in pipeline "
255 if labelSpecifier.begin
is not None:
256 if label != labelSpecifier.begin:
259 labelSpecifier.begin =
None
261 if labelSpecifier.end
is not None and label == labelSpecifier.end:
263 return Pipeline.fromIR(self.
_pipelineIR.subset_from_labels(labelSet))
    @staticmethod
    def _parseFileSpecifier(fileSpecifer):
        """Split apart a filename path from label subsets
        """
        split = fileSpecifer.split(':')
        # There is only a filename, return just that
        if len(split) == 1:
            return fileSpecifer, None
        # More than one : was used, which is not allowed
        if len(split) > 2:
            raise ValueError("Only one : is allowed when specifying a pipeline to load")
        else:
            filename, labelSubset = split[0], split[1]
            # labelSubset could be a comma separated list of labels
            if ',' in labelSubset:
                if '..' in labelSubset:
                    raise ValueError("Can only specify a list of labels or a range "
                                     "when loading a Pipeline, not both")
                labels = set(labelSubset.split(","))
                specifier = LabelSpecifier(labels=labels)
            # labelSubset could be a range
            elif '..' in labelSubset:
                # Try to de-structure the labelSubset; this fails if more
                # than one range is specified
                try:
                    begin, end = labelSubset.split("..")
                except ValueError:
                    raise ValueError("Only one range can be specified when loading a pipeline")
                specifier = LabelSpecifier(begin=begin if begin else None, end=end if end else None)
            # Anything else is treated as a single label
            else:
                labels = {labelSubset}
                specifier = LabelSpecifier(labels=labels)

            return filename, specifier
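    # A sketch of how specifiers parse (the inputs are hypothetical):
    #
    #     _parseFileSpecifier("p.yaml")       -> ("p.yaml", None)
    #     _parseFileSpecifier("p.yaml:a,b")   -> ("p.yaml", LabelSpecifier(labels={"a", "b"}))
    #     _parseFileSpecifier("p.yaml:a..c")  -> ("p.yaml", LabelSpecifier(begin="a", end="c"))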
305 """Create a pipeline from string formatted as a pipeline document.
309 pipeline_string : `str`
310 A string that is formatted according like a pipeline document
316 pipeline = cls.
fromIR(pipelineIR.PipelineIR.from_string(pipeline_string))
    @classmethod
    def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline:
        """Create a pipeline from an already created `PipelineIR` object.

        Parameters
        ----------
        deserialized_pipeline: `PipelineIR`
            An already created pipeline intermediate representation object

        Returns
        -------
        pipeline: `Pipeline`
        """
        pipeline = cls.__new__(cls)
        pipeline._pipelineIR = deserialized_pipeline
        return pipeline
338 """Create a new pipeline by copying an already existing `Pipeline`.
343 An already created pipeline intermediate representation object
349 return cls.
fromIR(copy.deep_copy(pipeline._pipelineIR))
355 """Add an instrument to the pipeline, or replace an instrument that is
360 instrument : `~lsst.daf.butler.instrument.Instrument` or `str`
361 Either a derived class object of a `lsst.daf.butler.instrument` or
362 a string corresponding to a fully qualified
363 `lsst.daf.butler.instrument` name.
365 if isinstance(instrument, str):
370 instrument = f
"{instrument.__module__}.{instrument.__qualname__}"
374 """Get the instrument from the pipeline.
378 instrument : `~lsst.daf.butler.instrument.Instrument`, `str`, or None
379 A derived class object of a `lsst.daf.butler.instrument`, a string
380 corresponding to a fully qualified `lsst.daf.butler.instrument`
381 name, or None if the pipeline does not have an instrument.
    def addTask(self, task: Union[PipelineTask, str], label: str):
        """Add a new task to the pipeline, or replace a task that is already
        associated with the supplied label.

        Parameters
        ----------
        task: `PipelineTask` or `str`
            Either a derived class object of a `PipelineTask` or a string
            corresponding to a fully qualified `PipelineTask` name.
        label: `str`
            A label that is used to identify the `PipelineTask` being added
        """
        if isinstance(task, str):
            taskName = task
        elif issubclass(task, PipelineTask):
            taskName = f"{task.__module__}.{task.__qualname__}"
        else:
            raise ValueError("task must be either a child class of PipelineTask or a string containing"
                             " a fully qualified name to one")
        if not label:
            # In some cases (with command line-generated pipelines) tasks can
            # be defined without a label, which is not acceptable; use the
            # task _DefaultName in that case
            if isinstance(task, str):
                task = doImport(task)
            label = task._DefaultName
        self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName)
414 """Remove a task from the pipeline.
419 The label used to identify the task that is to be removed
424 If no task with that label exists in the pipeline
430 """Apply single config override.
437 Fully-qualified field name.
439 Value to be given to a field.
444 """Add overrides from a specified file.
449 The label used to identify the task associated with config to
452 Path to the override file.
457 """Add Overrides by running a snippet of python code against a config.
462 The label used to identity the task associated with config to
465 A string which is valid python code to be executed. This is done
466 with config as the only local accessible value.
    def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR):
        if label == "parameters":
            if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys():
                raise ValueError("Cannot override parameters that are not defined in pipeline")
            self._pipelineIR.parameters.mapping.update(newConfig.rest)
            if newConfig.file:
                raise ValueError("Setting parameters section with config file is not supported")
            if newConfig.python:
                raise ValueError("Setting parameters section using python block is unsupported")
            return
        if label not in self._pipelineIR.tasks:
            raise LookupError(f"There are no tasks labeled '{label}' in the pipeline")
        self._pipelineIR.tasks[label].add_or_update_config(newConfig)
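    # A sketch of how the override helpers funnel through _addConfigImpl
    # (the labels and field names here are hypothetical):
    #
    #     pipeline.addConfigOverride("isr", "doDark", False)      # one field
    #     pipeline.addConfigFile("isr", "isrOverrides.py")        # from file
    #     pipeline.addConfigOverride("parameters", "band", "g")   # parameter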
488 """Returns a generator of TaskDefs which can be used to create quantum
493 generator : generator of `TaskDef`
494 The generator returned will be the sorted iterator of tasks which
495 are to be used in constructing a quantum graph.
500 If a dataId is supplied in a config block. This is in place for
504 for label, taskIR
in self.
_pipelineIR.tasks.items():
505 taskClass = doImport(taskIR.klass)
506 taskName = taskClass.__qualname__
507 config = taskClass.ConfigClass()
510 overrides.addInstrumentOverride(self.
_pipelineIR.instrument, taskClass._DefaultName)
511 if taskIR.config
is not None:
512 for configIR
in (configIr.formatted(self.
_pipelineIR.parameters)
513 for configIr
in taskIR.config):
514 if configIR.dataId
is not None:
515 raise NotImplementedError(
"Specializing a config on a partial data id is not yet "
516 "supported in Pipeline definition")
518 if configIR.dataId
is None:
520 for configFile
in configIR.file:
521 overrides.addFileOverride(os.path.expandvars(configFile))
522 if configIR.python
is not None:
523 overrides.addPythonOverride(configIR.python)
524 for key, value
in configIR.rest.items():
525 overrides.addValueOverride(key, value)
526 overrides.applyTo(config)
529 taskDefs.append(
TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label))
533 label_to_config = {x.label: x.config
for x
in taskDefs}
537 success = eval(contract.contract,
None, label_to_config)
539 extra_info = f
": {contract.msg}" if contract.msg
is not None else ""
541 f
"satisfied{extra_info}")
543 yield from pipeTools.orderPipeline(taskDefs)
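    # A sketch of contract evaluation (labels and field names hypothetical):
    # a contract string such as
    #
    #     isr.doDark == calibrate.doApplyDarkCorrection
    #
    # is handed to eval() with each task label bound to its config, so the
    # expression may compare any config fields across tasks.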
    def __eq__(self, other):
        if not isinstance(other, Pipeline):
            return False
        return self._pipelineIR == other._pipelineIR
@dataclass(frozen=True)
class TaskDatasetTypes:
    """An immutable struct that extracts and classifies the dataset types used
    by a `PipelineTask`.
    """

    initInputs: NamedValueSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct this Task.

    Task-level `initInputs` may be classified as either
    `~PipelineDatasetTypes.initInputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    initOutputs: NamedValueSet[DatasetType]
    """Dataset types that may be written after constructing this Task.

    Task-level `initOutputs` may be classified as either
    `~PipelineDatasetTypes.initOutputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    inputs: NamedValueSet[DatasetType]
    """Dataset types that are regular inputs to this Task.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s) or produced by another Task in the Pipeline, that Quantum
    (and all dependent Quanta) will not be produced.

    Task-level `inputs` may be classified as either
    `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """

    prerequisites: NamedValueSet[DatasetType]
    """Dataset types that are prerequisite inputs to this Task.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph - if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    outputs: NamedValueSet[DatasetType]
    """Dataset types that are produced by this Task.

    Task-level `outputs` may be classified as either
    `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """
    @classmethod
    def fromTaskDef(cls, taskDef: TaskDef, *, registry: Registry) -> TaskDatasetTypes:
        """Extract and classify the dataset types from a single `PipelineTask`.

        Parameters
        ----------
        taskDef: `TaskDef`
            An instance of a `TaskDef` class for a particular `PipelineTask`.
        registry: `Registry`
            Registry used to construct normalized `DatasetType` objects and
            retrieve those that are incomplete.

        Returns
        -------
        types: `TaskDatasetTypes`
            The dataset types used by this task.
        """
        def makeDatasetTypesSet(connectionType, freeze=True):
            """Constructs a set of true `DatasetType` objects

            Parameters
            ----------
            connectionType : `str`
                Name of the connection type to produce a set for, corresponds
                to an attribute of type `list` on the connection class instance
            freeze : `bool`, optional
                If `True`, call `NamedValueSet.freeze` on the object returned.

            Returns
            -------
            datasetTypes : `NamedValueSet`
                A set of all datasetTypes which correspond to the input
                connection type specified in the connection class of this
                task

            Notes
            -----
            This function is a closure over the variables ``registry`` and
            ``taskDef``.
            """
            datasetTypes = NamedValueSet()
            for c in iterConnections(taskDef.connections, connectionType):
                dimensions = set(getattr(c, 'dimensions', set()))
                if "skypix" in dimensions:
                    try:
                        datasetType = registry.getDatasetType(c.name)
                    except LookupError as err:
                        raise LookupError(
                            f"DatasetType '{c.name}' referenced by "
                            f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension "
                            f"placeholder, but does not already exist in the registry. "
                            f"Note that reference catalog names are now used as the dataset "
                            f"type name instead of 'ref_cat'."
                        ) from err
                    rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names)
                    rest2 = set(dim.name for dim in datasetType.dimensions
                                if not isinstance(dim, SkyPixDimension))
                    if rest1 != rest2:
                        raise ValueError(f"Non-skypix dimensions for dataset type {c.name} declared in "
                                         f"connections ({rest1}) are inconsistent with those in "
                                         f"registry's version of this dataset ({rest2}).")
                else:
                    # Component dataset types are not explicitly in the
                    # registry, so check for the parent instead and work out
                    # the composite storage class from it.
                    registryDatasetType = None
                    try:
                        registryDatasetType = registry.getDatasetType(c.name)
                    except KeyError:
                        compositeName, componentName = DatasetType.splitDatasetTypeName(c.name)
                        parentStorageClass = DatasetType.PlaceholderParentStorageClass \
                            if componentName else None
                        datasetType = c.makeDatasetType(
                            registry.dimensions,
                            parentStorageClass=parentStorageClass
                        )
                        registryDatasetType = datasetType
                    else:
                        datasetType = c.makeDatasetType(
                            registry.dimensions,
                            parentStorageClass=registryDatasetType.parentStorageClass
                        )

                    if registryDatasetType and datasetType != registryDatasetType:
                        raise ValueError(f"Supplied dataset type ({datasetType}) inconsistent with "
                                         f"registry definition ({registryDatasetType}) "
                                         f"for {taskDef.label}.")
                datasetTypes.add(datasetType)
            if freeze:
                datasetTypes.freeze()
            return datasetTypes
        # optionally add the output dataset type for task metadata
        outputs = makeDatasetTypesSet("outputs", freeze=False)
        if taskDef.metadataDatasetName is not None:
            # Metadata is supposed to be of the PropertySet type, and its
            # dimensions correspond to a task quantum
            dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
            outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions, "PropertySet")}
        outputs.freeze()

        return cls(
            initInputs=makeDatasetTypesSet("initInputs"),
            initOutputs=makeDatasetTypesSet("initOutputs"),
            inputs=makeDatasetTypesSet("inputs"),
            prerequisites=makeDatasetTypesSet("prerequisiteInputs"),
            outputs=outputs,
        )
@dataclass(frozen=True)
class PipelineDatasetTypes:
    """An immutable struct that classifies the dataset types used in a
    `Pipeline`.
    """

    initInputs: NamedValueSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct the Tasks
    in this Pipeline.

    This does not include dataset types that are produced when constructing
    other Tasks in the Pipeline (these are classified as `initIntermediates`).
    """

    initOutputs: NamedValueSet[DatasetType]
    """Dataset types that may be written after constructing the Tasks in this
    Pipeline.

    This does not include dataset types that are also used as inputs when
    constructing other Tasks in the Pipeline (these are classified as
    `initIntermediates`).
    """

    initIntermediates: NamedValueSet[DatasetType]
    """Dataset types that are both used when constructing one or more Tasks
    in the Pipeline and produced as a side-effect of constructing another
    Task in the Pipeline.
    """

    inputs: NamedValueSet[DatasetType]
    """Dataset types that are regular inputs for the full pipeline.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s), that Quantum (and all dependent Quanta) will not be
    produced.
    """

    prerequisites: NamedValueSet[DatasetType]
    """Dataset types that are prerequisite inputs for the full Pipeline.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph - if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    intermediates: NamedValueSet[DatasetType]
    """Dataset types that are output by one Task in the Pipeline and consumed
    as inputs by one or more other Tasks in the Pipeline.
    """

    outputs: NamedValueSet[DatasetType]
    """Dataset types that are output by a Task in the Pipeline and not consumed
    by any other Task in the Pipeline.
    """

    byTask: Mapping[str, TaskDatasetTypes]
    """Per-Task dataset types, keyed by label in the `Pipeline`.

    This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming
    neither has been modified since the dataset types were extracted, of
    course).
    """
    @classmethod
    def fromPipeline(cls, pipeline, *, registry: Registry) -> PipelineDatasetTypes:
        """Extract and classify the dataset types from all tasks in a
        `Pipeline`.

        Parameters
        ----------
        pipeline: `Pipeline`
            An ordered collection of tasks that can be run together.
        registry: `Registry`
            Registry used to construct normalized `DatasetType` objects and
            retrieve those that are incomplete.

        Returns
        -------
        types: `PipelineDatasetTypes`
            The dataset types used by this `Pipeline`.

        Raises
        ------
        ValueError
            Raised if Tasks are inconsistent about which datasets are marked
            prerequisite. This indicates that the Tasks cannot be run as part
            of the same `Pipeline`.
        """
        allInputs = NamedValueSet()
        allOutputs = NamedValueSet()
        allInitInputs = NamedValueSet()
        allInitOutputs = NamedValueSet()
        prerequisites = NamedValueSet()
        byTask = dict()
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        for taskDef in pipeline:
            thisTask = TaskDatasetTypes.fromTaskDef(taskDef, registry=registry)
            allInitInputs |= thisTask.initInputs
            allInitOutputs |= thisTask.initOutputs
            allInputs |= thisTask.inputs
            prerequisites |= thisTask.prerequisites
            allOutputs |= thisTask.outputs
            byTask[taskDef.label] = thisTask
        if not prerequisites.isdisjoint(allInputs):
            raise ValueError("{} marked as both prerequisites and regular inputs".format(
                {dt.name for dt in allInputs & prerequisites}
            ))
        if not prerequisites.isdisjoint(allOutputs):
            raise ValueError("{} marked as both prerequisites and outputs".format(
                {dt.name for dt in allOutputs & prerequisites}
            ))
        # Make sure that components of dataset types in inputs are treated as
        # intermediates if their composite is produced by the pipeline.
        intermediateComponents = NamedValueSet()
        intermediateComposites = NamedValueSet()
        outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
        for dsType in allInputs:
            # get the name of a possible component
            name, component = dsType.nameAndComponent()
            # if there is a component name, this is a component DatasetType;
            # if an output produces the parent of this component, treat this
            # input as an intermediate
            if component is not None:
                if name in outputNameMapping:
                    if outputNameMapping[name].dimensions != dsType.dimensions:
                        raise ValueError(f"Component dataset type {dsType.name} has different "
                                         f"dimensions ({dsType.dimensions}) than its parent "
                                         f"({outputNameMapping[name].dimensions}).")
                    composite = DatasetType(name, dsType.dimensions, outputNameMapping[name].storageClass,
                                            universe=registry.dimensions)
                    intermediateComponents.add(dsType)
                    intermediateComposites.add(composite)
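        # A sketch of the bookkeeping above (dataset type names hypothetical):
        # if one task outputs "calexp" and another consumes the component
        # "calexp.wcs", the component becomes an intermediate and a composite
        # "calexp" entry is synthesized so the parent/child link is tracked.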
        def checkConsistency(a: NamedValueSet, b: NamedValueSet):
            common = a.names & b.names
            for name in common:
                if a[name] != b[name]:
                    raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.")

        checkConsistency(allInitInputs, allInitOutputs)
        checkConsistency(allInputs, allOutputs)
        checkConsistency(allInputs, intermediateComposites)
        checkConsistency(allOutputs, intermediateComposites)
        def frozen(s: NamedValueSet) -> NamedValueSet:
            s.freeze()
            return s

        return cls(
            initInputs=frozen(allInitInputs - allInitOutputs),
            initIntermediates=frozen(allInitInputs & allInitOutputs),
            initOutputs=frozen(allInitOutputs - allInitInputs),
            inputs=frozen(allInputs - allOutputs - intermediateComponents),
            intermediates=frozen(allInputs & allOutputs | intermediateComponents),
            outputs=frozen(allOutputs - allInputs - intermediateComposites),
            prerequisites=frozen(prerequisites),
            byTask=MappingProxyType(byTask),  # guaranteed to be immutable
        )
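    # A sketch of the classification algebra above with hypothetical dataset
    # types: if task A writes {"calexp"} and task B reads {"raw", "calexp"}
    # and writes {"src"}, then
    #
    #     inputs        = {"raw"}      # consumed but never produced
    #     intermediates = {"calexp"}   # produced and consumed in-pipeline
    #     outputs       = {"src"}      # produced but never consumed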