from __future__ import annotations
"""Module defining Pipeline class and related methods.
"""
__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"]
import copy
import os
import re

from dataclasses import dataclass
from types import MappingProxyType
from typing import Mapping, Set, Union, Generator, TYPE_CHECKING, Optional

from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension
from lsst.utils import doImport

from .configOverrides import ConfigOverrides
from .connections import iterConnections
from .pipelineTask import PipelineTask

from . import pipelineIR
from . import pipeTools

if TYPE_CHECKING:
    # Imported only for type annotations, to avoid a runtime dependency cycle.
    from lsst.obs.base.instrument import Instrument
@dataclass
class LabelSpecifier:
    """A structure to specify a subset of labels to load

    This structure may contain a set of labels to be used in subsetting a
    pipeline, or a beginning and end point. Beginning or end may be empty,
    in which case the range will be a half open interval. Unlike Python
    iteration bounds, end bounds are *INCLUDED*. Note that range-based
    selection is not well defined for pipelines that are not linear in nature,
    and correct behavior is not guaranteed, or may vary from run to run.
    """
    labels: Optional[Set[str]] = None
    begin: Optional[str] = None
    end: Optional[str] = None

    def __post_init__(self):
        if self.labels is not None and (self.begin or self.end):
            raise ValueError("This struct can only be initialized with a labels set or "
                             "a begin (and/or) end specifier")
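
# A minimal usage sketch (not part of the original module): a LabelSpecifier
# holds either an explicit set of labels or an inclusive begin/end range, but
# never both.  The task labels below are hypothetical.
#
#     by_set = LabelSpecifier(labels={"isr", "characterizeImage"})
#     by_range = LabelSpecifier(begin="isr", end="calibrate")
#     LabelSpecifier(labels={"isr"}, begin="isr")   # raises ValueError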
class TaskDef:
    """TaskDef is a collection of information about a task needed by Pipeline.

    The information includes task name, configuration object and optional
    task class. This class is just a collection of attributes and it exposes
    all of them so that attributes could potentially be modified in place
    (e.g. if configuration needs extra overrides).

    Attributes
    ----------
    taskName : `str`
        `PipelineTask` class name, currently it is not specified whether this
        is a fully-qualified name or partial name (e.g. ``module.TaskClass``).
        Framework should be prepared to handle all cases.
    config : `lsst.pex.config.Config`
        Instance of the configuration class corresponding to this task class,
        usually with all overrides applied. This config will be frozen.
    taskClass : `type` or ``None``
        `PipelineTask` class object, can be ``None``. If ``None`` then
        framework will have to locate and load class.
    label : `str`, optional
        Task label, usually a short string unique in a pipeline.
    """
    def __init__(self, taskName, config, taskClass=None, label=""):
        self.taskName = taskName
        self.config = config
        self.taskClass = taskClass
        self.label = label
        self.connections = config.connections.ConnectionsClass(config=config)
    @property
    def configDatasetName(self):
        """Name of a dataset type for configuration of this task (`str`)
        """
        return self.label + "_config"
    @property
    def metadataDatasetName(self):
        """Name of a dataset type for metadata of this task, `None` if
        metadata is not to be saved (`str`)
        """
        if self.config.saveMetadata:
            return self.label + "_metadata"
        return None
        rep += ", label=" + self.label
    def __eq__(self, other):
        if not isinstance(other, TaskDef):
            return False
        return self.config == other.config and \
            self.label == other.label
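
# Illustrative sketch (hypothetical label, relying on the properties defined
# above): the per-task dataset type names are derived from the task label, so
# for a TaskDef labeled "isr"
#
#     taskDef.configDatasetName    == "isr_config"
#     taskDef.metadataDatasetName  == "isr_metadata"   # or None when
#                                                      # config.saveMetadata is False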
class Pipeline:
    """A `Pipeline` is a representation of a series of tasks to run, and the
    configuration for those tasks.

    Parameters
    ----------
    description : `str`
        A description of what this pipeline does.
    """
    def __init__(self, description: str):
        pipeline_dict = {"description": description, "tasks": {}}
        self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict)
    @classmethod
    def fromFile(cls, filename: str) -> Pipeline:
        """Load a pipeline defined in a pipeline yaml file.

        Parameters
        ----------
        filename : `str`
            A path that points to a pipeline defined in yaml format. This
            filename may also supply additional labels to be used in
            subsetting the loaded Pipeline. These labels are separated from
            the path by a colon, and may be specified as a comma separated
            list, or a range denoted as beginning..end. Beginning or end may
            be empty, in which case the range will be a half open interval.
            Unlike Python iteration bounds, end bounds are *INCLUDED*. Note
            that range-based selection is not well defined for pipelines that
            are not linear in nature, and correct behavior is not guaranteed,
            or may vary from run to run.

        Returns
        -------
        pipeline : `Pipeline`
            The pipeline loaded from specified location with appropriate (if
            any) label subsetting applied.

        Notes
        -----
        This method attempts to prune any contracts that contain labels which
        are not in the declared subset of labels. This pruning is done using a
        string based matching due to the nature of contracts and may prune more
        than it should.
        """
        # Split the file path from any label subset appended after a colon.
        filename, labelSpecifier = cls._parseFileSpecifier(filename)
        pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_file(filename))

        # If labels were supplied, only keep the tasks they select.
        if labelSpecifier is not None:
            pipeline = pipeline.subsetFromLabels(labelSpecifier)
        return pipeline
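    # Illustrative sketch of the filename:label syntax accepted by fromFile
    # (the file path and task labels are hypothetical):
    #
    #     Pipeline.fromFile("pipeline.yaml")                  # whole pipeline
    #     Pipeline.fromFile("pipeline.yaml:isr,calibrate")    # explicit label set
    #     Pipeline.fromFile("pipeline.yaml:isr..calibrate")   # inclusive range
    #     Pipeline.fromFile("pipeline.yaml:..calibrate")      # open-ended range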
    def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline:
        """Subset a pipeline to contain only labels specified in labelSpecifier

        Parameters
        ----------
        labelSpecifier : `LabelSpecifier`
            Object containing labels that describes how to subset a pipeline.

        Returns
        -------
        pipeline : `Pipeline`
            A new pipeline object that is a subset of the old pipeline

        Raises
        ------
        ValueError
            Raised if there is an issue with specified labels

        Notes
        -----
        This method attempts to prune any contracts that contain labels which
        are not in the declared subset of labels. This pruning is done using a
        string based matching due to the nature of contracts and may prune more
        than it should.
        """
        pipeline = copy.deepcopy(self)

        def remove_contracts(label: str):
            """Remove any contracts that contain the given label

            String comparison used in this way is not the most elegant and may
            have issues, but it is the only feasible way when users can specify
            contracts with generic strings.
            """
            new_contracts = []
            for contract in pipeline._pipelineIR.contracts:
                # Match the label when it is not preceded by an identifier
                # character (or is at the start of the string) and is followed
                # by a dot (attribute access).
                if re.match(f".*([^A-Za-z0-9_]|^){label}[.]", contract.contract):
                    continue
                new_contracts.append(contract)
            pipeline._pipelineIR.contracts = new_contracts

        # Labels supplied as an explicit set
        if labelSpecifier.labels:
            # Verify that all the supplied labels exist in the pipeline.
            if not labelSpecifier.labels.issubset(pipeline._pipelineIR.tasks.keys()):
                difference = labelSpecifier.labels.difference(pipeline._pipelineIR.tasks.keys())
                raise ValueError("Not all supplied labels are in the pipeline definition, extra labels:"
                                 f" {difference}")
            # Copy the keys so the task mapping is not modified while iterating.
            pipeline_labels = list(pipeline._pipelineIR.tasks.keys())
            for label in pipeline_labels:
                if label not in labelSpecifier.labels:
                    pipeline.removeTask(label)
                    remove_contracts(label)
        # Labels supplied as a range
        else:
            # Temporarily remove contracts so they cannot fail while the
            # pipeline is expanded to determine the ordered set of labels.
            contractSave = pipeline._pipelineIR.contracts
            pipeline._pipelineIR.contracts = []
            labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()}
            pipeline._pipelineIR.contracts = contractSave

            # Verify that the range bounds are labels in the pipeline.
            if labelSpecifier.begin is not None:
                if labelSpecifier.begin not in labels:
                    raise ValueError(f"Beginning of range subset, {labelSpecifier.begin}, not found in "
                                     "pipeline definition")
            if labelSpecifier.end is not None:
                if labelSpecifier.end not in labels:
                    raise ValueError(f"End of range subset, {labelSpecifier.end}, not found in pipeline "
                                     "definition")
            # Remove tasks before the (inclusive) beginning of the range.
            for label in labels:
                if labelSpecifier.begin:
                    if label != labelSpecifier.begin:
                        pipeline.removeTask(label)
                        remove_contracts(label)
                        continue
                    labelSpecifier.begin = None
            # Remove tasks after the (inclusive) end of the range, walking the
            # ordered labels in reverse.
            for label in reversed(list(labels)):
                if labelSpecifier.end:
                    if label != labelSpecifier.end:
                        pipeline.removeTask(label)
                        remove_contracts(label)
                        continue
                    labelSpecifier.end = None

        return pipeline
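    # Illustrative sketch (hypothetical labels): subsetting an existing
    # pipeline object directly, rather than through a file specifier.
    #
    #     subset = pipeline.subsetFromLabels(LabelSpecifier(labels={"isr"}))
    #     tail = pipeline.subsetFromLabels(LabelSpecifier(begin="calibrate"))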
    @staticmethod
    def _parseFileSpecifier(fileSpecifier):
        """Split apart a filename path from label subsets
        """
        split = fileSpecifier.split(':')
        # No label subset was supplied.
        if len(split) == 1:
            return fileSpecifier, None
        # More than one colon is not allowed.
        if len(split) > 2:
            raise ValueError("Only one : is allowed when specifying a pipeline to load")
        filename, labelSubset = split[0], split[1]

        # Labels supplied as a comma separated list
        if ',' in labelSubset:
            if '..' in labelSubset:
                raise ValueError("Can only specify a list of labels or a range "
                                 "when loading a Pipeline, not both")
            labels = set(labelSubset.split(","))
            specifier = LabelSpecifier(labels=labels)
        # Labels supplied as a range
        elif '..' in labelSubset:
            try:
                begin, end = labelSubset.split("..")
            except ValueError:
                raise ValueError("Only one range can be specified when loading a pipeline")
            specifier = LabelSpecifier(begin=begin if begin else None, end=end if end else None)
        # A single label
        else:
            labels = {labelSubset}
            specifier = LabelSpecifier(labels=labels)

        return filename, specifier
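    # Illustrative sketch of what _parseFileSpecifier returns (paths and
    # labels are hypothetical):
    #
    #     _parseFileSpecifier("p.yaml")               -> ("p.yaml", None)
    #     _parseFileSpecifier("p.yaml:isr,calibrate") -> ("p.yaml", LabelSpecifier(labels={"isr", "calibrate"}))
    #     _parseFileSpecifier("p.yaml:isr..")         -> ("p.yaml", LabelSpecifier(begin="isr", end=None))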
    @classmethod
    def fromString(cls, pipeline_string: str) -> Pipeline:
        """Create a pipeline from a string formatted as a pipeline document.

        Parameters
        ----------
        pipeline_string : `str`
            A string that is formatted like a pipeline document.
        """
        pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string))
        return pipeline
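    # Illustrative sketch (the task class path is hypothetical): a pipeline
    # document is YAML with a description and a mapping of labeled tasks.
    #
    #     pipeline = Pipeline.fromString(
    #         "description: example\n"
    #         "tasks:\n"
    #         "  isr:\n"
    #         "    class: lsst.ip.isr.IsrTask\n"
    #     )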
    @classmethod
    def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline:
        """Create a pipeline from an already created `PipelineIR` object.

        Parameters
        ----------
        deserialized_pipeline : `PipelineIR`
            An already created pipeline intermediate representation object
        """
        pipeline = cls.__new__(cls)
        pipeline._pipelineIR = deserialized_pipeline
        return pipeline
    @classmethod
    def fromPipeline(cls, pipeline: Pipeline) -> Pipeline:
        """Create a new pipeline by copying an already existing `Pipeline`.

        Parameters
        ----------
        pipeline : `Pipeline`
            An already created pipeline object to be copied
        """
        return cls.fromIR(copy.deepcopy(pipeline._pipelineIR))
    def addInstrument(self, instrument: Union[Instrument, str]):
        """Add an instrument to the pipeline, or replace an instrument that is
        already defined.

        Parameters
        ----------
        instrument : `~lsst.daf.butler.instrument.Instrument` or `str`
            Either a derived class object of a `lsst.daf.butler.instrument` or
            a string corresponding to a fully qualified
            `lsst.daf.butler.instrument` name.
        """
        if isinstance(instrument, str):
            pass
        else:
            # A class object was supplied; store its fully qualified name.
            instrument = f"{instrument.__module__}.{instrument.__qualname__}"
        self._pipelineIR.instrument = instrument
    def getInstrument(self):
        """Get the instrument from the pipeline.

        Returns
        -------
        instrument : `~lsst.daf.butler.instrument.Instrument`, `str`, or None
            A derived class object of a `lsst.daf.butler.instrument`, a string
            corresponding to a fully qualified `lsst.daf.butler.instrument`
            name, or None if the pipeline does not have an instrument.
        """
        return self._pipelineIR.instrument
    def addTask(self, task: Union[PipelineTask, str], label: str):
        """Add a new task to the pipeline, or replace a task that is already
        associated with the supplied label.

        Parameters
        ----------
        task : `PipelineTask` or `str`
            Either a derived class object of a `PipelineTask` or a string
            corresponding to a fully qualified `PipelineTask` name.
        label : `str`
            A label that is used to identify the `PipelineTask` being added
        """
        if isinstance(task, str):
            taskName = task
        elif issubclass(task, PipelineTask):
            taskName = f"{task.__module__}.{task.__qualname__}"
        else:
            raise ValueError("task must be either a child class of PipelineTask or a string containing"
                             " a fully qualified name to one")
        if not label:
            # If no label was supplied, fall back to the task's _DefaultName.
            if isinstance(task, str):
                task = doImport(task)
            label = task._DefaultName
        self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName)
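    # Illustrative sketch (the task class path and label are hypothetical):
    # tasks may be added either by fully qualified name or by class object.
    #
    #     pipeline = Pipeline("example pipeline")
    #     pipeline.addTask("lsst.ip.isr.IsrTask", "isr")
    #     pipeline.addTask(IsrTask, "isr")   # equivalent, given the class object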
    def removeTask(self, label: str):
        """Remove a task from the pipeline.

        Parameters
        ----------
        label : `str`
            The label used to identify the task that is to be removed

        Raises
        ------
        KeyError
            If no task with that label exists in the pipeline
        """
        self._pipelineIR.tasks.pop(label)
    def addConfigOverride(self, label: str, key: str, value):
        """Apply single config override.

        Parameters
        ----------
        label : `str`
            Label of the task.
        key : `str`
            Fully-qualified field name.
        value : object
            Value to be given to a field.
        """
        self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value}))
    def addConfigFile(self, label: str, filename: str):
        """Add overrides from a specified file.

        Parameters
        ----------
        label : `str`
            The label used to identify the task associated with config to
            modify.
        filename : `str`
            Path to the override file.
        """
        self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename]))
    def addConfigPython(self, label: str, pythonString: str):
        """Add Overrides by running a snippet of python code against a config.

        Parameters
        ----------
        label : `str`
            The label used to identify the task associated with config to
            modify.
        pythonString : `str`
            A string which is valid python code to be executed. This is done
            with config as the only local accessible value.
        """
        self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString))
    def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR):
        if label not in self._pipelineIR.tasks:
            raise LookupError(f"There are no tasks labeled '{label}' in the pipeline")
        self._pipelineIR.tasks[label].add_or_update_config(newConfig)
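    # Illustrative sketch (labels, fields, and file paths are hypothetical):
    # the three public override methods all funnel into _addConfigImpl.
    #
    #     pipeline.addConfigOverride("isr", "doDark", False)
    #     pipeline.addConfigFile("isr", "/path/to/overrides.py")
    #     pipeline.addConfigPython("isr", "config.doDark = False")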
    def toExpandedPipeline(self) -> Generator[TaskDef, None, None]:
        """Returns a generator of TaskDefs which can be used to create quantum
        graphs.

        Returns
        -------
        generator : generator of `TaskDef`
            The generator returned will be the sorted iterator of tasks which
            are to be used in constructing a quantum graph.

        Raises
        ------
        NotImplementedError
            If a dataId is supplied in a config block. This is in place for
            future use.
        """
        taskDefs = []
        for label, taskIR in self._pipelineIR.tasks.items():
            taskClass = doImport(taskIR.klass)
            taskName = taskClass.__qualname__
            config = taskClass.ConfigClass()
            overrides = ConfigOverrides()
            if self._pipelineIR.instrument is not None:
                overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName)
            if taskIR.config is not None:
                for configIR in taskIR.config:
                    if configIR.dataId is not None:
                        raise NotImplementedError("Specializing a config on a partial data id is not yet "
                                                  "supported in Pipeline definition")
                    # Only apply overrides that apply to the whole pipeline.
                    if configIR.dataId is None:
                        if configIR.file:
                            for configFile in configIR.file:
                                overrides.addFileOverride(os.path.expandvars(configFile))
                        if configIR.python is not None:
                            overrides.addPythonOverride(configIR.python)
                        for key, value in configIR.rest.items():
                            overrides.addValueOverride(key, value)
            overrides.applyTo(config)
            taskDefs.append(TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label))

        # Evaluate any contracts against the assembled configs.
        if self._pipelineIR.contracts is not None:
            label_to_config = {x.label: x.config for x in taskDefs}
            for contract in self._pipelineIR.contracts:
                # Evaluate this in its own statement so a failure produces a
                # useful error message.
                success = eval(contract.contract, None, label_to_config)
                if not success:
                    extra_info = f": {contract.msg}" if contract.msg is not None else ""
                    raise pipelineIR.ContractError(f"Contract(s) '{contract.contract}' were not "
                                                   f"satisfied{extra_info}")

        yield from pipeTools.orderPipeline(taskDefs)
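    # Illustrative sketch: iterating the expanded pipeline yields fully
    # configured TaskDefs in a valid execution order.
    #
    #     for taskDef in pipeline.toExpandedPipeline():
    #         print(taskDef.label, taskDef.taskName)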
    def __eq__(self, other):
        if not isinstance(other, Pipeline):
            return False
        # Compare the underlying intermediate representations.
        return self._pipelineIR == other._pipelineIR
@dataclass(frozen=True)
class TaskDatasetTypes:
    """An immutable struct that extracts and classifies the dataset types used
    by a `PipelineTask`.
    """
    initInputs: NamedValueSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct this Task.

    Task-level `initInputs` may be classified as either
    `~PipelineDatasetTypes.initInputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    initOutputs: NamedValueSet[DatasetType]
    """Dataset types that may be written after constructing this Task.

    Task-level `initOutputs` may be classified as either
    `~PipelineDatasetTypes.initOutputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    inputs: NamedValueSet[DatasetType]
    """Dataset types that are regular inputs to this Task.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s) or produced by another Task in the Pipeline, that Quantum
    (and all dependent Quanta) will not be produced.

    Task-level `inputs` may be classified as either
    `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """

    prerequisites: NamedValueSet[DatasetType]
    """Dataset types that are prerequisite inputs to this Task.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph - if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    outputs: NamedValueSet[DatasetType]
    """Dataset types that are produced by this Task.

    Task-level `outputs` may be classified as either
    `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """
    @classmethod
    def fromTaskDef(cls, taskDef: TaskDef, *, registry: Registry) -> TaskDatasetTypes:
        """Extract and classify the dataset types from a single `PipelineTask`.

        Parameters
        ----------
        taskDef : `TaskDef`
            An instance of a `TaskDef` class for a particular `PipelineTask`.
        registry : `Registry`
            Registry used to construct normalized `DatasetType` objects and
            retrieve those that are incomplete.

        Returns
        -------
        types : `TaskDatasetTypes`
            The dataset types used by this task.
        """
        def makeDatasetTypesSet(connectionType, freeze=True):
            """Constructs a set of true `DatasetType` objects

            Parameters
            ----------
            connectionType : `str`
                Name of the connection type to produce a set for, corresponds
                to an attribute of type `list` on the connection class instance
            freeze : `bool`, optional
                If `True`, call `NamedValueSet.freeze` on the object returned.

            Returns
            -------
            datasetTypes : `NamedValueSet`
                A set of all datasetTypes which correspond to the input
                connection type specified in the connection class of this
                task.

            Notes
            -----
            This function is a closure over the variables ``registry`` and
            ``taskDef``.
            """
            datasetTypes = NamedValueSet()
            for c in iterConnections(taskDef.connections, connectionType):
                dimensions = set(getattr(c, 'dimensions', set()))
                if "skypix" in dimensions:
                    try:
                        datasetType = registry.getDatasetType(c.name)
                    except LookupError as err:
                        raise LookupError(
                            f"DatasetType '{c.name}' referenced by "
                            f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension "
                            f"placeholder, but does not already exist in the registry. "
                            f"Note that reference catalog names are now used as the dataset "
                            f"type name instead of 'ref_cat'."
                        ) from err
                    rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names)
                    rest2 = set(dim.name for dim in datasetType.dimensions
                                if not isinstance(dim, SkyPixDimension))
                    if rest1 != rest2:
                        raise ValueError(f"Non-skypix dimensions for dataset type {c.name} declared in "
                                         f"connections ({rest1}) are inconsistent with those in "
                                         f"registry's version of this dataset ({rest2}).")
                else:
                    # Component dataset types are not explicitly in the
                    # registry, so a placeholder parent storage class may be
                    # needed when the registry does not yet know the type.
                    registryDatasetType = None
                    try:
                        registryDatasetType = registry.getDatasetType(c.name)
                    except LookupError:
                        compositeName, componentName = DatasetType.splitDatasetTypeName(c.name)
                        parentStorageClass = DatasetType.PlaceholderParentStorageClass \
                            if componentName else None
                        datasetType = c.makeDatasetType(
                            registry.dimensions,
                            parentStorageClass=parentStorageClass
                        )
                        registryDatasetType = datasetType
                    else:
                        datasetType = c.makeDatasetType(
                            registry.dimensions,
                            parentStorageClass=registryDatasetType.parentStorageClass
                        )

                    if registryDatasetType and datasetType != registryDatasetType:
                        raise ValueError(f"Supplied dataset type ({datasetType}) inconsistent with "
                                         f"registry definition ({registryDatasetType}) "
                                         f"for {taskDef.label}.")
                datasetTypes.add(datasetType)
            if freeze:
                datasetTypes.freeze()
            return datasetTypes
        # Optionally add an output dataset type for task metadata.
        outputs = makeDatasetTypesSet("outputs", freeze=False)
        if taskDef.metadataDatasetName is not None:
            # Metadata is stored as a PropertySet with the dimensions of the
            # task's quantum.
            dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
            outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions, "PropertySet")}
        outputs.freeze()
        return cls(
            initInputs=makeDatasetTypesSet("initInputs"),
            initOutputs=makeDatasetTypesSet("initOutputs"),
            inputs=makeDatasetTypesSet("inputs"),
            prerequisites=makeDatasetTypesSet("prerequisiteInputs"),
            outputs=outputs,
        )
@dataclass(frozen=True)
class PipelineDatasetTypes:
    """An immutable struct that classifies the dataset types used in a
    `Pipeline`.
    """
    initInputs: NamedValueSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct the Tasks
    in this Pipeline.

    This does not include dataset types that are produced when constructing
    other Tasks in the Pipeline (these are classified as `initIntermediates`).
    """

    initOutputs: NamedValueSet[DatasetType]
    """Dataset types that may be written after constructing the Tasks in this
    Pipeline.

    This does not include dataset types that are also used as inputs when
    constructing other Tasks in the Pipeline (these are classified as
    `initIntermediates`).
    """

    initIntermediates: NamedValueSet[DatasetType]
    """Dataset types that are both used when constructing one or more Tasks
    in the Pipeline and produced as a side-effect of constructing another
    Task in the Pipeline.
    """

    inputs: NamedValueSet[DatasetType]
    """Dataset types that are regular inputs for the full pipeline.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s), that Quantum (and all dependent Quanta) will not be
    produced.
    """

    prerequisites: NamedValueSet[DatasetType]
    """Dataset types that are prerequisite inputs for the full Pipeline.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph - if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    intermediates: NamedValueSet[DatasetType]
    """Dataset types that are output by one Task in the Pipeline and consumed
    as inputs by one or more other Tasks in the Pipeline.
    """

    outputs: NamedValueSet[DatasetType]
    """Dataset types that are output by a Task in the Pipeline and not consumed
    by any other Task in the Pipeline.
    """

    byTask: Mapping[str, TaskDatasetTypes]
    """Per-Task dataset types, keyed by label in the `Pipeline`.

    This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming
    neither has been modified since the dataset types were extracted, of
    course).
    """
    @classmethod
    def fromPipeline(cls, pipeline, *, registry: Registry) -> PipelineDatasetTypes:
        """Extract and classify the dataset types from all tasks in a
        `Pipeline`.

        Parameters
        ----------
        pipeline : `Pipeline`
            An ordered collection of tasks that can be run together.
        registry : `Registry`
            Registry used to construct normalized `DatasetType` objects and
            retrieve those that are incomplete.

        Returns
        -------
        types : `PipelineDatasetTypes`
            The dataset types used by this `Pipeline`.

        Raises
        ------
        ValueError
            Raised if Tasks are inconsistent about which datasets are marked
            prerequisite. This indicates that the Tasks cannot be run as part
            of the same `Pipeline`.
        """
        allInputs = NamedValueSet()
        allOutputs = NamedValueSet()
        allInitInputs = NamedValueSet()
        allInitOutputs = NamedValueSet()
        prerequisites = NamedValueSet()
        byTask = dict()
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        for taskDef in pipeline:
            thisTask = TaskDatasetTypes.fromTaskDef(taskDef, registry=registry)
            allInitInputs |= thisTask.initInputs
            allInitOutputs |= thisTask.initOutputs
            allInputs |= thisTask.inputs
            prerequisites |= thisTask.prerequisites
            allOutputs |= thisTask.outputs
            byTask[taskDef.label] = thisTask
        if not prerequisites.isdisjoint(allInputs):
            raise ValueError("{} marked as both prerequisites and regular inputs".format(
                {dt.name for dt in allInputs & prerequisites}
            ))
        if not prerequisites.isdisjoint(allOutputs):
            raise ValueError("{} marked as both prerequisites and outputs".format(
                {dt.name for dt in allOutputs & prerequisites}
            ))
        # Make sure components marked as inputs are treated as intermediates
        # when their composite is produced by another Task in the Pipeline.
        intermediateComponents = NamedValueSet()
        intermediateComposites = NamedValueSet()
        outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
        for dsType in allInputs:
            # Get the composite name and component name, if any.
            name, component = dsType.nameAndComponent()
            if component is not None:
                if name in outputNameMapping:
                    if outputNameMapping[name].dimensions != dsType.dimensions:
                        raise ValueError(f"Component dataset type {dsType.name} has different "
                                         f"dimensions ({dsType.dimensions}) than its parent "
                                         f"({outputNameMapping[name].dimensions}).")
                    composite = DatasetType(name, dsType.dimensions, outputNameMapping[name].storageClass,
                                            universe=registry.dimensions)
                    intermediateComponents.add(dsType)
                    intermediateComposites.add(composite)
        def checkConsistency(a: NamedValueSet, b: NamedValueSet):
            common = a.names & b.names
            for name in common:
                if a[name] != b[name]:
                    raise ValueError(f"Conflicting definitions for dataset type: "
                                     f"{a[name]} != {b[name]}.")

        checkConsistency(allInitInputs, allInitOutputs)
        checkConsistency(allInputs, allOutputs)
        checkConsistency(allInputs, intermediateComposites)
        checkConsistency(allOutputs, intermediateComposites)
        def frozen(s: NamedValueSet) -> NamedValueSet:
            s.freeze()
            return s

        return cls(
            initInputs=frozen(allInitInputs - allInitOutputs),
            initIntermediates=frozen(allInitInputs & allInitOutputs),
            initOutputs=frozen(allInitOutputs - allInitInputs),
            inputs=frozen(allInputs - allOutputs - intermediateComponents),
            intermediates=frozen(allInputs & allOutputs | intermediateComponents),
            outputs=frozen(allOutputs - allInputs - intermediateComposites),
            prerequisites=frozen(prerequisites),
            byTask=MappingProxyType(byTask),
        )
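
# Illustrative sketch (dataset type names are hypothetical, and plain sets
# stand in for NamedValueSet): the classification above is ordinary set
# algebra over the per-task inputs and outputs.
#
#     allInputs = {"raw", "calexp"}
#     allOutputs = {"calexp", "src"}
#
#     allInputs - allOutputs   # -> {"raw"}      (pipeline-level inputs)
#     allInputs & allOutputs   # -> {"calexp"}   (intermediates)
#     allOutputs - allInputs   # -> {"src"}      (pipeline-level outputs)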