from __future__ import annotations
23 """Module defining Pipeline class and related methods.

__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes"]

from dataclasses import dataclass
from types import MappingProxyType
from typing import Mapping, Union, Generator, TYPE_CHECKING

import copy
import os

from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension
from lsst.utils import doImport

from .configOverrides import ConfigOverrides
from .connections import iterConnections
from .pipelineTask import PipelineTask

from . import pipelineIR
from . import pipeTools

if TYPE_CHECKING:
    # Import only needed for type annotations.
    from lsst.obs.base.instrument import Instrument
62 """TaskDef is a collection of information about task needed by Pipeline.
64 The information includes task name, configuration object and optional
65 task class. This class is just a collection of attributes and it exposes
66 all of them so that attributes could potentially be modified in place
67 (e.g. if configuration needs extra overrides).
72 `PipelineTask` class name, currently it is not specified whether this
73 is a fully-qualified name or partial name (e.g. ``module.TaskClass``).
74 Framework should be prepared to handle all cases.
75 config : `lsst.pex.config.Config`
76 Instance of the configuration class corresponding to this task class,
77 usually with all overrides applied.
78 taskClass : `type` or ``None``
79 `PipelineTask` class object, can be ``None``. If ``None`` then
80 framework will have to locate and load class.
81 label : `str`, optional
82 Task label, usually a short string unique in a pipeline.
    def __init__(self, taskName, config, taskClass=None, label=""):
        self.taskName = taskName
        self.config = config
        self.taskClass = taskClass
        self.label = label
        self.connections = config.connections.ConnectionsClass(config=config)
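
    # Usage sketch (``MyTask`` and the names below are hypothetical): a TaskDef
    # is normally created by Pipeline.toExpandedPipeline, but it can also be
    # built directly from a PipelineTask subclass and its config:
    #
    #     config = MyTask.ConfigClass()
    #     taskDef = TaskDef(taskName="mypackage.MyTask", config=config,
    #                       taskClass=MyTask, label="myTask")
    #     taskDef.configDatasetName    # -> "myTask_config"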
93 """Name of a dataset type for configuration of this task (`str`)
95 return self.
label +
"_config"
99 """Name of a dataset type for metadata of this task, `None` if
100 metadata is not to be saved (`str`)
102 if self.
config.saveMetadata:
103 return self.
label +
"_metadata"

    def __str__(self):
        rep = "TaskDef(" + self.taskName
        if self.label:
            rep += ", label=" + self.label
        rep += ")"
        return rep
116 """A `Pipeline` is a representation of a series of tasks to run, and the
117 configuration for those tasks.
122 A description of that this pipeline does.
125 pipeline_dict = {
"description": description,
"tasks": {}}
130 """Load a pipeline defined in a pipeline yaml file.
135 A path that points to a pipeline defined in yaml format
141 pipeline = cls.
fromIR(pipelineIR.PipelineIR.from_file(filename))
146 """Create a pipeline from string formatted as a pipeline document.
150 pipeline_string : `str`
151 A string that is formatted according like a pipeline document
157 pipeline = cls.
fromIR(pipelineIR.PipelineIR.from_string(pipeline_string))
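
    # Usage sketch (the file path and task class are hypothetical; the inline
    # document follows the pipeline YAML layout read by PipelineIR):
    #
    #     p1 = Pipeline.fromFile("pipelines/example.yaml")
    #     p2 = Pipeline.fromString(
    #         "description: demo\n"
    #         "tasks:\n"
    #         "  myTask:\n"
    #         "    class: mypackage.MyTask\n"
    #     )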

    @classmethod
    def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline:
        """Create a pipeline from an already created `PipelineIR` object.

        deserialized_pipeline : `PipelineIR`
            An already created pipeline intermediate representation object.
        """
        pipeline = cls.__new__(cls)
        pipeline._pipelineIR = deserialized_pipeline
        return pipeline
179 """Create a new pipeline by copying an already existing `Pipeline`.
184 An already created pipeline intermediate representation object
190 return cls.
fromIR(copy.deep_copy(pipeline._pipelineIR))
196 """Add an instrument to the pipeline, or replace an instrument that is
201 instrument : `~lsst.daf.butler.instrument.Instrument` or `str`
202 Either a derived class object of a `lsst.daf.butler.instrument` or a
203 string corresponding to a fully qualified
204 `lsst.daf.butler.instrument` name.
206 if isinstance(instrument, str):
210 instrument = f
"{instrument.__module__}.{instrument.__qualname__}"
214 """Get the instrument from the pipeline.
218 instrument : `~lsst.daf.butler.instrument.Instrument`, `str`, or None
219 A derived class object of a `lsst.daf.butler.instrument`, a string
220 corresponding to a fully qualified `lsst.daf.butler.instrument`
221 name, or None if the pipeline does not have an instrument.

    def addTask(self, task: Union[PipelineTask, str], label: str):
        """Add a new task to the pipeline, or replace a task that is already
        associated with the supplied label.

        task : `PipelineTask` or `str`
            Either a `PipelineTask` subclass or a string corresponding to a
            fully qualified `PipelineTask` name.
        label : `str`
            A label that is used to identify the `PipelineTask` being added.
        """
        if isinstance(task, str):
            taskName = task
        elif issubclass(task, PipelineTask):
            taskName = f"{task.__module__}.{task.__qualname__}"
        else:
            raise ValueError("task must be either a child class of PipelineTask or a string containing"
                             " a fully qualified name to one")
        if not label:
            # If no label was supplied, fall back to the task's _DefaultName.
            if isinstance(task, str):
                task = doImport(task)
            label = task._DefaultName
        self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName)
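
    # Usage sketch (task and instrument names are hypothetical):
    #
    #     pipeline = Pipeline("demo pipeline")
    #     pipeline.addInstrument("myobs.MyCamera")
    #     pipeline.addTask("mypackage.MyTask", label="myTask")
    #     pipeline.addTask(MyOtherTask, label="other")  # a PipelineTask subclass also works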
254 """Remove a task from the pipeline.
259 The label used to identify the task that is to be removed
264 If no task with that label exists in the pipeline
270 """Apply single config override.
277 Fully-qualified field name.
279 Value to be given to a field.
284 """Add overrides from a specified file.
289 The label used to identify the task associated with config to
292 Path to the override file.
297 """Add Overrides by running a snippet of python code against a config.
302 The label used to identity the task associated with config to
305 A string which is valid python code to be executed. This is done
306 with config as the only local accessible value.

    def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR):
        if label not in self._pipelineIR.tasks:
            raise LookupError(f"There are no tasks labeled '{label}' in the pipeline")
        self._pipelineIR.tasks[label].add_or_update_config(newConfig)
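
    # Usage sketch (labels, field names, and paths are hypothetical):
    #
    #     pipeline.addConfigOverride("myTask", "doWrite", False)
    #     pipeline.addConfigFile("myTask", "$MY_PKG_DIR/config/myTask.py")
    #     pipeline.addConfigPython("myTask", "config.threshold = 5.0")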
319 """Returns a generator of TaskDefs which can be used to create quantum
324 generator : generator of `TaskDef`
325 The generator returned will be the sorted iterator of tasks which
326 are to be used in constructing a quantum graph.
331 If a dataId is supplied in a config block. This is in place for
        for label, taskIR in self._pipelineIR.tasks.items():
            taskClass = doImport(taskIR.klass)
            taskName = taskClass.__qualname__
            config = taskClass.ConfigClass()
            overrides = ConfigOverrides()
            if self._pipelineIR.instrument is not None:
                overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName)
            if taskIR.config is not None:
                for configIR in taskIR.config:
                    if configIR.dataId is not None:
                        raise NotImplementedError("Specializing a config on a partial data id is not yet "
                                                  "supported in Pipeline definition")
                    # Only apply overrides that are not restricted to a dataId.
                    if configIR.dataId is None:
                        for configFile in configIR.file:
                            overrides.addFileOverride(os.path.expandvars(configFile))
                        if configIR.python is not None:
                            overrides.addPythonOverride(configIR.python)
                        for key, value in configIR.rest.items():
                            overrides.addValueOverride(key, value)
            overrides.applyTo(config)
            taskDefs.append(TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label))

        if self._pipelineIR.contracts is not None:
            label_to_config = {x.label: x.config for x in taskDefs}
            for contract in self._pipelineIR.contracts:
                # Evaluate the contract expression with the task configs in
                # scope so a failure produces a useful error message.
                success = eval(contract.contract, None, label_to_config)
                if not success:
                    extra_info = f": {contract.msg}" if contract.msg is not None else ""
                    raise pipelineIR.ContractError(f"Contract(s) '{contract.contract}' were not "
                                                   f"satisfied{extra_info}")

        yield from pipeTools.orderPipeline(taskDefs)
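
    # Usage sketch (hypothetical pipeline file): iterate over the
    # dependency-ordered tasks of an expanded pipeline.
    #
    #     pipeline = Pipeline.fromFile("pipelines/example.yaml")
    #     for taskDef in pipeline.toExpandedPipeline():
    #         print(taskDef.label, taskDef.taskName)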

    def __eq__(self, other: "Pipeline"):
        if not isinstance(other, Pipeline):
            return False
        return self._pipelineIR == other._pipelineIR


@dataclass(frozen=True)
class TaskDatasetTypes:
    """An immutable struct that extracts and classifies the dataset types used
    by a `PipelineTask`.
    """

    initInputs: NamedValueSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct this Task.

    Task-level `initInputs` may be classified as either
    `~PipelineDatasetTypes.initInputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    initOutputs: NamedValueSet[DatasetType]
    """Dataset types that may be written after constructing this Task.

    Task-level `initOutputs` may be classified as either
    `~PipelineDatasetTypes.initOutputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    inputs: NamedValueSet[DatasetType]
    """Dataset types that are regular inputs to this Task.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s) or produced by another Task in the Pipeline, that Quantum
    (and all dependent Quanta) will not be produced.

    Task-level `inputs` may be classified as either
    `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """

    prerequisites: NamedValueSet[DatasetType]
    """Dataset types that are prerequisite inputs to this Task.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph - if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    outputs: NamedValueSet[DatasetType]
    """Dataset types that are produced by this Task.

    Task-level `outputs` may be classified as either
    `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """

    @classmethod
    def fromTaskDef(cls, taskDef: TaskDef, *, registry: Registry) -> TaskDatasetTypes:
        """Extract and classify the dataset types from a single `PipelineTask`.

        taskDef : `TaskDef`
            An instance of a `TaskDef` class for a particular `PipelineTask`.
        registry : `Registry`
            Registry used to construct normalized `DatasetType` objects and
            retrieve those that are incomplete.

        types : `TaskDatasetTypes`
            The dataset types used by this task.
        """
        def makeDatasetTypesSet(connectionType, freeze=True):
            """Constructs a set of true `DatasetType` objects.

            connectionType : `str`
                Name of the connection type to produce a set for; corresponds
                to an attribute of type `list` on the connection class
                instance.
            freeze : `bool`, optional
                If `True`, call `NamedValueSet.freeze` on the object returned.

            datasetTypes : `NamedValueSet`
                A set of all datasetTypes which correspond to the input
                connection type specified in the connection class of this
                `PipelineTask`.

            This function is a closure over the variables ``registry`` and
            ``taskDef``.
            """
            datasetTypes = NamedValueSet()
            for c in iterConnections(taskDef.connections, connectionType):
                dimensions = set(getattr(c, 'dimensions', set()))
480 if "skypix" in dimensions:
482 datasetType = registry.getDatasetType(c.name)
483 except LookupError
as err:
485 f
"DatasetType '{c.name}' referenced by "
486 f
"{type(taskDef.connections).__name__} uses 'skypix' as a dimension "
487 f
"placeholder, but does not already exist in the registry. "
488 f
"Note that reference catalog names are now used as the dataset "
489 f
"type name instead of 'ref_cat'."
491 rest1 = set(registry.dimensions.extract(dimensions - set([
"skypix"])).names)
492 rest2 = set(dim.name
for dim
in datasetType.dimensions
493 if not isinstance(dim, SkyPixDimension))
495 raise ValueError(f
"Non-skypix dimensions for dataset type {c.name} declared in "
496 f
"connections ({rest1}) are inconsistent with those in "
497 f
"registry's version of this dataset ({rest2}).")
                else:
                    # Component dataset types are not explicitly in the
                    # registry, so the parent storage class may have to be
                    # worked out here.
                    registryDatasetType = None
                    try:
                        registryDatasetType = registry.getDatasetType(c.name)
                    except KeyError:
                        compositeName, componentName = DatasetType.splitDatasetTypeName(c.name)
                        parentStorageClass = DatasetType.PlaceholderParentStorageClass \
                            if componentName else None
                        datasetType = DatasetType(c.name, registry.dimensions.extract(dimensions),
                                                  c.storageClass,
                                                  parentStorageClass=parentStorageClass)
                        registryDatasetType = datasetType
                    else:
                        datasetType = DatasetType(c.name, registry.dimensions.extract(dimensions),
                                                  c.storageClass,
                                                  parentStorageClass=registryDatasetType.parentStorageClass)

                    if registryDatasetType and datasetType != registryDatasetType:
                        raise ValueError(f"Supplied dataset type ({datasetType}) inconsistent with "
                                         f"registry definition ({registryDatasetType}) "
                                         f"for {taskDef.label}.")
                datasetTypes.add(datasetType)
            if freeze:
                datasetTypes.freeze()
            return datasetTypes

        # Optionally add an output dataset type for this task's metadata.
        outputs = makeDatasetTypesSet("outputs", freeze=False)
        if taskDef.metadataDatasetName is not None:
            # Metadata is stored as a PropertySet with the task's quantum
            # dimensions.
            dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
            outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions, "PropertySet")}
        outputs.freeze()

        return cls(
            initInputs=makeDatasetTypesSet("initInputs"),
            initOutputs=makeDatasetTypesSet("initOutputs"),
            inputs=makeDatasetTypesSet("inputs"),
            prerequisites=makeDatasetTypesSet("prerequisiteInputs"),
            outputs=outputs,
        )


@dataclass(frozen=True)
class PipelineDatasetTypes:
    """An immutable struct that classifies the dataset types used in a
    `Pipeline`.
    """

    initInputs: NamedValueSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct the Tasks
    in this Pipeline.

    This does not include dataset types that are produced when constructing
    other Tasks in the Pipeline (these are classified as `initIntermediates`).
    """

    initOutputs: NamedValueSet[DatasetType]
    """Dataset types that may be written after constructing the Tasks in this
    Pipeline.

    This does not include dataset types that are also used as inputs when
    constructing other Tasks in the Pipeline (these are classified as
    `initIntermediates`).
    """

    initIntermediates: NamedValueSet[DatasetType]
    """Dataset types that are both used when constructing one or more Tasks
    in the Pipeline and produced as a side-effect of constructing another
    Task in the Pipeline.
    """

    inputs: NamedValueSet[DatasetType]
    """Dataset types that are regular inputs for the full pipeline.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s), that Quantum (and all dependent Quanta) will not be
    produced.
    """

    prerequisites: NamedValueSet[DatasetType]
    """Dataset types that are prerequisite inputs for the full Pipeline.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph - if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    intermediates: NamedValueSet[DatasetType]
    """Dataset types that are output by one Task in the Pipeline and consumed
    as inputs by one or more other Tasks in the Pipeline.
    """

    outputs: NamedValueSet[DatasetType]
    """Dataset types that are output by a Task in the Pipeline and not consumed
    by any other Task in the Pipeline.
    """

    byTask: Mapping[str, TaskDatasetTypes]
    """Per-Task dataset types, keyed by label in the `Pipeline`.

    This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming
    neither has been modified since the dataset types were extracted, of
    course).
    """

    @classmethod
    def fromPipeline(cls, pipeline, *, registry: Registry) -> PipelineDatasetTypes:
        """Extract and classify the dataset types from all tasks in a
        `Pipeline`.

        pipeline : `Pipeline`
            An ordered collection of tasks that can be run together.
        registry : `Registry`
            Registry used to construct normalized `DatasetType` objects and
            retrieve those that are incomplete.

        types : `PipelineDatasetTypes`
            The dataset types used by this `Pipeline`.

        ValueError
            Raised if Tasks are inconsistent about which datasets are marked
            prerequisite. This indicates that the Tasks cannot be run as part
            of the same `Pipeline`.
        """
        allInputs = NamedValueSet()
        allOutputs = NamedValueSet()
        allInitInputs = NamedValueSet()
        allInitOutputs = NamedValueSet()
        prerequisites = NamedValueSet()
        byTask = dict()
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        for taskDef in pipeline:
            thisTask = TaskDatasetTypes.fromTaskDef(taskDef, registry=registry)
            allInitInputs |= thisTask.initInputs
            allInitOutputs |= thisTask.initOutputs
            allInputs |= thisTask.inputs
            prerequisites |= thisTask.prerequisites
            allOutputs |= thisTask.outputs
            byTask[taskDef.label] = thisTask
        if not prerequisites.isdisjoint(allInputs):
            raise ValueError("{} marked as both prerequisites and regular inputs".format(
                {dt.name for dt in allInputs & prerequisites}
            ))
        if not prerequisites.isdisjoint(allOutputs):
            raise ValueError("{} marked as both prerequisites and outputs".format(
                {dt.name for dt in allOutputs & prerequisites}
            ))

        # Identify component datasets whose parent (composite) dataset type is
        # produced by another task in the pipeline.
        intermediateComponents = NamedValueSet()
        intermediateComposites = NamedValueSet()
        outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
        for dsType in allInputs:
            # Get the name of the parent dataset type, if this is a component.
            name, component = dsType.nameAndComponent()
            if component is not None:
                if name in outputNameMapping:
                    if outputNameMapping[name].dimensions != dsType.dimensions:
                        raise ValueError(f"Component dataset type {dsType.name} has different "
                                         f"dimensions ({dsType.dimensions}) than its parent "
                                         f"({outputNameMapping[name].dimensions}).")
                    composite = DatasetType(name, dsType.dimensions, outputNameMapping[name].storageClass,
                                            universe=registry.dimensions)
                    intermediateComponents.add(dsType)
                    intermediateComposites.add(composite)

        def checkConsistency(a: NamedValueSet, b: NamedValueSet):
            common = a.names & b.names
            for name in common:
                if a[name] != b[name]:
                    raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.")

        checkConsistency(allInitInputs, allInitOutputs)
        checkConsistency(allInputs, allOutputs)
        checkConsistency(allInputs, intermediateComposites)
        checkConsistency(allOutputs, intermediateComposites)

        def frozen(s: NamedValueSet) -> NamedValueSet:
            s.freeze()
            return s

        return cls(
            initInputs=frozen(allInitInputs - allInitOutputs),
            initIntermediates=frozen(allInitInputs & allInitOutputs),
            initOutputs=frozen(allInitOutputs - allInitInputs),
            inputs=frozen(allInputs - allOutputs - intermediateComponents),
            intermediates=frozen(allInputs & allOutputs | intermediateComponents),
            outputs=frozen(allOutputs - allInputs - intermediateComposites),
            prerequisites=frozen(prerequisites),
            byTask=MappingProxyType(byTask),
        )
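
# Usage sketch (``butler`` is a hypothetical, already-constructed
# lsst.daf.butler.Butler; the pipeline file path is hypothetical):
#
#     pipeline = Pipeline.fromFile("pipelines/example.yaml")
#     datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=butler.registry)
#     print("inputs:", datasetTypes.inputs.names)
#     print("intermediates:", datasetTypes.intermediates.names)
#     print("outputs:", datasetTypes.outputs.names)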