"""Module defining Pipeline class and related methods.
"""

from __future__ import annotations

__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes"]

import copy
import os

from dataclasses import dataclass
from types import MappingProxyType
from typing import Mapping, Union, Generator, TYPE_CHECKING

from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension
from lsst.utils import doImport

from .configOverrides import ConfigOverrides
from .connections import iterConnections
from .pipelineTask import PipelineTask

from . import pipelineIR
from . import pipeTools

if TYPE_CHECKING:  # Imported only for type annotations; may be circular at runtime.
    from lsst.obs.base.instrument import Instrument
62 """TaskDef is a collection of information about task needed by Pipeline.
64 The information includes task name, configuration object and optional
65 task class. This class is just a collection of attributes and it exposes
66 all of them so that attributes could potentially be modified in place
67 (e.g. if configuration needs extra overrides).
72 `PipelineTask` class name, currently it is not specified whether this
73 is a fully-qualified name or partial name (e.g. ``module.TaskClass``).
74 Framework should be prepared to handle all cases.
75 config : `lsst.pex.config.Config`
76 Instance of the configuration class corresponding to this task class,
77 usually with all overrides applied. This config will be frozen.
78 taskClass : `type` or ``None``
79 `PipelineTask` class object, can be ``None``. If ``None`` then
80 framework will have to locate and load class.
81 label : `str`, optional
82 Task label, usually a short string unique in a pipeline.
84 def __init__(self, taskName, config, taskClass=None, label=""):
90 self.
connections = config.connections.ConnectionsClass(config=config)
94 """Name of a dataset type for configuration of this task (`str`)
96 return self.
label +
"_config"
100 """Name of a dataset type for metadata of this task, `None` if
101 metadata is not to be saved (`str`)
103 if self.
config.saveMetadata:
104 return self.
label +
"_metadata"
111 rep +=
", label=" + self.
label
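
# Usage sketch (illustrative only): TaskDef instances are normally produced by
# Pipeline.toExpandedPipeline() rather than constructed by hand.  ``MyTask`` is
# a hypothetical PipelineTask subclass, not part of this module.
#
#     config = MyTask.ConfigClass()
#     taskDef = TaskDef(taskName="mypackage.MyTask", config=config,
#                       taskClass=MyTask, label="myTask")
#     taskDef.configDatasetName    # "myTask_config"
#     taskDef.metadataDatasetName  # "myTask_metadata" when config.saveMetadata is True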
117 """A `Pipeline` is a representation of a series of tasks to run, and the
118 configuration for those tasks.
123 A description of that this pipeline does.
126 pipeline_dict = {
"description": description,
"tasks": {}}
131 """Load a pipeline defined in a pipeline yaml file.
136 A path that points to a pipeline defined in yaml format
142 pipeline = cls.
fromIR(pipelineIR.PipelineIR.from_file(filename))
147 """Create a pipeline from string formatted as a pipeline document.
151 pipeline_string : `str`
152 A string that is formatted according like a pipeline document
158 pipeline = cls.
fromIR(pipelineIR.PipelineIR.from_string(pipeline_string))
162 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline:
163 """Create a pipeline from an already created `PipelineIR` object.
167 deserialized_pipeline: `PipelineIR`
168 An already created pipeline intermediate representation object
174 pipeline = cls.__new__(cls)
175 pipeline._pipelineIR = deserialized_pipeline
180 """Create a new pipeline by copying an already existing `Pipeline`.
185 An already created pipeline intermediate representation object
191 return cls.
fromIR(copy.deep_copy(pipeline._pipelineIR))
197 """Add an instrument to the pipeline, or replace an instrument that is
202 instrument : `~lsst.daf.butler.instrument.Instrument` or `str`
203 Either a derived class object of a `lsst.daf.butler.instrument` or a
204 string corresponding to a fully qualified
205 `lsst.daf.butler.instrument` name.
207 if isinstance(instrument, str):
211 instrument = f
"{instrument.__module__}.{instrument.__qualname__}"
215 """Get the instrument from the pipeline.
219 instrument : `~lsst.daf.butler.instrument.Instrument`, `str`, or None
220 A derived class object of a `lsst.daf.butler.instrument`, a string
221 corresponding to a fully qualified `lsst.daf.butler.instrument`
222 name, or None if the pipeline does not have an instrument.
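
    # Usage sketch (illustrative only; the instrument class name below is an
    # example from another package, not something provided by this module):
    #
    #     pipeline = Pipeline("demo pipeline")
    #     pipeline.addInstrument("lsst.obs.subaru.HyperSuprimeCam")
    #     print(pipeline.getInstrument())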

    def addTask(self, task: Union[PipelineTask, str], label: str):
        """Add a new task to the pipeline, or replace a task that is already
        associated with the supplied label.

        Parameters
        ----------
        task : `PipelineTask` or `str`
            Either a derived class object of a `PipelineTask` or a string
            corresponding to a fully qualified `PipelineTask` name.
        label : `str`
            A label that is used to identify the `PipelineTask` being added.
        """
        if isinstance(task, str):
            taskName = task
        elif issubclass(task, PipelineTask):
            taskName = f"{task.__module__}.{task.__qualname__}"
        else:
            raise ValueError("task must be either a child class of PipelineTask or a string containing"
                             " a fully qualified name to one")
        if not label:
            # If no label was supplied, fall back to the task's _DefaultName.
            if isinstance(task, str):
                task = doImport(task)
            label = task._DefaultName
        self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName)
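
    # Usage sketch (illustrative only; ``lsst.ip.isr.IsrTask`` stands in for any
    # fully qualified PipelineTask name):
    #
    #     pipeline = Pipeline("demo pipeline")
    #     pipeline.addTask("lsst.ip.isr.IsrTask", label="isr")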
255 """Remove a task from the pipeline.
260 The label used to identify the task that is to be removed
265 If no task with that label exists in the pipeline
271 """Apply single config override.
278 Fully-qualified field name.
280 Value to be given to a field.
285 """Add overrides from a specified file.
290 The label used to identify the task associated with config to
293 Path to the override file.
298 """Add Overrides by running a snippet of python code against a config.
303 The label used to identity the task associated with config to
306 A string which is valid python code to be executed. This is done
307 with config as the only local accessible value.
313 raise LookupError(f
"There are no tasks labeled '{label}' in the pipeline")
314 self.
_pipelineIR.tasks[label].add_or_update_config(newConfig)
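
    # Usage sketch (illustrative only; the label and config fields are
    # hypothetical and depend on the task being configured):
    #
    #     pipeline.addConfigOverride("isr", "doBias", False)
    #     pipeline.addConfigFile("isr", "$MY_OVERRIDES_DIR/isr.py")
    #     pipeline.addConfigPython("isr", "config.doDark = False")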
320 """Returns a generator of TaskDefs which can be used to create quantum
325 generator : generator of `TaskDef`
326 The generator returned will be the sorted iterator of tasks which
327 are to be used in constructing a quantum graph.
332 If a dataId is supplied in a config block. This is in place for
336 for label, taskIR
in self.
_pipelineIR.tasks.items():
337 taskClass = doImport(taskIR.klass)
338 taskName = taskClass.__qualname__
339 config = taskClass.ConfigClass()
342 overrides.addInstrumentOverride(self.
_pipelineIR.instrument, taskClass._DefaultName)
343 if taskIR.config
is not None:
344 for configIR
in taskIR.config:
345 if configIR.dataId
is not None:
346 raise NotImplementedError(
"Specializing a config on a partial data id is not yet "
347 "supported in Pipeline definition")
349 if configIR.dataId
is None:
351 for configFile
in configIR.file:
352 overrides.addFileOverride(os.path.expandvars(configFile))
353 if configIR.python
is not None:
354 overrides.addPythonOverride(configIR.python)
355 for key, value
in configIR.rest.items():
356 overrides.addValueOverride(key, value)
357 overrides.applyTo(config)
360 taskDefs.append(
TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label))
364 label_to_config = {x.label: x.config
for x
in taskDefs}
368 success = eval(contract.contract,
None, label_to_config)
370 extra_info = f
": {contract.msg}" if contract.msg
is not None else ""
372 f
"satisfied{extra_info}")
374 yield from pipeTools.orderPipeline(taskDefs)
380 if not isinstance(other, Pipeline):
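
# Usage sketch (illustrative only): expanding a pipeline yields TaskDefs sorted
# into an order suitable for building a quantum graph.  The file path below is
# hypothetical.
#
#     pipeline = Pipeline.fromFile("pipelines/DRP.yaml")
#     for taskDef in pipeline.toExpandedPipeline():
#         print(taskDef.label, taskDef.taskName)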


@dataclass(frozen=True)
387 """An immutable struct that extracts and classifies the dataset types used
391 initInputs: NamedValueSet[DatasetType]
392 """Dataset types that are needed as inputs in order to construct this Task.
394 Task-level `initInputs` may be classified as either
395 `~PipelineDatasetTypes.initInputs` or
396 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
399 initOutputs: NamedValueSet[DatasetType]
400 """Dataset types that may be written after constructing this Task.
402 Task-level `initOutputs` may be classified as either
403 `~PipelineDatasetTypes.initOutputs` or
404 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
407 inputs: NamedValueSet[DatasetType]
408 """Dataset types that are regular inputs to this Task.
410 If an input dataset needed for a Quantum cannot be found in the input
411 collection(s) or produced by another Task in the Pipeline, that Quantum
412 (and all dependent Quanta) will not be produced.
414 Task-level `inputs` may be classified as either
415 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
416 at the Pipeline level.
419 prerequisites: NamedValueSet[DatasetType]
420 """Dataset types that are prerequisite inputs to this Task.
422 Prerequisite inputs must exist in the input collection(s) before the
423 pipeline is run, but do not constrain the graph - if a prerequisite is
424 missing for a Quantum, `PrerequisiteMissingError` is raised.
426 Prerequisite inputs are not resolved until the second stage of
427 QuantumGraph generation.
430 outputs: NamedValueSet[DatasetType]
431 """Dataset types that are produced by this Task.
433 Task-level `outputs` may be classified as either
434 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
435 at the Pipeline level.

    @classmethod
    def fromTaskDef(cls, taskDef: TaskDef, *, registry: Registry) -> TaskDatasetTypes:
        """Extract and classify the dataset types from a single `PipelineTask`.

        Parameters
        ----------
        taskDef : `TaskDef`
            An instance of a `TaskDef` class for a particular `PipelineTask`.
        registry : `Registry`
            Registry used to construct normalized `DatasetType` objects and
            retrieve those that are incomplete.

        Returns
        -------
        types : `TaskDatasetTypes`
            The dataset types used by this task.
        """
        def makeDatasetTypesSet(connectionType, freeze=True):
            """Construct a set of true `DatasetType` objects.

            Parameters
            ----------
            connectionType : `str`
                Name of the connection type to produce a set for; corresponds
                to an attribute of type `list` on the connection class instance.
            freeze : `bool`, optional
                If `True`, call `NamedValueSet.freeze` on the object returned.

            Returns
            -------
            datasetTypes : `NamedValueSet`
                A set of all datasetTypes which correspond to the input
                connection type specified in the connection class of this
                `PipelineTask`.

            Notes
            -----
            This function is a closure over the variables ``registry`` and
            ``taskDef``.
            """
            datasetTypes = NamedValueSet()
            for c in iterConnections(taskDef.connections, connectionType):
                dimensions = set(getattr(c, 'dimensions', set()))
                if "skypix" in dimensions:
                    try:
                        datasetType = registry.getDatasetType(c.name)
                    except LookupError as err:
                        raise LookupError(
                            f"DatasetType '{c.name}' referenced by "
                            f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension "
                            f"placeholder, but does not already exist in the registry.  "
                            f"Note that reference catalog names are now used as the dataset "
                            f"type name instead of 'ref_cat'."
                        ) from err
                    rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names)
                    rest2 = set(dim.name for dim in datasetType.dimensions
                                if not isinstance(dim, SkyPixDimension))
                    if rest1 != rest2:
                        raise ValueError(f"Non-skypix dimensions for dataset type {c.name} declared in "
                                         f"connections ({rest1}) are inconsistent with those in "
                                         f"registry's version of this dataset ({rest2}).")
                else:
                    # Component dataset types are not stored in the registry, so
                    # a missing entry is not an error here.
                    registryDatasetType = None
                    try:
                        registryDatasetType = registry.getDatasetType(c.name)
                    except KeyError:
                        compositeName, componentName = DatasetType.splitDatasetTypeName(c.name)
                        parentStorageClass = DatasetType.PlaceholderParentStorageClass \
                            if componentName else None
                        datasetType = DatasetType(c.name, registry.dimensions.extract(dimensions),
                                                  c.storageClass,
                                                  parentStorageClass=parentStorageClass)
                        registryDatasetType = datasetType
                    else:
                        datasetType = DatasetType(c.name, registry.dimensions.extract(dimensions),
                                                  c.storageClass,
                                                  parentStorageClass=registryDatasetType.parentStorageClass)
                    if registryDatasetType and datasetType != registryDatasetType:
                        raise ValueError(f"Supplied dataset type ({datasetType}) inconsistent with "
                                         f"registry definition ({registryDatasetType}) "
                                         f"for {taskDef.label}.")
                datasetTypes.add(datasetType)
            if freeze:
                datasetTypes.freeze()
            return datasetTypes

        # Optionally add an output dataset type for this task's metadata.
        outputs = makeDatasetTypesSet("outputs", freeze=False)
        if taskDef.metadataDatasetName is not None:
            # Metadata is stored as a PropertySet; its dimensions correspond to
            # a quantum of this task.
            dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
            outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions, "PropertySet")}
        outputs.freeze()

        return cls(
            initInputs=makeDatasetTypesSet("initInputs"),
            initOutputs=makeDatasetTypesSet("initOutputs"),
            inputs=makeDatasetTypesSet("inputs"),
            prerequisites=makeDatasetTypesSet("prerequisiteInputs"),
            outputs=outputs,
        )


@dataclass(frozen=True)
class PipelineDatasetTypes:
    """An immutable struct that classifies the dataset types used in a
    `Pipeline`.
    """

    initInputs: NamedValueSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct the Tasks
    in this Pipeline.

    This does not include dataset types that are produced when constructing
    other Tasks in the Pipeline (these are classified as `initIntermediates`).
    """

    initOutputs: NamedValueSet[DatasetType]
    """Dataset types that may be written after constructing the Tasks in this
    Pipeline.

    This does not include dataset types that are also used as inputs when
    constructing other Tasks in the Pipeline (these are classified as
    `initIntermediates`).
    """

    initIntermediates: NamedValueSet[DatasetType]
    """Dataset types that are both used when constructing one or more Tasks
    in the Pipeline and produced as a side-effect of constructing another
    Task in the Pipeline.
    """

    inputs: NamedValueSet[DatasetType]
    """Dataset types that are regular inputs for the full pipeline.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s), that Quantum (and all dependent Quanta) will not be
    produced.
    """

    prerequisites: NamedValueSet[DatasetType]
    """Dataset types that are prerequisite inputs for the full Pipeline.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph - if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    intermediates: NamedValueSet[DatasetType]
    """Dataset types that are output by one Task in the Pipeline and consumed
    as inputs by one or more other Tasks in the Pipeline.
    """

    outputs: NamedValueSet[DatasetType]
    """Dataset types that are output by a Task in the Pipeline and not
    consumed by any other Task in the Pipeline.
    """

    byTask: Mapping[str, TaskDatasetTypes]
    """Per-Task dataset types, keyed by label in the `Pipeline`.

    This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming
    neither has been modified since the dataset types were extracted, of
    course).
    """

    @classmethod
    def fromPipeline(cls, pipeline, *, registry: Registry) -> PipelineDatasetTypes:
        """Extract and classify the dataset types from all tasks in a
        `Pipeline`.

        Parameters
        ----------
        pipeline : `Pipeline`
            An ordered collection of tasks that can be run together.
        registry : `Registry`
            Registry used to construct normalized `DatasetType` objects and
            retrieve those that are incomplete.

        Returns
        -------
        types : `PipelineDatasetTypes`
            The dataset types used by this `Pipeline`.

        Raises
        ------
        ValueError
            Raised if Tasks are inconsistent about which datasets are marked
            prerequisite. This indicates that the Tasks cannot be run as part
            of the same `Pipeline`.
        """
        allInputs = NamedValueSet()
        allOutputs = NamedValueSet()
        allInitInputs = NamedValueSet()
        allInitOutputs = NamedValueSet()
        prerequisites = NamedValueSet()
        byTask = dict()
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        for taskDef in pipeline:
            thisTask = TaskDatasetTypes.fromTaskDef(taskDef, registry=registry)
            allInitInputs |= thisTask.initInputs
            allInitOutputs |= thisTask.initOutputs
            allInputs |= thisTask.inputs
            prerequisites |= thisTask.prerequisites
            allOutputs |= thisTask.outputs
            byTask[taskDef.label] = thisTask
        if not prerequisites.isdisjoint(allInputs):
            raise ValueError("{} marked as both prerequisites and regular inputs".format(
                {dt.name for dt in allInputs & prerequisites}
            ))
        if not prerequisites.isdisjoint(allOutputs):
            raise ValueError("{} marked as both prerequisites and outputs".format(
                {dt.name for dt in allOutputs & prerequisites}
            ))
        # Make sure that components which are marked as inputs get treated as
        # intermediates if there is an output which produces the composite
        # containing the component.
        intermediateComponents = NamedValueSet()
        intermediateComposites = NamedValueSet()
        outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
        for dsType in allInputs:
            # Get the name of a possible component.
            name, component = dsType.nameAndComponent()
            # If this is a component DatasetType and an output produces its
            # parent, treat the input as an intermediate.
            if component is not None:
                if name in outputNameMapping:
                    if outputNameMapping[name].dimensions != dsType.dimensions:
                        raise ValueError(f"Component dataset type {dsType.name} has different "
                                         f"dimensions ({dsType.dimensions}) than its parent "
                                         f"({outputNameMapping[name].dimensions}).")
                    composite = DatasetType(name, dsType.dimensions, outputNameMapping[name].storageClass,
                                            universe=registry.dimensions)
                    intermediateComponents.add(dsType)
                    intermediateComposites.add(composite)

        def checkConsistency(a: NamedValueSet, b: NamedValueSet):
            common = a.names & b.names
            for name in common:
                if a[name] != b[name]:
                    raise ValueError(f"Conflicting definitions for dataset type: "
                                     f"{a[name]} != {b[name]}.")

        checkConsistency(allInitInputs, allInitOutputs)
        checkConsistency(allInputs, allOutputs)
        checkConsistency(allInputs, intermediateComposites)
        checkConsistency(allOutputs, intermediateComposites)

        def frozen(s: NamedValueSet) -> NamedValueSet:
            s.freeze()
            return s

        return cls(
            initInputs=frozen(allInitInputs - allInitOutputs),
            initIntermediates=frozen(allInitInputs & allInitOutputs),
            initOutputs=frozen(allInitOutputs - allInitInputs),
            inputs=frozen(allInputs - allOutputs - intermediateComponents),
            intermediates=frozen(allInputs & allOutputs | intermediateComponents),
            outputs=frozen(allOutputs - allInputs - intermediateComposites),
            prerequisites=frozen(prerequisites),
            byTask=MappingProxyType(byTask),  # a read-only view keyed by task label
        )
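
# Usage sketch (illustrative only): classify dataset types across an entire
# pipeline.  The repository path and pipeline file are hypothetical.
#
#     from lsst.daf.butler import Butler
#     butler = Butler("/path/to/repo")
#     pipeline = Pipeline.fromFile("pipelines/DRP.yaml")
#     datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=butler.registry)
#     print(datasetTypes.inputs.names)         # pure inputs to the whole pipeline
#     print(datasetTypes.intermediates.names)  # produced and consumed internally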