from __future__ import annotations
23 """Module defining Pipeline class and related methods.
__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes"]
from dataclasses import dataclass
from types import MappingProxyType
from typing import Mapping, Union, Generator, TYPE_CHECKING

# copy, os and doImport are used by the methods below.
import copy
import os

from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension
from lsst.utils import doImport
from .configOverrides import ConfigOverrides
from .connections import iterConnections
from .pipelineTask import PipelineTask

from . import pipelineIR
from . import pipeTools

from lsst.obs.base.instrument import Instrument
62 """TaskDef is a collection of information about task needed by Pipeline.
64 The information includes task name, configuration object and optional
65 task class. This class is just a collection of attributes and it exposes
66 all of them so that attributes could potentially be modified in place
67 (e.g. if configuration needs extra overrides).
72 `PipelineTask` class name, currently it is not specified whether this
73 is a fully-qualified name or partial name (e.g. ``module.TaskClass``).
74 Framework should be prepared to handle all cases.
75 config : `lsst.pex.config.Config`
76 Instance of the configuration class corresponding to this task class,
77 usually with all overrides applied.
78 taskClass : `type` or ``None``
79 `PipelineTask` class object, can be ``None``. If ``None`` then
80 framework will have to locate and load class.
81 label : `str`, optional
82 Task label, usually a short string unique in a pipeline.
    def __init__(self, taskName, config, taskClass=None, label=""):
        self.taskName = taskName
        self.config = config
        self.taskClass = taskClass
        self.label = label
        self.connections = config.connections.ConnectionsClass(config=config)
93 """Name of a dataset type for configuration of this task (`str`)
95 return self.
label +
"_config"
99 """Name of a dataset type for metadata of this task, `None` if
100 metadata is not to be saved (`str`)
102 if self.
config.saveMetadata:
103 return self.
label +
"_metadata"
        rep += ", label=" + self.label
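
# A TaskDef is normally produced by Pipeline.toExpandedPipeline() below, but
# it can also be built directly.  A minimal sketch (the task class, module and
# label are hypothetical; any PipelineTask subclass works the same way):
#
#     from my_package.tasks import MyTask   # hypothetical PipelineTask
#     config = MyTask.ConfigClass()
#     taskDef = TaskDef(taskName="my_package.tasks.MyTask", config=config,
#                       taskClass=MyTask, label="myTask")
#     taskDef.metadataDatasetName   # "myTask_metadata" when saveMetadata is set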
116 """A `Pipeline` is a representation of a series of tasks to run, and the
117 configuration for those tasks.
122 A description of that this pipeline does.
125 pipeline_dict = {
"description": description,
"tasks": {}}
130 """Load a pipeline defined in a pipeline yaml file.
135 A path that points to a pipeline defined in yaml format
141 pipeline = cls.
fromIR(pipelineIR.PipelineIR.from_file(filename))
146 """Create a pipeline from string formatted as a pipeline document.
150 pipeline_string : `str`
151 A string that is formatted according like a pipeline document
157 pipeline = cls.
fromIR(pipelineIR.PipelineIR.from_string(pipeline_string))
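
    # A minimal usage sketch, assuming the YAML layout implied by the
    # ``description``/``tasks`` keys used in ``__init__`` above (the task
    # module and label are hypothetical):
    #
    #     pipeline = Pipeline.fromString(
    #         "description: A one-task demo pipeline\n"
    #         "tasks:\n"
    #         "  myTask: my_package.tasks.MyTask\n"
    #     )
    #
    # The same document stored on disk can be loaded with fromFile.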
    @classmethod
    def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline:
        """Create a pipeline from an already created `PipelineIR` object.

        Parameters
        ----------
        deserialized_pipeline : `PipelineIR`
            An already created pipeline intermediate representation object.

        Returns
        -------
        pipeline : `Pipeline`
        """
        pipeline = cls.__new__(cls)
        pipeline._pipelineIR = deserialized_pipeline
        return pipeline
179 """Create a new pipeline by copying an already existing `Pipeline`.
184 An already created pipeline intermediate representation object
190 return cls.
fromIR(copy.deep_copy(pipeline._pipelineIR))
196 """Add an instrument to the pipeline, or replace an instrument that is
201 instrument : `~lsst.daf.butler.instrument.Instrument` or `str`
202 Either a derived class object of a `lsst.daf.butler.instrument` or a
203 string corresponding to a fully qualified
204 `lsst.daf.butler.instrument` name.
206 if isinstance(instrument, str):
210 instrument = f
"{instrument.__module__}.{instrument.__qualname__}"
    def addTask(self, task: Union[PipelineTask, str], label: str):
        """Add a new task to the pipeline, or replace a task that is already
        associated with the supplied label.

        Parameters
        ----------
        task : `PipelineTask` or `str`
            Either a derived class object of a `PipelineTask` or a string
            corresponding to a fully qualified `PipelineTask` name.
        label : `str`
            A label that is used to identify the `PipelineTask` being added.
        """
        if isinstance(task, str):
            taskName = task
        elif issubclass(task, PipelineTask):
            taskName = f"{task.__module__}.{task.__qualname__}"
        else:
            raise ValueError("task must be either a child class of PipelineTask or a string containing"
                             " a fully qualified name to one")
        if not label:
            # No label was supplied; fall back to the task's _DefaultName,
            # importing the class first if only a name was given.
            if isinstance(task, str):
                task = doImport(task)
            label = task._DefaultName
        self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName)
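
    # For example, with a hypothetical task class:
    #
    #     pipeline.addTask(MyTask, "myTask")
    #     # or by fully qualified name; an empty label falls back to the
    #     # task's _DefaultName:
    #     pipeline.addTask("my_package.tasks.MyTask", "")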
242 """Remove a task from the pipeline.
247 The label used to identify the task that is to be removed
252 If no task with that label exists in the pipeline
258 """Apply single config override.
265 Fully-qualified field name.
267 Value to be given to a field.
272 """Add overrides from a specified file.
277 The label used to identify the task associated with config to
280 Path to the override file.
285 """Add Overrides by running a snippet of python code against a config.
290 The label used to identity the task associated with config to
293 A string which is valid python code to be executed. This is done
294 with config as the only local accessible value.
        if label not in self._pipelineIR.tasks:
            raise LookupError(f"There are no tasks labeled '{label}' in the pipeline")
        self._pipelineIR.tasks[label].add_or_update_config(newConfig)
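
    # A sketch of the three override flavors applied to a labeled task (the
    # label, field name, file path and snippet are hypothetical):
    #
    #     pipeline.addConfigOverride("myTask", "doWriteOutput", False)
    #     pipeline.addConfigFile("myTask", "$MY_CONFIG_DIR/myTaskOverrides.py")
    #     pipeline.addConfigPython("myTask", "config.doWriteOutput = False")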
307 """Returns a generator of TaskDefs which can be used to create quantum
312 generator : generator of `TaskDef`
313 The generator returned will be the sorted iterator of tasks which
314 are to be used in constructing a quantum graph.
319 If a dataId is supplied in a config block. This is in place for
        taskDefs = []
        for label, taskIR in self._pipelineIR.tasks.items():
            taskClass = doImport(taskIR.klass)
            taskName = taskClass.__qualname__
            config = taskClass.ConfigClass()
            overrides = ConfigOverrides()
            if self._pipelineIR.instrument is not None:
                overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName)
            if taskIR.config is not None:
                for configIR in taskIR.config:
                    if configIR.dataId is not None:
                        raise NotImplementedError("Specializing a config on a partial data id is not yet "
                                                  "supported in Pipeline definition")
                    # Only apply overrides that are not limited to a specific data ID.
                    if configIR.dataId is None:
                        if configIR.file:
                            for configFile in configIR.file:
                                overrides.addFileOverride(os.path.expandvars(configFile))
                        if configIR.python is not None:
                            overrides.addPythonOverride(configIR.python)
                        for key, value in configIR.rest.items():
                            overrides.addValueOverride(key, value)
            overrides.applyTo(config)
            taskDefs.append(TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label))
        # Evaluate any contracts declared in the pipeline against the task
        # configurations.
        label_to_config = {x.label: x.config for x in taskDefs}
        for contract in self._pipelineIR.contracts:
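            # Each contract is a Python expression evaluated with the task
            # labels as the only available variables, e.g. (hypothetical
            # labels and config fields):
            #
            #     "myTask.doWriteOutput == otherTask.doReadOutput"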
            success = eval(contract.contract, None, label_to_config)
            if not success:
                extra_info = f": {contract.msg}" if contract.msg is not None else ""
                raise pipelineIR.ContractError(f"Contract(s) '{contract.contract}' were not "
                                               f"satisfied{extra_info}")

        yield from pipeTools.orderPipeline(taskDefs)
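
    # A minimal sketch of consuming the expanded pipeline; each yielded
    # TaskDef carries the imported task class and its fully resolved config:
    #
    #     for taskDef in pipeline.toExpandedPipeline():
    #         print(taskDef.label, taskDef.taskName)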
    def __eq__(self, other):
        if not isinstance(other, Pipeline):
            return False
        return self._pipelineIR == other._pipelineIR
@dataclass(frozen=True)
class TaskDatasetTypes:
    """An immutable struct that extracts and classifies the dataset types used
    by a single `PipelineTask`.
    """
    initInputs: NamedValueSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct this Task.

    Task-level `initInputs` may be classified as either
    `~PipelineDatasetTypes.initInputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    initOutputs: NamedValueSet[DatasetType]
    """Dataset types that may be written after constructing this Task.

    Task-level `initOutputs` may be classified as either
    `~PipelineDatasetTypes.initOutputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    inputs: NamedValueSet[DatasetType]
    """Dataset types that are regular inputs to this Task.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s) or produced by another Task in the Pipeline, that Quantum
    (and all dependent Quanta) will not be produced.

    Task-level `inputs` may be classified as either
    `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """

    prerequisites: NamedValueSet[DatasetType]
    """Dataset types that are prerequisite inputs to this Task.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph - if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    outputs: NamedValueSet[DatasetType]
    """Dataset types that are produced by this Task.

    Task-level `outputs` may be classified as either
    `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """
    @classmethod
    def fromTaskDef(cls, taskDef: TaskDef, *, registry: Registry) -> TaskDatasetTypes:
        """Extract and classify the dataset types from a single `PipelineTask`.

        Parameters
        ----------
        taskDef : `TaskDef`
            An instance of a `TaskDef` class for a particular `PipelineTask`.
        registry : `Registry`
            Registry used to construct normalized `DatasetType` objects and
            retrieve those that are incomplete.

        Returns
        -------
        types : `TaskDatasetTypes`
            The dataset types used by this task.
        """
        def makeDatasetTypesSet(connectionType, freeze=True):
            """Constructs a set of true `DatasetType` objects.

            Parameters
            ----------
            connectionType : `str`
                Name of the connection type to produce a set for; corresponds
                to an attribute of type `list` on the connection class
                instance.
            freeze : `bool`, optional
                If `True`, call `NamedValueSet.freeze` on the object returned.

            Returns
            -------
            datasetTypes : `NamedValueSet`
                A set of all datasetTypes which correspond to the input
                connection type specified in the connection class of this
                task.

            Notes
            -----
            This function is a closure over the variables ``registry`` and
            ``taskDef``.
            """
            datasetTypes = NamedValueSet()
            for c in iterConnections(taskDef.connections, connectionType):
                dimensions = set(getattr(c, 'dimensions', set()))
468 if "skypix" in dimensions:
470 datasetType = registry.getDatasetType(c.name)
471 except LookupError
as err:
473 f
"DatasetType '{c.name}' referenced by "
474 f
"{type(taskDef.connections).__name__} uses 'skypix' as a dimension "
475 f
"placeholder, but does not already exist in the registry. "
476 f
"Note that reference catalog names are now used as the dataset "
477 f
"type name instead of 'ref_cat'."
                    rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names)
                    rest2 = set(dim.name for dim in datasetType.dimensions
                                if not isinstance(dim, SkyPixDimension))
                    if rest1 != rest2:
                        raise ValueError(f"Non-skypix dimensions for dataset type {c.name} declared in "
                                         f"connections ({rest1}) are inconsistent with those in "
                                         f"registry's version of this dataset ({rest2}).")
                else:
                    registryDatasetType = None
                    try:
                        registryDatasetType = registry.getDatasetType(c.name)
                    except KeyError:
                        compositeName, componentName = DatasetType.splitDatasetTypeName(c.name)
                        parentStorageClass = DatasetType.PlaceholderParentStorageClass \
                            if componentName else None
                        datasetType = DatasetType(c.name, registry.dimensions.extract(dimensions),
                                                  c.storageClass,
                                                  parentStorageClass=parentStorageClass)
                        registryDatasetType = datasetType
                    else:
                        datasetType = DatasetType(c.name, registry.dimensions.extract(dimensions),
                                                  c.storageClass,
                                                  parentStorageClass=registryDatasetType.parentStorageClass)
                    if registryDatasetType and datasetType != registryDatasetType:
                        raise ValueError(f"Supplied dataset type ({datasetType}) inconsistent with "
                                         f"registry definition ({registryDatasetType}) "
                                         f"for {taskDef.label}.")
                datasetTypes.add(datasetType)
            if freeze:
                datasetTypes.freeze()
            return datasetTypes
        outputs = makeDatasetTypesSet("outputs", freeze=False)
        if taskDef.metadataDatasetName is not None:
            # Metadata is stored as a PropertySet; its dimensions are those of
            # the task quantum.
            dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
            outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions, "PropertySet")}
        outputs.freeze()
        return cls(
            initInputs=makeDatasetTypesSet("initInputs"),
            initOutputs=makeDatasetTypesSet("initOutputs"),
            inputs=makeDatasetTypesSet("inputs"),
            prerequisites=makeDatasetTypesSet("prerequisiteInputs"),
            outputs=outputs,
        )
@dataclass(frozen=True)
class PipelineDatasetTypes:
    """An immutable struct that classifies the dataset types used in a
    `Pipeline`.
    """
    initInputs: NamedValueSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct the Tasks
    in this Pipeline.

    This does not include dataset types that are produced when constructing
    other Tasks in the Pipeline (these are classified as `initIntermediates`).
    """

    initOutputs: NamedValueSet[DatasetType]
    """Dataset types that may be written after constructing the Tasks in this
    Pipeline.

    This does not include dataset types that are also used as inputs when
    constructing other Tasks in the Pipeline (these are classified as
    `initIntermediates`).
    """

    initIntermediates: NamedValueSet[DatasetType]
    """Dataset types that are both used when constructing one or more Tasks
    in the Pipeline and produced as a side-effect of constructing another
    Task in the Pipeline.
    """

    inputs: NamedValueSet[DatasetType]
    """Dataset types that are regular inputs for the full pipeline.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s), that Quantum (and all dependent Quanta) will not be
    produced.
    """

    prerequisites: NamedValueSet[DatasetType]
    """Dataset types that are prerequisite inputs for the full Pipeline.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph - if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    intermediates: NamedValueSet[DatasetType]
    """Dataset types that are output by one Task in the Pipeline and consumed
    as inputs by one or more other Tasks in the Pipeline.
    """

    outputs: NamedValueSet[DatasetType]
    """Dataset types that are output by a Task in the Pipeline and not
    consumed by any other Task in the Pipeline.
    """

    byTask: Mapping[str, TaskDatasetTypes]
    """Per-Task dataset types, keyed by label in the `Pipeline`.

    This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming
    neither has been modified since the dataset types were extracted, of
    course).
    """
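
    # For example, once extracted (see ``fromPipeline`` below), the dataset
    # types declared by a single task can be looked up by its label
    # (hypothetical label):
    #
    #     datasetTypes.byTask["myTask"].outputs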
    @classmethod
    def fromPipeline(cls, pipeline, *, registry: Registry) -> PipelineDatasetTypes:
        """Extract and classify the dataset types from all tasks in a
        `Pipeline`.

        Parameters
        ----------
        pipeline : `Pipeline`
            An ordered collection of tasks that can be run together.
        registry : `Registry`
            Registry used to construct normalized `DatasetType` objects and
            retrieve those that are incomplete.

        Returns
        -------
        types : `PipelineDatasetTypes`
            The dataset types used by this `Pipeline`.

        Raises
        ------
        ValueError
            Raised if Tasks are inconsistent about which datasets are marked
            prerequisite. This indicates that the Tasks cannot be run as part
            of the same `Pipeline`.
        """
        allInputs = NamedValueSet()
        allOutputs = NamedValueSet()
        allInitInputs = NamedValueSet()
        allInitOutputs = NamedValueSet()
        prerequisites = NamedValueSet()
        byTask = dict()
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        for taskDef in pipeline:
            thisTask = TaskDatasetTypes.fromTaskDef(taskDef, registry=registry)
            allInitInputs |= thisTask.initInputs
            allInitOutputs |= thisTask.initOutputs
            allInputs |= thisTask.inputs
            prerequisites |= thisTask.prerequisites
            allOutputs |= thisTask.outputs
            byTask[taskDef.label] = thisTask
        if not prerequisites.isdisjoint(allInputs):
            raise ValueError("{} marked as both prerequisites and regular inputs".format(
                {dt.name for dt in allInputs & prerequisites}
            ))
        if not prerequisites.isdisjoint(allOutputs):
            raise ValueError("{} marked as both prerequisites and outputs".format(
                {dt.name for dt in allOutputs & prerequisites}
            ))
        # Identify "intermediate components": component dataset types whose
        # parent composite is produced by another task in the pipeline.
        intermediateComponents = NamedValueSet()
        intermediateComposites = NamedValueSet()
        outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
        for dsType in allInputs:
            # Get the name of the parent dataset type and the component, if any.
            name, component = dsType.nameAndComponent()
            if component is not None:
                if name in outputNameMapping:
                    if outputNameMapping[name].dimensions != dsType.dimensions:
                        raise ValueError(f"Component dataset type {dsType.name} has different "
                                         f"dimensions ({dsType.dimensions}) than its parent "
                                         f"({outputNameMapping[name].dimensions}).")
                    composite = DatasetType(name, dsType.dimensions, outputNameMapping[name].storageClass,
                                            universe=registry.dimensions)
                    intermediateComponents.add(dsType)
                    intermediateComposites.add(composite)
        def checkConsistency(a: NamedValueSet, b: NamedValueSet):
            common = a.names & b.names
            for name in common:
                if a[name] != b[name]:
                    raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.")
        checkConsistency(allInitInputs, allInitOutputs)
        checkConsistency(allInputs, allOutputs)
        checkConsistency(allInputs, intermediateComposites)
        checkConsistency(allOutputs, intermediateComposites)
        def frozen(s: NamedValueSet) -> NamedValueSet:
            s.freeze()
            return s
        return cls(
            initInputs=frozen(allInitInputs - allInitOutputs),
            initIntermediates=frozen(allInitInputs & allInitOutputs),
            initOutputs=frozen(allInitOutputs - allInitInputs),
            inputs=frozen(allInputs - allOutputs - intermediateComponents),
            intermediates=frozen(allInputs & allOutputs | intermediateComponents),
            outputs=frozen(allOutputs - allInputs - intermediateComposites),
            prerequisites=frozen(prerequisites),
            byTask=MappingProxyType(byTask),
        )
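
    # A usage sketch, assuming a Pipeline and a butler registry:
    #
    #     datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline,
    #                                                      registry=butler.registry)
    #
    # Any dataset type produced by one task and consumed by another ends up in
    # ``datasetTypes.intermediates`` rather than in ``inputs`` or ``outputs``.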