from __future__ import annotations

"""Module defining Pipeline class and related methods.
"""
__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes"]
import copy

from dataclasses import dataclass
from types import MappingProxyType
from typing import Mapping, Union, Generator, TYPE_CHECKING

from lsst.daf.butler import DatasetType, Registry, SkyPixDimension
from lsst.daf.butler.core.utils import NamedValueSet
from lsst.utils import doImport

from .configOverrides import ConfigOverrides
from .connections import iterConnections
from .pipelineTask import PipelineTask

from . import pipelineIR
from . import pipeTools

if TYPE_CHECKING:  # Imports needed only for type annotations.
    from lsst.obs.base.instrument import Instrument
62 """TaskDef is a collection of information about task needed by Pipeline.
64 The information includes task name, configuration object and optional
65 task class. This class is just a collection of attributes and it exposes
66 all of them so that attributes could potentially be modified in place
67 (e.g. if configuration needs extra overrides).
72 `PipelineTask` class name, currently it is not specified whether this
73 is a fully-qualified name or partial name (e.g. ``module.TaskClass``).
74 Framework should be prepared to handle all cases.
75 config : `lsst.pex.config.Config`
76 Instance of the configuration class corresponding to this task class,
77 usually with all overrides applied.
78 taskClass : `type` or ``None``
79 `PipelineTask` class object, can be ``None``. If ``None`` then
80 framework will have to locate and load class.
81 label : `str`, optional
82 Task label, usually a short string unique in a pipeline.
    def __init__(self, taskName, config, taskClass=None, label=""):
        self.taskName = taskName
        self.config = config
        self.taskClass = taskClass
        self.label = label
        self.connections = config.connections.ConnectionsClass(config=config)
93 """Name of a dataset type for configuration of this task (`str`)
95 return self.
label +
"_config"
99 """Name of a dataset type for metadata of this task, `None` if
100 metadata is not to be saved (`str`)
102 if self.
config.saveMetadata:
103 return self.
label +
"_metadata"
    def __str__(self):
        rep = "TaskDef(" + self.taskName
        if self.label:
            rep += ", label=" + self.label
        rep += ")"
        return rep
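# Naming convention sketch (illustrative): a TaskDef labeled "isr" exposes
# configDatasetName == "isr_config" and, when config.saveMetadata is True,
# metadataDatasetName == "isr_metadata".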
116 """A `Pipeline` is a representation of a series of tasks to run, and the
117 configuration for those tasks.
122 A description of that this pipeline does.
125 pipeline_dict = {
"description": description,
"tasks": {}}
130 """Load a pipeline defined in a pipeline yaml file.
135 A path that points to a pipeline defined in yaml format
141 pipeline = cls.
fromIR(pipelineIR.PipelineIR.from_file(filename))
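    # Minimal usage sketch (illustrative; assumes a pipeline document exists
    # at the given path):
    #
    #     pipeline = Pipeline.fromFile("pipelines/example.yaml")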
146 """Create a pipeline from string formatted as a pipeline document.
150 pipeline_string : `str`
151 A string that is formatted according like a pipeline document
157 pipeline = cls.
fromIR(pipelineIR.PipelineIR.from_string(pipeline_string))
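    # Usage sketch (illustrative; the exact document schema is defined by
    # pipelineIR.PipelineIR, and the field names below are an assumption):
    #
    #     pipeline = Pipeline.fromString(
    #         "description: An example pipeline\n"
    #         "tasks:\n"
    #         "  exampleTask:\n"
    #         "    class: mypackage.ExampleTask\n"
    #     )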
    @classmethod
    def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline:
        """Create a pipeline from an already created `PipelineIR` object.

        Parameters
        ----------
        deserialized_pipeline : `PipelineIR`
            An already created pipeline intermediate representation object.

        Returns
        -------
        pipeline : `Pipeline`
        """
        pipeline = cls.__new__(cls)
        pipeline._pipelineIR = deserialized_pipeline
        return pipeline
179 """Create a new pipeline by copying an already existing `Pipeline`.
184 An already created pipeline intermediate representation object
190 return cls.
fromIR(copy.deep_copy(pipeline._pipelineIR))
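    # Because the intermediate representation is deep-copied, the returned
    # pipeline is independent of the original (sketch; label and field name
    # are illustrative):
    #
    #     copied = Pipeline.fromPipeline(pipeline)
    #     copied.addConfigOverride("exampleTask", "someField", 42)
    #     # ... the original pipeline is unchanged.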
196 """Add an instrument to the pipeline, or replace an instrument that is
201 instrument : `~lsst.daf.butler.instrument.Instrument` or `str`
202 Either a derived class object of a `lsst.daf.butler.instrument` or a
203 string corresponding to a fully qualified
204 `lsst.daf.butler.instrument` name.
206 if isinstance(instrument, str):
210 instrument = f
"{instrument.__module__}.{instrument.__qualname__}"
    def addTask(self, task: Union[PipelineTask, str], label: str):
        """Add a new task to the pipeline, or replace a task that is already
        associated with the supplied label.

        Parameters
        ----------
        task : `PipelineTask` or `str`
            Either a derived class object of a `PipelineTask` or a string
            corresponding to a fully qualified `PipelineTask` name.
        label : `str`
            A label that is used to identify the `PipelineTask` being added.
        """
        if isinstance(task, str):
            taskName = task
        elif issubclass(task, PipelineTask):
            taskName = f"{task.__module__}.{task.__qualname__}"
        else:
            raise ValueError("task must be either a child class of PipelineTask or a string containing"
                             " a fully qualified name to one")
        if not label:
            # A task may be supplied without a label; fall back to the task's
            # _DefaultName in that case.
            if isinstance(task, str):
                task = doImport(task)
            label = task._DefaultName
        self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName)
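    # Sketch of the two accepted forms (task classes and dotted paths are
    # illustrative):
    #
    #     pipeline.addTask(CharacterizeImageTask, "characterizeImage")
    #     pipeline.addTask("lsst.pipe.tasks.calibrate.CalibrateTask", "calibrate")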
242 """Remove a task from the pipeline.
247 The label used to identify the task that is to be removed
252 If no task with that label exists in the pipeline
258 """Apply single config override.
265 Fully-qualified field name.
267 Value to be given to a field.
272 """Add overrides from a specified file.
277 The label used to identify the task associated with config to
280 Path to the override file.
285 """Add Overrides by running a snippet of python code against a config.
290 The label used to identity the task associated with config to
293 A string which is valid python code to be executed. This is done
294 with config as the only local accessible value.
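    # The three override flavors, sketched (labels and field names are
    # illustrative):
    #
    #     pipeline.addConfigOverride("exampleTask", "someField", 42)
    #     pipeline.addConfigFile("exampleTask", "/path/to/overrides.py")
    #     pipeline.addConfigPython("exampleTask", "config.someField = 42")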
    def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR):
        if label not in self._pipelineIR.tasks:
            raise LookupError(f"There are no tasks labeled '{label}' in the pipeline")
        self._pipelineIR.tasks[label].add_or_update_config(newConfig)
307 """Returns a generator of TaskDefs which can be used to create quantum
312 generator : generator of `TaskDef`
313 The generator returned will be the sorted iterator of tasks which
314 are to be used in constructing a quantum graph.
319 If a dataId is supplied in a config block. This is in place for
        taskDefs = []
        for label, taskIR in self._pipelineIR.tasks.items():
            taskClass = doImport(taskIR.klass)
            taskName = taskClass.__qualname__
            config = taskClass.ConfigClass()
            overrides = ConfigOverrides()
            if self._pipelineIR.instrument is not None:
                overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName)
            if taskIR.config is not None:
                for configIR in taskIR.config:
                    if configIR.dataId is not None:
                        raise NotImplementedError("Specializing a config on a partial data id is not yet "
                                                  "supported in Pipeline definition")
                    # Only apply overrides that are not keyed on a dataId.
                    if configIR.dataId is None:
                        if configIR.file:
                            for configFile in configIR.file:
                                overrides.addFileOverride(configFile)
                        if configIR.python is not None:
                            overrides.addPythonOverride(configIR.python)
                        for key, value in configIR.rest.items():
                            overrides.addValueOverride(key, value)
            overrides.applyTo(config)
            taskDefs.append(TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label))
        # Evaluate any contracts declared in the pipeline document.
        if self._pipelineIR.contracts is not None:
            label_to_config = {x.label: x.config for x in taskDefs}
            for contract in self._pipelineIR.contracts:
                # Execute the eval on its own line so a failure produces a
                # useful error message.
                success = eval(contract.contract, None, label_to_config)
                if not success:
                    extra_info = f": {contract.msg}" if contract.msg is not None else ""
                    raise pipelineIR.ContractError(f"Contract(s) '{contract.contract}' were not "
                                                   f"satisfied{extra_info}")

        yield from pipeTools.orderPipeline(taskDefs)
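    # Iteration sketch: expanding a pipeline yields fully configured,
    # dependency-ordered TaskDefs.
    #
    #     for taskDef in pipeline.toExpandedPipeline():
    #         print(taskDef.label, taskDef.taskName)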
    def __eq__(self, other: object):
        if not isinstance(other, Pipeline):
            return False
        return self._pipelineIR == other._pipelineIR
@dataclass(frozen=True)
class TaskDatasetTypes:
    """An immutable struct that extracts and classifies the dataset types used
    by a `PipelineTask`.
    """
    initInputs: NamedValueSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct this
    Task.

    Task-level `initInputs` may be classified as either
    `~PipelineDatasetTypes.initInputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    initOutputs: NamedValueSet[DatasetType]
    """Dataset types that may be written after constructing this Task.

    Task-level `initOutputs` may be classified as either
    `~PipelineDatasetTypes.initOutputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    inputs: NamedValueSet[DatasetType]
    """Dataset types that are regular inputs to this Task.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s) or produced by another Task in the Pipeline, that Quantum
    (and all dependent Quanta) will not be produced.

    Task-level `inputs` may be classified as either
    `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """

    prerequisites: NamedValueSet[DatasetType]
    """Dataset types that are prerequisite inputs to this Task.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph - if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    outputs: NamedValueSet[DatasetType]
    """Dataset types that are produced by this Task.

    Task-level `outputs` may be classified as either
    `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """
    @classmethod
    def fromTaskDef(cls, taskDef: TaskDef, *, registry: Registry) -> TaskDatasetTypes:
        """Extract and classify the dataset types from a single `PipelineTask`.

        Parameters
        ----------
        taskDef : `TaskDef`
            An instance of a `TaskDef` class for a particular `PipelineTask`.
        registry : `Registry`
            Registry used to construct normalized `DatasetType` objects and
            retrieve those that are incomplete.

        Returns
        -------
        types : `TaskDatasetTypes`
            The dataset types used by this task.
        """
        def makeDatasetTypesSet(connectionType, freeze=True):
            """Construct a set of true `DatasetType` objects.

            Parameters
            ----------
            connectionType : `str`
                Name of the connection type to produce a set for; corresponds
                to an attribute of type `list` on the connection class
                instance.
            freeze : `bool`, optional
                If `True`, call `NamedValueSet.freeze` on the object returned.

            Returns
            -------
            datasetTypes : `NamedValueSet`
                A set of all datasetTypes which correspond to the input
                connection type specified in the connection class of this
                `PipelineTask`.

            Notes
            -----
            This function is a closure over the variables ``registry`` and
            ``taskDef``.
            """
            datasetTypes = NamedValueSet()
            for c in iterConnections(taskDef.connections, connectionType):
                dimensions = set(getattr(c, "dimensions", set()))
                if "skypix" in dimensions:
                    try:
                        datasetType = registry.getDatasetType(c.name)
                    except LookupError as err:
                        raise LookupError(
                            f"DatasetType '{c.name}' referenced by "
                            f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension "
                            f"placeholder, but does not already exist in the registry. "
                            f"Note that reference catalog names are now used as the dataset "
                            f"type name instead of 'ref_cat'."
                        ) from err
                    rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names)
                    rest2 = set(dim.name for dim in datasetType.dimensions
                                if not isinstance(dim, SkyPixDimension))
                    if rest1 != rest2:
                        raise ValueError(f"Non-skypix dimensions for dataset type {c.name} declared in "
                                         f"connections ({rest1}) are inconsistent with those in "
                                         f"registry's version of this dataset ({rest2}).")
                else:
                    datasetType = DatasetType(c.name, registry.dimensions.extract(dimensions),
                                              c.storageClass)
                    try:
                        registryDatasetType = registry.getDatasetType(c.name)
                    except KeyError:
                        registryDatasetType = datasetType
                    if datasetType != registryDatasetType:
                        raise ValueError(f"Supplied dataset type ({datasetType}) inconsistent with "
                                         f"registry definition ({registryDatasetType})")
                datasetTypes.add(datasetType)
            if freeze:
                datasetTypes.freeze()
            return datasetTypes
        # Optionally add an output dataset type for task metadata.
        outputs = makeDatasetTypesSet("outputs", freeze=False)
        if taskDef.metadataDatasetName is not None:
            # Metadata is supposed to be of the PropertyList type; its
            # dimensions correspond to a task quantum.
            dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
            outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions, "PropertyList")}
        outputs.freeze()
        return cls(
            initInputs=makeDatasetTypesSet("initInputs"),
            initOutputs=makeDatasetTypesSet("initOutputs"),
            inputs=makeDatasetTypesSet("inputs"),
            prerequisites=makeDatasetTypesSet("prerequisiteInputs"),
            outputs=outputs,
        )
@dataclass(frozen=True)
class PipelineDatasetTypes:
    """An immutable struct that classifies the dataset types used in a
    `Pipeline`.
    """
    initInputs: NamedValueSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct the Tasks
    in this Pipeline.

    This does not include dataset types that are produced when constructing
    other Tasks in the Pipeline (these are classified as `initIntermediates`).
    """

    initOutputs: NamedValueSet[DatasetType]
    """Dataset types that may be written after constructing the Tasks in this
    Pipeline.

    This does not include dataset types that are also used as inputs when
    constructing other Tasks in the Pipeline (these are classified as
    `initIntermediates`).
    """

    initIntermediates: NamedValueSet[DatasetType]
    """Dataset types that are both used when constructing one or more Tasks
    in the Pipeline and produced as a side-effect of constructing another
    Task in the Pipeline.
    """

    inputs: NamedValueSet[DatasetType]
    """Dataset types that are regular inputs for the full pipeline.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s), that Quantum (and all dependent Quanta) will not be
    produced.
    """

    prerequisites: NamedValueSet[DatasetType]
    """Dataset types that are prerequisite inputs for the full Pipeline.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph - if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    intermediates: NamedValueSet[DatasetType]
    """Dataset types that are output by one Task in the Pipeline and consumed
    as inputs by one or more other Tasks in the Pipeline.
    """

    outputs: NamedValueSet[DatasetType]
    """Dataset types that are output by a Task in the Pipeline and not
    consumed by any other Task in the Pipeline.
    """

    byTask: Mapping[str, TaskDatasetTypes]
    """Per-Task dataset types, keyed by label in the `Pipeline`.

    This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming
    neither has been modified since the dataset types were extracted, of
    course).
    """
    @classmethod
    def fromPipeline(cls, pipeline, *, registry: Registry) -> PipelineDatasetTypes:
        """Extract and classify the dataset types from all tasks in a
        `Pipeline`.

        Parameters
        ----------
        pipeline : `Pipeline`
            An ordered collection of tasks that can be run together.
        registry : `Registry`
            Registry used to construct normalized `DatasetType` objects and
            retrieve those that are incomplete.

        Returns
        -------
        types : `PipelineDatasetTypes`
            The dataset types used by this `Pipeline`.

        Raises
        ------
        ValueError
            Raised if Tasks are inconsistent about which datasets are marked
            prerequisite. This indicates that the Tasks cannot be run as part
            of the same `Pipeline`.
        """
        allInputs = NamedValueSet()
        allOutputs = NamedValueSet()
        allInitInputs = NamedValueSet()
        allInitOutputs = NamedValueSet()
        prerequisites = NamedValueSet()
        byTask = dict()
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        for taskDef in pipeline:
            thisTask = TaskDatasetTypes.fromTaskDef(taskDef, registry=registry)
            allInitInputs |= thisTask.initInputs
            allInitOutputs |= thisTask.initOutputs
            allInputs |= thisTask.inputs
            prerequisites |= thisTask.prerequisites
            allOutputs |= thisTask.outputs
            byTask[taskDef.label] = thisTask
        if not prerequisites.isdisjoint(allInputs):
            raise ValueError("{} marked as both prerequisites and regular inputs".format(
                {dt.name for dt in allInputs & prerequisites}
            ))
        if not prerequisites.isdisjoint(allOutputs):
            raise ValueError("{} marked as both prerequisites and outputs".format(
                {dt.name for dt in allOutputs & prerequisites}
            ))
        # Make sure that components which are marked as inputs get treated as
        # intermediates if there is an output which produces the composite
        # containing the component.
        intermediateComponents = NamedValueSet()
        intermediateComposites = NamedValueSet()
        outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
        for dsType in allInputs:
            # Get the name of a possible component.
            name, component = dsType.nameAndComponent()
            # If there is a component name, this is a component DatasetType;
            # if there is an output which produces the parent of this
            # component, treat this input as an intermediate.
            if component is not None:
                if name in outputNameMapping:
                    if outputNameMapping[name].dimensions != dsType.dimensions:
                        raise ValueError(f"Component dataset type {dsType.name} has different "
                                         f"dimensions ({dsType.dimensions}) than its parent "
                                         f"({outputNameMapping[name].dimensions}).")
                    composite = DatasetType(name, dsType.dimensions, outputNameMapping[name].storageClass,
                                            universe=registry.dimensions)
                    intermediateComponents.add(dsType)
                    intermediateComposites.add(composite)
        def checkConsistency(a: NamedValueSet, b: NamedValueSet):
            common = a.names & b.names
            for name in common:
                if a[name] != b[name]:
                    raise ValueError(f"Conflicting definitions for dataset type: "
                                     f"{a[name]} != {b[name]}.")

        checkConsistency(allInitInputs, allInitOutputs)
        checkConsistency(allInputs, allOutputs)
        checkConsistency(allInputs, intermediateComposites)
        checkConsistency(allOutputs, intermediateComposites)
        def frozen(s: NamedValueSet) -> NamedValueSet:
            s.freeze()
            return s

        return cls(
            initInputs=frozen(allInitInputs - allInitOutputs),
            initIntermediates=frozen(allInitInputs & allInitOutputs),
            initOutputs=frozen(allInitOutputs - allInitInputs),
            inputs=frozen(allInputs - allOutputs - intermediateComponents),
            intermediates=frozen(allInputs & allOutputs | intermediateComponents),
            outputs=frozen(allOutputs - allInputs - intermediateComposites),
            prerequisites=frozen(prerequisites),
            byTask=MappingProxyType(byTask),
        )
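# End-to-end classification sketch (illustrative; requires a butler Registry
# and a pipeline whose task classes can be imported):
#
#     pipeline = Pipeline.fromFile("pipelines/example.yaml")
#     datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=butler.registry)
#     # The set algebra above classifies each dataset type; for example, a
#     # pure input is something no task in the pipeline produces:
#     #     inputs = allInputs - allOutputs - intermediateComponents
#     print(datasetTypes.inputs.names)
#     print(datasetTypes.intermediates.names)
#     print(datasetTypes.outputs.names)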