from __future__ import annotations
23 """Module defining Pipeline class and related methods. 26 __all__ = [
"Pipeline",
"TaskDef",
"TaskDatasetTypes",
"PipelineDatasetTypes"]
import copy

from dataclasses import dataclass
from types import MappingProxyType
from typing import Mapping, Union, Generator, TYPE_CHECKING

from lsst.daf.butler import DatasetType, Registry, SkyPixDimension
from lsst.daf.butler.core.utils import NamedValueSet
from lsst.utils import doImport

from .configOverrides import ConfigOverrides
from .connections import iterConnections
from .pipelineTask import PipelineTask

from . import pipelineIR
from . import pipeTools

if TYPE_CHECKING:  # Import needed only for type annotations.
    from lsst.obs.base.instrument import Instrument
62 """TaskDef is a collection of information about task needed by Pipeline. 64 The information includes task name, configuration object and optional 65 task class. This class is just a collection of attributes and it exposes 66 all of them so that attributes could potentially be modified in place 67 (e.g. if configuration needs extra overrides). 72 `PipelineTask` class name, currently it is not specified whether this 73 is a fully-qualified name or partial name (e.g. ``module.TaskClass``). 74 Framework should be prepared to handle all cases. 75 config : `lsst.pex.config.Config` 76 Instance of the configuration class corresponding to this task class, 77 usually with all overrides applied. 78 taskClass : `type` or ``None`` 79 `PipelineTask` class object, can be ``None``. If ``None`` then 80 framework will have to locate and load class. 81 label : `str`, optional 82 Task label, usually a short string unique in a pipeline. 84 def __init__(self, taskName, config, taskClass=None, label=""):
89 self.
connections = config.connections.ConnectionsClass(config=config)
93 """Name of a dataset type for metadata of this task, `None` if 94 metadata is not to be saved (`str`) 96 if self.
config.saveMetadata:
97 return self.
label +
"_metadata" 104 rep +=
", label=" + self.
label 110 """A `Pipeline` is a representation of a series of tasks to run, and the 111 configuration for those tasks. 116 A description of that this pipeline does. 119 pipeline_dict = {
"description": description,
"tasks": {}}
124 """Load a pipeline defined in a pipeline yaml file. 129 A path that points to a pipeline defined in yaml format 135 pipeline = cls.
fromIR(pipelineIR.PipelineIR.from_file(filename))
140 """Create a pipeline from string formatted as a pipeline document. 144 pipeline_string : `str` 145 A string that is formatted according like a pipeline document 151 pipeline = cls.
fromIR(pipelineIR.PipelineIR.from_string(pipeline_string))
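    # Illustrative usage sketch (not part of the module): the two constructors
    # above differ only in where the pipeline document comes from; the file
    # name used here is a hypothetical placeholder.
    #
    #     pipeline = Pipeline.fromFile("myPipeline.yaml")
    #     # or, equivalently, from an already-read document string:
    #     pipeline = Pipeline.fromString(open("myPipeline.yaml").read())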
    @classmethod
    def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline:
        """Create a pipeline from an already created `PipelineIR` object.

        Parameters
        ----------
        deserialized_pipeline : `PipelineIR`
            An already created pipeline intermediate representation object.

        Returns
        -------
        pipeline : `Pipeline`
        """
        pipeline = cls.__new__(cls)
        pipeline._pipelineIR = deserialized_pipeline
        return pipeline

    @classmethod
    def fromPipeline(cls, pipeline: Pipeline) -> Pipeline:
        """Create a new pipeline by copying an already existing `Pipeline`.

        Parameters
        ----------
        pipeline : `Pipeline`
            An already created pipeline intermediate representation object.

        Returns
        -------
        pipeline : `Pipeline`
        """
        return cls.fromIR(copy.deepcopy(pipeline._pipelineIR))
190 """Add an instrument to the pipeline, or replace an instrument that is 195 instrument : `~lsst.daf.butler.instrument.Instrument` or `str` 196 Either a derived class object of a `lsst.daf.butler.instrument` or a 197 string corresponding to a fully qualified 198 `lsst.daf.butler.instrument` name. 200 if isinstance(instrument, str):
204 instrument = f
"{instrument.__module__}.{instrument.__qualname__}" 207 def addTask(self, task: Union[PipelineTask, str], label: str):
208 """Add a new task to the pipeline, or replace a task that is already 209 associated with the supplied label. 213 task: `PipelineTask` or `str` 214 Either a derived class object of a `PipelineTask` or a string 215 corresponding to a fully qualified `PipelineTask` name. 217 A label that is used to identify the `PipelineTask` being added 219 if isinstance(task, str):
221 elif issubclass(task, PipelineTask):
222 taskName = f
"{task.__module__}.{task.__qualname__}" 224 raise ValueError(
"task must be either a child class of PipelineTask or a string containing" 225 " a fully qualified name to one")
230 if isinstance(task, str):
231 task = doImport(task)
232 label = task._DefaultName
236 """Remove a task from the pipeline. 241 The label used to identify the task that is to be removed 246 If no task with that label exists in the pipeline 252 """Apply single config override. 259 Fully-qualified field name. 261 Value to be given to a field. 266 """Add overrides from a specified file. 271 The label used to identify the task associated with config to 274 Path to the override file. 279 """Add Overrides by running a snippet of python code against a config. 284 The label used to identity the task associated with config to 287 A string which is valid python code to be executed. This is done 288 with config as the only local accessible value. 294 raise LookupError(f
"There are no tasks labeled '{label}' in the pipeline")
295 self.
_pipelineIR.tasks[label].add_or_update_config(newConfig)
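    # Illustrative usage sketch (not part of the module): building a pipeline
    # programmatically and layering configuration overrides on a task.  The
    # task name "lsst.example.ExampleTask", the label "example" and the field
    # name "someField" are hypothetical placeholders.
    #
    #     pipeline = Pipeline("A demonstration pipeline")
    #     pipeline.addTask("lsst.example.ExampleTask", label="example")
    #     pipeline.addConfigOverride("example", "someField", 42)
    #     pipeline.addConfigFile("example", "exampleOverrides.py")
    #     pipeline.addConfigPython("example", "config.someField = 43")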
301 """Returns a generator of TaskDefs which can be used to create quantum 306 generator : generator of `TaskDef` 307 The generator returned will be the sorted iterator of tasks which 308 are to be used in constructing a quantum graph. 313 If a dataId is supplied in a config block. This is in place for 317 for label, taskIR
in self.
_pipelineIR.tasks.items():
318 taskClass = doImport(taskIR.klass)
319 taskName = taskClass.__qualname__
320 config = taskClass.ConfigClass()
323 overrides.addInstrumentOverride(self.
_pipelineIR.instrument, taskClass._DefaultName)
324 if taskIR.config
is not None:
325 for configIR
in taskIR.config:
326 if configIR.dataId
is not None:
327 raise NotImplementedError(
"Specializing a config on a partial data id is not yet " 328 "supported in Pipeline definition")
330 if configIR.dataId
is None:
332 for configFile
in configIR.file:
333 overrides.addFileOverride(configFile)
334 if configIR.python
is not None:
335 overrides.addPythonOverride(configIR.python)
336 for key, value
in configIR.rest.items():
337 overrides.addValueOverride(key, value)
338 overrides.applyTo(config)
341 taskDefs.append(
TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label))
345 label_to_config = {x.label: x.config
for x
in taskDefs}
349 success = eval(contract.contract,
None, label_to_config)
351 extra_info = f
": {contract.msg}" if contract.msg
is not None else "" 353 f
"satisfied{extra_info}")
355 yield from pipeTools.orderPipeline(taskDefs)
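    # Note on contracts (illustrative): ``contract.contract`` is a Python
    # expression evaluated with the task labels bound to their expanded
    # configs, so a contract can assert cross-task consistency, e.g. the
    # hypothetical expression "exampleA.someField == exampleB.someField".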
    def __eq__(self, other: "Pipeline"):
        if not isinstance(other, Pipeline):
            return False
        return self._pipelineIR == other._pipelineIR
@dataclass(frozen=True)
class TaskDatasetTypes:
    """An immutable struct that extracts and classifies the dataset types used
    by a `PipelineTask`.
    """

    initInputs: NamedValueSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct this Task.

    Task-level `initInputs` may be classified as either
    `~PipelineDatasetTypes.initInputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    initOutputs: NamedValueSet[DatasetType]
    """Dataset types that may be written after constructing this Task.

    Task-level `initOutputs` may be classified as either
    `~PipelineDatasetTypes.initOutputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    inputs: NamedValueSet[DatasetType]
    """Dataset types that are regular inputs to this Task.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s) or produced by another Task in the Pipeline, that Quantum
    (and all dependent Quanta) will not be produced.

    Task-level `inputs` may be classified as either
    `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """

    prerequisites: NamedValueSet[DatasetType]
    """Dataset types that are prerequisite inputs to this Task.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph - if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    outputs: NamedValueSet[DatasetType]
    """Dataset types that are produced by this Task.

    Task-level `outputs` may be classified as either
    `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """

    @classmethod
    def fromTaskDef(cls, taskDef: TaskDef, *, registry: Registry) -> TaskDatasetTypes:
421 """Extract and classify the dataset types from a single `PipelineTask`. 426 An instance of a `TaskDef` class for a particular `PipelineTask`. 428 Registry used to construct normalized `DatasetType` objects and 429 retrieve those that are incomplete. 433 types: `TaskDatasetTypes` 434 The dataset types used by this task. 436 def makeDatasetTypesSet(connectionType, freeze=True):
437 """Constructs a set of true `DatasetType` objects 441 connectionType : `str` 442 Name of the connection type to produce a set for, corresponds 443 to an attribute of type `list` on the connection class instance 444 freeze : `bool`, optional 445 If `True`, call `NamedValueSet.freeze` on the object returned. 449 datasetTypes : `NamedValueSet` 450 A set of all datasetTypes which correspond to the input 451 connection type specified in the connection class of this 456 This function is a closure over the variables ``registry`` and 459 datasetTypes = NamedValueSet()
461 dimensions = set(getattr(c,
'dimensions', set()))
462 if "skypix" in dimensions:
464 datasetType = registry.getDatasetType(c.name)
465 except LookupError
as err:
467 f
"DatasetType '{c.name}' referenced by " 468 f
"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 469 f
"placeholder, but does not already exist in the registry. " 470 f
"Note that reference catalog names are now used as the dataset " 471 f
"type name instead of 'ref_cat'." 473 rest1 = set(registry.dimensions.extract(dimensions - set([
"skypix"])).names)
474 rest2 = set(dim.name
for dim
in datasetType.dimensions
475 if not isinstance(dim, SkyPixDimension))
477 raise ValueError(f
"Non-skypix dimensions for dataset type {c.name} declared in " 478 f
"connections ({rest1}) are inconsistent with those in " 479 f
"registry's version of this dataset ({rest2}).")
481 datasetType = DatasetType(c.name, registry.dimensions.extract(dimensions),
484 registryDatasetType = registry.getDatasetType(c.name)
486 registryDatasetType = datasetType
487 if datasetType != registryDatasetType:
488 raise ValueError(f
"Supplied dataset type ({datasetType}) inconsistent with " 489 f
"registry definition ({registryDatasetType})")
490 datasetTypes.add(datasetType)
492 datasetTypes.freeze()
496 outputs = makeDatasetTypesSet(
"outputs", freeze=
False)
497 if taskDef.metadataDatasetName
is not None:
500 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
501 outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions,
"PropertyList")}
505 initInputs=makeDatasetTypesSet(
"initInputs"),
506 initOutputs=makeDatasetTypesSet(
"initOutputs"),
507 inputs=makeDatasetTypesSet(
"inputs"),
508 prerequisites=makeDatasetTypesSet(
"prerequisiteInputs"),
@dataclass(frozen=True)
class PipelineDatasetTypes:
    """An immutable struct that classifies the dataset types used in a
    `Pipeline`.
    """

    initInputs: NamedValueSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct the Tasks
    in this Pipeline.

    This does not include dataset types that are produced when constructing
    other Tasks in the Pipeline (these are classified as `initIntermediates`).
    """

    initOutputs: NamedValueSet[DatasetType]
    """Dataset types that may be written after constructing the Tasks in this
    Pipeline.

    This does not include dataset types that are also used as inputs when
    constructing other Tasks in the Pipeline (these are classified as
    `initIntermediates`).
    """

    initIntermediates: NamedValueSet[DatasetType]
    """Dataset types that are both used when constructing one or more Tasks
    in the Pipeline and produced as a side-effect of constructing another
    Task in the Pipeline.
    """

    inputs: NamedValueSet[DatasetType]
    """Dataset types that are regular inputs for the full pipeline.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s), that Quantum (and all dependent Quanta) will not be
    produced.
    """

    prerequisites: NamedValueSet[DatasetType]
    """Dataset types that are prerequisite inputs for the full Pipeline.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph - if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    intermediates: NamedValueSet[DatasetType]
    """Dataset types that are output by one Task in the Pipeline and consumed
    as inputs by one or more other Tasks in the Pipeline.
    """

    outputs: NamedValueSet[DatasetType]
    """Dataset types that are output by a Task in the Pipeline and not consumed
    by any other Task in the Pipeline.
    """

    byTask: Mapping[str, TaskDatasetTypes]
    """Per-Task dataset types, keyed by label in the `Pipeline`.

    This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming
    neither has been modified since the dataset types were extracted, of
    course).
    """

    @classmethod
    def fromPipeline(cls, pipeline, *, registry: Registry) -> PipelineDatasetTypes:
581 """Extract and classify the dataset types from all tasks in a 587 An ordered collection of tasks that can be run together. 589 Registry used to construct normalized `DatasetType` objects and 590 retrieve those that are incomplete. 594 types: `PipelineDatasetTypes` 595 The dataset types used by this `Pipeline`. 600 Raised if Tasks are inconsistent about which datasets are marked 601 prerequisite. This indicates that the Tasks cannot be run as part 602 of the same `Pipeline`. 604 allInputs = NamedValueSet()
605 allOutputs = NamedValueSet()
606 allInitInputs = NamedValueSet()
607 allInitOutputs = NamedValueSet()
608 prerequisites = NamedValueSet()
610 if isinstance(pipeline, Pipeline):
611 pipeline = pipeline.toExpandedPipeline()
612 for taskDef
in pipeline:
613 thisTask = TaskDatasetTypes.fromTaskDef(taskDef, registry=registry)
614 allInitInputs |= thisTask.initInputs
615 allInitOutputs |= thisTask.initOutputs
616 allInputs |= thisTask.inputs
617 prerequisites |= thisTask.prerequisites
618 allOutputs |= thisTask.outputs
619 byTask[taskDef.label] = thisTask
620 if not prerequisites.isdisjoint(allInputs):
621 raise ValueError(
"{} marked as both prerequisites and regular inputs".format(
622 {dt.name
for dt
in allInputs & prerequisites}
624 if not prerequisites.isdisjoint(allOutputs):
625 raise ValueError(
"{} marked as both prerequisites and outputs".format(
626 {dt.name
for dt
in allOutputs & prerequisites}
631 intermediateComponents = NamedValueSet()
632 intermediateComposites = NamedValueSet()
633 outputNameMapping = {dsType.name: dsType
for dsType
in allOutputs}
634 for dsType
in allInputs:
636 name, component = dsType.nameAndComponent()
640 if component
is not None:
641 if name
in outputNameMapping:
642 if outputNameMapping[name].dimensions != dsType.dimensions:
643 raise ValueError(f
"Component dataset type {dsType.name} has different " 644 f
"dimensions ({dsType.dimensions}) than its parent " 645 f
"({outputNameMapping[name].dimensions}).")
646 composite = DatasetType(name, dsType.dimensions, outputNameMapping[name].storageClass,
647 universe=registry.dimensions)
648 intermediateComponents.add(dsType)
649 intermediateComposites.add(composite)
651 def checkConsistency(a: NamedValueSet, b: NamedValueSet):
652 common = a.names & b.names
654 if a[name] != b[name]:
655 raise ValueError(f
"Conflicting definitions for dataset type: {a[name]} != {b[name]}.")
657 checkConsistency(allInitInputs, allInitOutputs)
658 checkConsistency(allInputs, allOutputs)
659 checkConsistency(allInputs, intermediateComposites)
660 checkConsistency(allOutputs, intermediateComposites)
662 def frozen(s: NamedValueSet) -> NamedValueSet:
667 initInputs=
frozen(allInitInputs - allInitOutputs),
668 initIntermediates=
frozen(allInitInputs & allInitOutputs),
669 initOutputs=
frozen(allInitOutputs - allInitInputs),
670 inputs=
frozen(allInputs - allOutputs - intermediateComponents),
671 intermediates=
frozen(allInputs & allOutputs | intermediateComponents),
672 outputs=
frozen(allOutputs - allInputs - intermediateComposites),
673 prerequisites=
frozen(prerequisites),
674 byTask=MappingProxyType(byTask),
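# Illustrative usage sketch (not part of the module): classifying dataset
# types for a whole pipeline.  ``pipeline`` and ``butler`` are assumed to be
# a previously constructed `Pipeline` and `lsst.daf.butler.Butler`.
#
#     datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=butler.registry)
#     print("pure inputs:", datasetTypes.inputs.names)
#     print("intermediates:", datasetTypes.intermediates.names)
#     print("final outputs:", datasetTypes.outputs.names)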