from __future__ import annotations

"""Module defining Pipeline class and related methods.
"""

__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes"]

from dataclasses import dataclass
from types import MappingProxyType
from typing import FrozenSet, Mapping

from lsst.daf.butler import DatasetType, Registry, SkyPixDimension

from .connections import PipelineTaskConnections, iterConnections
50 """TaskDef is a collection of information about task needed by Pipeline. 52 The information includes task name, configuration object and optional 53 task class. This class is just a collection of attributes and it exposes 54 all of them so that attributes could potentially be modified in place 55 (e.g. if configuration needs extra overrides). 60 `PipelineTask` class name, currently it is not specified whether this 61 is a fully-qualified name or partial name (e.g. ``module.TaskClass``). 62 Framework should be prepared to handle all cases. 63 config : `lsst.pex.config.Config` 64 Instance of the configuration class corresponding to this task class, 65 usually with all overrides applied. 66 taskClass : `type` or ``None`` 67 `PipelineTask` class object, can be ``None``. If ``None`` then 68 framework will have to locate and load class. 69 label : `str`, optional 70 Task label, usually a short string unique in a pipeline. 72 def __init__(self, taskName, config, taskClass=None, label=""):
77 self.
connections = config.connections.ConnectionsClass(config=config)
82 rep +=
", label=" + self.
label 88 """Pipeline is a sequence of `TaskDef` objects. 90 Pipeline is given as one of the inputs to a supervising framework 91 which builds execution graph out of it. Pipeline contains a sequence 92 of `TaskDef` instances. 94 Main purpose of this class is to provide a mechanism to pass pipeline 95 definition from users to supervising framework. That mechanism is 96 implemented using simple serialization and de-serialization via 97 `pickle`. Note that pipeline serialization is not guaranteed to be 98 compatible between different versions or releases. 100 In current implementation Pipeline is a list (it inherits from `list`) 101 and one can use all list methods on pipeline. Content of the pipeline 102 can be modified, it is up to the client to verify that modifications 103 leave pipeline in a consistent state. One could modify container 104 directly by adding or removing its elements. 108 pipeline : iterable of `TaskDef` instances, optional 109 Initial sequence of tasks. 112 list.__init__(self, iterable
or [])
115 """Return task index given its label. 125 Task index, or -1 if label is not found. 127 for idx, taskDef
in enumerate(self):
128 if taskDef.label == label:
133 infos = [str(tdef)
for tdef
in self]
134 return "Pipeline({})".format(
", ".join(infos))


@dataclass(frozen=True)
class TaskDatasetTypes:
    """An immutable struct that extracts and classifies the dataset types used
    by a `PipelineTask`.
    """

    initInputs: FrozenSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct this Task.

    Task-level `initInputs` may be classified as either
    `~PipelineDatasetTypes.initInputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    initOutputs: FrozenSet[DatasetType]
    """Dataset types that may be written after constructing this Task.

    Task-level `initOutputs` may be classified as either
    `~PipelineDatasetTypes.initOutputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    inputs: FrozenSet[DatasetType]
    """Dataset types that are regular inputs to this Task.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s) or produced by another Task in the Pipeline, that Quantum
    (and all dependent Quanta) will not be produced.

    Task-level `inputs` may be classified as either
    `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """

    prerequisites: FrozenSet[DatasetType]
    """Dataset types that are prerequisite inputs to this Task.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph; if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    outputs: FrozenSet[DatasetType]
    """Dataset types that are produced by this Task.

    Task-level `outputs` may be classified as either
    `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """

    @classmethod
    def fromConnections(cls, connectionsInstance: PipelineTaskConnections, *,
                        registry: Registry) -> TaskDatasetTypes:
193 """Extract and classify the dataset types from a single `PipelineTask`. 197 connectionsInstance: `PipelineTaskConnections` 198 An instance of a `PipelineTaskConnections` class for a particular 201 Registry used to construct normalized `DatasetType` objects and 202 retrieve those that are incomplete. 206 types: `TaskDatasetTypes` 207 The dataset types used by this task. 209 def makeDatasetTypesSet(connectionType):
210 """Constructs a set of true `DatasetType` objects 214 connectionType : `str` 215 Name of the connection type to produce a set for, corresponds 216 to an attribute of type `list` on the connection class instance 220 datasetTypes : `frozenset` 221 A set of all datasetTypes which correspond to the input 222 connection type specified in the connection class of this 227 This function is a closure over the variables ``registry`` and 228 ``connectionsInstance``. 232 dimensions = set(getattr(c,
'dimensions', set()))
233 if "skypix" in dimensions:
235 datasetType = registry.getDatasetType(c.name)
236 except LookupError
as err:
238 f
"DatasetType '{c.name}' referenced by " 239 f
"{type(connectionsInstance).__name__} uses 'skypix' as a dimension " 240 f
"placeholder, but does not already exist in the registry. " 241 f
"Note that reference catalog names are now used as the dataset " 242 f
"type name instead of 'ref_cat'." 244 rest1 = set(registry.dimensions.extract(dimensions - set([
"skypix"])).names)
245 rest2 = set(dim.name
for dim
in datasetType.dimensions
246 if not isinstance(dim, SkyPixDimension))
248 raise ValueError(f
"Non-skypix dimensions for dataset type {c.name} declared in " 249 f
"connections ({rest1}) are inconsistent with those in " 250 f
"registry's version of this dataset ({rest2}).")
252 datasetType = DatasetType(c.name, registry.dimensions.extract(dimensions),
254 datasetTypes.append(datasetType)
255 return frozenset(datasetTypes)

        return cls(
            initInputs=makeDatasetTypesSet("initInputs"),
            initOutputs=makeDatasetTypesSet("initOutputs"),
            inputs=makeDatasetTypesSet("inputs"),
            prerequisites=makeDatasetTypesSet("prerequisiteInputs"),
            outputs=makeDatasetTypesSet("outputs"),
        )


@dataclass(frozen=True)
class PipelineDatasetTypes:
    """An immutable struct that classifies the dataset types used in a
    `Pipeline`.
    """

    initInputs: FrozenSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct the Tasks
    in this Pipeline.

    This does not include dataset types that are produced when constructing
    other Tasks in the Pipeline (these are classified as `initIntermediates`).
    """

    initOutputs: FrozenSet[DatasetType]
    """Dataset types that may be written after constructing the Tasks in this
    Pipeline.

    This does not include dataset types that are also used as inputs when
    constructing other Tasks in the Pipeline (these are classified as
    `initIntermediates`).
    """

    initIntermediates: FrozenSet[DatasetType]
    """Dataset types that are both used when constructing one or more Tasks
    in the Pipeline and produced as a side-effect of constructing another
    Task in the Pipeline.
    """

    inputs: FrozenSet[DatasetType]
    """Dataset types that are regular inputs for the full pipeline.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s), that Quantum (and all dependent Quanta) will not be
    produced.
    """

    prerequisites: FrozenSet[DatasetType]
    """Dataset types that are prerequisite inputs for the full Pipeline.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph; if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    intermediates: FrozenSet[DatasetType]
    """Dataset types that are output by one Task in the Pipeline and consumed
    as inputs by one or more other Tasks in the Pipeline.
    """

    outputs: FrozenSet[DatasetType]
    """Dataset types that are output by a Task in the Pipeline and not consumed
    by any other Task in the Pipeline.
    """

    byTask: Mapping[str, TaskDatasetTypes]
    """Per-Task dataset types, keyed by label in the `Pipeline`.

    This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming
    neither has been modified since the dataset types were extracted, of
    course).
    """

    @classmethod
    def fromPipeline(cls, pipeline: Pipeline, *, registry: Registry) -> PipelineDatasetTypes:
334 """Extract and classify the dataset types from all tasks in a 340 An ordered collection of tasks that can be run together. 342 Registry used to construct normalized `DatasetType` objects and 343 retrieve those that are incomplete. 347 types: `PipelineDatasetTypes` 348 The dataset types used by this `Pipeline`. 353 Raised if Tasks are inconsistent about which datasets are marked 354 prerequisite. This indicates that the Tasks cannot be run as part 355 of the same `Pipeline`. 359 allInitInputs = set()
360 allInitOutputs = set()
361 prerequisites = set()
363 for taskDef
in pipeline:
364 thisTask = TaskDatasetTypes.fromConnections(taskDef.connections, registry=registry)
365 allInitInputs.update(thisTask.initInputs)
366 allInitOutputs.update(thisTask.initOutputs)
367 allInputs.update(thisTask.inputs)
368 prerequisites.update(thisTask.prerequisites)
369 allOutputs.update(thisTask.outputs)
370 byTask[taskDef.label] = thisTask
        if not prerequisites.isdisjoint(allInputs):
            raise ValueError("{} marked as both prerequisites and regular inputs".format(
                {dt.name for dt in allInputs & prerequisites}
            ))
        if not prerequisites.isdisjoint(allOutputs):
            raise ValueError("{} marked as both prerequisites and outputs".format(
                {dt.name for dt in allOutputs & prerequisites}
            ))
        # Make sure that components which are marked as inputs get treated as
        # intermediates if there is an output which produces the composite
        # containing the component.
        intermediateComponents = set()
        intermediateComposites = set()
        outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
        for dsType in allInputs:
            # Get the name of a possible component.
            name, component = dsType.nameAndComponent()
            # If there is a component name, this is a component DatasetType;
            # if some output produces its parent composite, treat this input
            # as an intermediate.
            if component is not None:
                if name in outputNameMapping and outputNameMapping[name].dimensions == dsType.dimensions:
                    composite = DatasetType(name, dsType.dimensions, outputNameMapping[name].storageClass,
                                            universe=registry.dimensions)
                    intermediateComponents.add(dsType)
                    intermediateComposites.add(composite)
        return cls(
            initInputs=frozenset(allInitInputs - allInitOutputs),
            initIntermediates=frozenset(allInitInputs & allInitOutputs),
            initOutputs=frozenset(allInitOutputs - allInitInputs),
            inputs=frozenset(allInputs - allOutputs - intermediateComponents),
            intermediates=frozenset(allInputs & allOutputs | intermediateComponents),
            outputs=frozenset(allOutputs - allInputs - intermediateComposites),
            prerequisites=frozenset(prerequisites),
            byTask=MappingProxyType(byTask),
        )
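

# Example (an illustrative sketch of the classification above, using plain
# string sets instead of `DatasetType` objects and ignoring the special
# component/composite handling): if task "a" turns {"raw"} into {"calexp"}
# and task "b" turns {"calexp"} into {"coadd"}, then across the pipeline
#
#     allInputs = {"raw", "calexp"}
#     allOutputs = {"calexp", "coadd"}
#
# and the classification reduces to simple set algebra:
#
#     inputs = allInputs - allOutputs          # {"raw"}
#     intermediates = allInputs & allOutputs   # {"calexp"}
#     outputs = allOutputs - allInputs         # {"coadd"}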