lsst.pipe.base  18.1.0-5-gbd1decb
pipeline.py
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining Pipeline class and related methods.
"""

__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes"]

# -------------------------------
#  Imports of standard modules --
# -------------------------------
from dataclasses import dataclass
from types import MappingProxyType
from typing import FrozenSet, Mapping

# -----------------------------
#  Imports for other modules --
# -----------------------------
from lsst.daf.butler import DatasetType, DimensionUniverse
from .connections import PipelineTaskConnections, iterConnections

# ----------------------------------
#  Local non-exported definitions --
# ----------------------------------

# ------------------------
#  Exported definitions --
# ------------------------

class TaskDef:
    """TaskDef is a collection of information about a task needed by a
    Pipeline.

    The information includes the task name, a configuration object, and
    optionally the task class. This class is just a collection of attributes,
    and it exposes all of them so that they can be modified in place (e.g. if
    the configuration needs extra overrides).

    Attributes
    ----------
    taskName : `str`
        `PipelineTask` class name; it is currently unspecified whether this
        is a fully-qualified or a partial name (e.g. ``module.TaskClass``),
        so the framework should be prepared to handle both.
    config : `lsst.pex.config.Config`
        Instance of the configuration class corresponding to this task class,
        usually with all overrides applied.
    taskClass : `type` or `None`
        `PipelineTask` class object; may be `None`, in which case the
        framework will have to locate and load the class.
    label : `str`, optional
        Task label, usually a short string unique within a pipeline.
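
    Examples
    --------
    A minimal sketch of constructing a `TaskDef` by hand; ``ExampleTask``
    and ``mypackage`` are hypothetical stand-ins for a real `PipelineTask`
    subclass and its module:

    >>> config = ExampleTask.ConfigClass()  # doctest: +SKIP
    >>> taskDef = TaskDef("mypackage.ExampleTask", config=config,
    ...                   taskClass=ExampleTask, label="example")  # doctest: +SKIP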
    """
    def __init__(self, taskName, config, taskClass=None, label=""):
        self.taskName = taskName
        self.config = config
        self.taskClass = taskClass
        self.label = label
        self.connections = config.connections.ConnectionsClass(config=config)

    def __str__(self):
        rep = "TaskDef(" + self.taskName
        if self.label:
            rep += ", label=" + self.label
        rep += ")"
        return rep


class Pipeline(list):
    """Pipeline is a sequence of `TaskDef` objects.

    A Pipeline is given as one of the inputs to a supervising framework,
    which builds an execution graph out of it.

    The main purpose of this class is to provide a mechanism for passing a
    pipeline definition from users to the supervising framework. That
    mechanism is implemented using simple serialization and de-serialization
    via `pickle`. Note that pipeline serialization is not guaranteed to be
    compatible between different versions or releases.

    In the current implementation Pipeline is a list (it inherits from
    `list`), so all list methods can be used on it and its contents can be
    modified directly by adding or removing elements; it is up to the client
    to verify that modifications leave the pipeline in a consistent state.

    Parameters
    ----------
    iterable : iterable of `TaskDef` instances, optional
        Initial sequence of tasks.
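
    Examples
    --------
    A minimal sketch, assuming ``taskDefA`` and ``taskDefB`` are existing
    `TaskDef` instances:

    >>> pipeline = Pipeline([taskDefA])  # doctest: +SKIP
    >>> pipeline.append(taskDefB)  # any list method works  # doctest: +SKIP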
    """
    def __init__(self, iterable=None):
        list.__init__(self, iterable or [])

    def labelIndex(self, label):
        """Return the index of a task given its label.

        Parameters
        ----------
        label : `str`
            Task label.

        Returns
        -------
        index : `int`
            Task index, or -1 if the label is not found.
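
        Examples
        --------
        Assuming this pipeline's first task is labeled ``"isr"``:

        >>> pipeline.labelIndex("isr")  # doctest: +SKIP
        0
        >>> pipeline.labelIndex("no-such-label")  # doctest: +SKIP
        -1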
        """
        for idx, taskDef in enumerate(self):
            if taskDef.label == label:
                return idx
        return -1

    def __str__(self):
        infos = [str(tdef) for tdef in self]
        return "Pipeline({})".format(", ".join(infos))


@dataclass(frozen=True)
class TaskDatasetTypes:
    """An immutable struct that extracts and classifies the dataset types
    used by a `PipelineTask`.
    """

    initInputs: FrozenSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct this
    Task.

    Task-level `initInputs` may be classified as either
    `~PipelineDatasetTypes.initInputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    initOutputs: FrozenSet[DatasetType]
    """Dataset types that may be written after constructing this Task.

    Task-level `initOutputs` may be classified as either
    `~PipelineDatasetTypes.initOutputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    inputs: FrozenSet[DatasetType]
    """Dataset types that are regular inputs to this Task.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s) or produced by another Task in the Pipeline, that Quantum
    (and all dependent Quanta) will not be produced.

    Task-level `inputs` may be classified as either
    `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """

    prerequisites: FrozenSet[DatasetType]
    """Dataset types that are prerequisite inputs to this Task.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph - if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    outputs: FrozenSet[DatasetType]
    """Dataset types that are produced by this Task.

    Task-level `outputs` may be classified as either
    `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """

    @classmethod
    def fromConnections(cls, connectionsInstance: PipelineTaskConnections, *,
                        universe: DimensionUniverse) -> TaskDatasetTypes:
        """Extract and classify the dataset types from a single
        `PipelineTask`.

        Parameters
        ----------
        connectionsInstance : `PipelineTaskConnections`
            An instance of a `PipelineTaskConnections` class for a particular
            `PipelineTask`.
        universe : `DimensionUniverse`
            Set of all known dimensions, used to construct normalized
            `DatasetType` objects.

        Returns
        -------
        types : `TaskDatasetTypes`
            The dataset types used by this task.
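
        Examples
        --------
        A sketch, assuming ``taskDef`` is an existing `TaskDef` and
        ``butler`` is an initialized `~lsst.daf.butler.Butler` whose
        registry provides the dimension universe:

        >>> types = TaskDatasetTypes.fromConnections(
        ...     taskDef.connections, universe=butler.registry.dimensions)  # doctest: +SKIP
        >>> inputNames = sorted(dt.name for dt in types.inputs)  # doctest: +SKIP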
        """
        def makeDatasetTypesSet(connectionType):
            """Construct a set of true `DatasetType` objects.

            Parameters
            ----------
            connectionType : `str`
                Name of the connection type to produce a set for; corresponds
                to an attribute of type `list` on the connection class
                instance.

            Returns
            -------
            datasetTypes : `frozenset`
                A set of all the `DatasetType` objects that correspond to the
                connection type specified in the connection class of this
                `PipelineTask`.

            Notes
            -----
            This function is a closure over the variables ``universe`` and
            ``connectionsInstance``.
            """
            datasetTypes = []
            for c in iterConnections(connectionsInstance, connectionType):
                dimensions = getattr(c, 'dimensions', set())
                datasetTypes.append(DatasetType(c.name, universe.extract(dimensions), c.storageClass))
            return frozenset(datasetTypes)

        return cls(
            initInputs=makeDatasetTypesSet("initInputs"),
            initOutputs=makeDatasetTypesSet("initOutputs"),
            inputs=makeDatasetTypesSet("inputs"),
            prerequisites=makeDatasetTypesSet("prerequisiteInputs"),
            outputs=makeDatasetTypesSet("outputs"),
        )


@dataclass(frozen=True)
class PipelineDatasetTypes:
    """An immutable struct that classifies the dataset types used in a
    `Pipeline`.
    """

    initInputs: FrozenSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct the
    Tasks in this Pipeline.

    This does not include dataset types that are produced when constructing
    other Tasks in the Pipeline (these are classified as `initIntermediates`).
    """

    initOutputs: FrozenSet[DatasetType]
    """Dataset types that may be written after constructing the Tasks in
    this Pipeline.

    This does not include dataset types that are also used as inputs when
    constructing other Tasks in the Pipeline (these are classified as
    `initIntermediates`).
    """

    initIntermediates: FrozenSet[DatasetType]
    """Dataset types that are both used when constructing one or more Tasks
    in the Pipeline and produced as a side-effect of constructing another
    Task in the Pipeline.
    """

    inputs: FrozenSet[DatasetType]
    """Dataset types that are regular inputs for the full pipeline.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s), that Quantum (and all dependent Quanta) will not be
    produced.
    """

    prerequisites: FrozenSet[DatasetType]
    """Dataset types that are prerequisite inputs for the full Pipeline.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph - if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    intermediates: FrozenSet[DatasetType]
    """Dataset types that are output by one Task in the Pipeline and consumed
    as inputs by one or more other Tasks in the Pipeline.
    """

    outputs: FrozenSet[DatasetType]
    """Dataset types that are output by a Task in the Pipeline and not
    consumed by any other Task in the Pipeline.
    """

    byTask: Mapping[str, TaskDatasetTypes]
    """Per-Task dataset types, keyed by label in the `Pipeline`.

    This is guaranteed to be zip-iterable with the `Pipeline` itself
    (assuming neither has been modified since the dataset types were
    extracted, of course).
    """

    @classmethod
    def fromPipeline(cls, pipeline: Pipeline, *, universe: DimensionUniverse) -> PipelineDatasetTypes:
        """Extract and classify the dataset types from all tasks in a
        `Pipeline`.

        Parameters
        ----------
        pipeline : `Pipeline`
            An ordered collection of tasks that can be run together.
        universe : `DimensionUniverse`
            Set of all known dimensions, used to construct normalized
            `DatasetType` objects.

        Returns
        -------
        types : `PipelineDatasetTypes`
            The dataset types used by this `Pipeline`.

        Raises
        ------
        ValueError
            Raised if Tasks are inconsistent about which datasets are marked
            prerequisite. This indicates that the Tasks cannot be run as part
            of the same `Pipeline`.
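
        Examples
        --------
        A sketch, assuming ``pipeline`` is an existing `Pipeline` and
        ``butler`` is an initialized `~lsst.daf.butler.Butler`:

        >>> datasetTypes = PipelineDatasetTypes.fromPipeline(
        ...     pipeline, universe=butler.registry.dimensions)  # doctest: +SKIP
        >>> for taskDef, types in zip(pipeline, datasetTypes.byTask.values()):  # doctest: +SKIP
        ...     print(taskDef.label, sorted(dt.name for dt in types.outputs))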
        """
        allInputs = set()
        allOutputs = set()
        allInitInputs = set()
        allInitOutputs = set()
        prerequisites = set()
        byTask = dict()
        for taskDef in pipeline:
            thisTask = TaskDatasetTypes.fromConnections(taskDef.connections, universe=universe)
            allInitInputs.update(thisTask.initInputs)
            allInitOutputs.update(thisTask.initOutputs)
            allInputs.update(thisTask.inputs)
            prerequisites.update(thisTask.prerequisites)
            allOutputs.update(thisTask.outputs)
            byTask[taskDef.label] = thisTask
        if not prerequisites.isdisjoint(allInputs):
            raise ValueError("{} marked as both prerequisites and regular inputs".format(
                {dt.name for dt in allInputs & prerequisites}
            ))
        if not prerequisites.isdisjoint(allOutputs):
            raise ValueError("{} marked as both prerequisites and outputs".format(
                {dt.name for dt in allOutputs & prerequisites}
            ))
        # Make sure that components which are marked as inputs get treated as
        # intermediates if there is an output which produces the composite
        # containing the component
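        # (e.g. a component input such as ``calexp.wcs`` should be classified
        # as an intermediate when another task produces the composite
        # ``calexp``).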
        intermediateComponents = set()
        intermediateComposites = set()
        outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
        for dsType in allInputs:
            # get the name of a possible component
            name, component = dsType.nameAndComponent()
            # if there is a component name, this is a component DatasetType;
            # if there is an output which produces the parent of this
            # component, treat this input as an intermediate
            if component is not None:
                if name in outputNameMapping and outputNameMapping[name].dimensions == dsType.dimensions:
                    composite = DatasetType(name, dsType.dimensions, outputNameMapping[name].storageClass,
                                            universe=universe)
                    intermediateComponents.add(dsType)
                    intermediateComposites.add(composite)
        return cls(
            initInputs=frozenset(allInitInputs - allInitOutputs),
            initIntermediates=frozenset(allInitInputs & allInitOutputs),
            initOutputs=frozenset(allInitOutputs - allInitInputs),
            inputs=frozenset(allInputs - allOutputs - intermediateComponents),
            intermediates=frozenset(allInputs & allOutputs | intermediateComponents),
            outputs=frozenset(allOutputs - allInputs - intermediateComposites),
            prerequisites=frozenset(prerequisites),
            byTask=MappingProxyType(byTask),  # MappingProxyType -> frozen view of dict for immutability
        )