lsst.pipe.base  18.0.0-2-g0ee56d7+7
pipeline.py
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""Module defining the Pipeline class and related methods.
"""
from __future__ import annotations

__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes"]

# -------------------------------
# Imports of standard modules --
# -------------------------------
from dataclasses import dataclass
from typing import FrozenSet, Mapping, Type
from types import MappingProxyType

# -----------------------------
# Imports for other modules --
# -----------------------------
from lsst.daf.butler import DatasetType, DimensionUniverse
from .pipelineTask import PipelineTask
from .config import PipelineTaskConfig

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

# ------------------------
# Exported definitions --
# ------------------------

class TaskDef:
    """TaskDef is a collection of information about a task needed by a
    Pipeline.

    The information includes the task name, a configuration object, and an
    optional task class.  This class is just a collection of attributes, and
    it exposes all of them so that they can be modified in place (e.g. if the
    configuration needs extra overrides).

    Attributes
    ----------
    taskName : `str`
        `PipelineTask` class name.  It is currently unspecified whether this
        is a fully-qualified name or a partial name (e.g.
        ``module.TaskClass``); the framework should be prepared to handle
        all cases.
    config : `lsst.pex.config.Config`
        Instance of the configuration class corresponding to this task class,
        usually with all overrides applied.
    taskClass : `type` or ``None``
        `PipelineTask` class object; may be ``None``.  If ``None``, the
        framework will have to locate and load the class.
    label : `str`, optional
        Task label, usually a short string unique within a pipeline.
    """
    def __init__(self, taskName, config, taskClass=None, label=""):
        self.taskName = taskName
        self.config = config
        self.taskClass = taskClass
        self.label = label

    def __str__(self):
        rep = "TaskDef(" + self.taskName
        if self.label:
            rep += ", label=" + self.label
        rep += ")"
        return rep

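
# A minimal usage sketch for TaskDef; ``MyTask`` and its config class are
# hypothetical stand-ins, not part of this module:
#
#     config = MyTask.ConfigClass()
#     taskDef = TaskDef("mypackage.MyTask", config, taskClass=MyTask,
#                       label="myTask")
#     print(taskDef)  # -> TaskDef(mypackage.MyTask, label=myTask)
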
class Pipeline(list):
    """Pipeline is a sequence of `TaskDef` objects.

    A Pipeline is given as one of the inputs to a supervising framework,
    which builds an execution graph out of it.  A Pipeline contains a
    sequence of `TaskDef` instances.

    The main purpose of this class is to provide a mechanism for passing a
    pipeline definition from users to the supervising framework.  That
    mechanism is implemented using simple serialization and de-serialization
    via `pickle`.  Note that pipeline serialization is not guaranteed to be
    compatible between different versions or releases.

    In the current implementation Pipeline is a list (it inherits from
    `list`), so all list methods can be used on a pipeline.  The content of
    the pipeline can be modified; it is up to the client to verify that
    modifications leave the pipeline in a consistent state.  One can modify
    the container directly by adding or removing its elements.

    Parameters
    ----------
    iterable : iterable of `TaskDef` instances, optional
        Initial sequence of tasks.
    """
    def __init__(self, iterable=None):
        list.__init__(self, iterable or [])

    def labelIndex(self, label):
        """Return task index given its label.

        Parameters
        ----------
        label : `str`
            Task label.

        Returns
        -------
        index : `int`
            Task index, or -1 if the label is not found.
        """
        for idx, taskDef in enumerate(self):
            if taskDef.label == label:
                return idx
        return -1

    def __str__(self):
        infos = [str(tdef) for tdef in self]
        return "Pipeline({})".format(", ".join(infos))

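
# A minimal usage sketch for Pipeline; the task names, configs, and labels
# are hypothetical.  As noted above, the pipeline travels between user and
# framework via simple pickle serialization:
#
#     import pickle
#
#     pipeline = Pipeline([TaskDef("mypackage.TaskA", configA, label="a"),
#                          TaskDef("mypackage.TaskB", configB, label="b")])
#     assert pipeline.labelIndex("b") == 1
#     assert pipeline.labelIndex("nonexistent") == -1
#     restored = pickle.loads(pickle.dumps(pipeline))  # round-trip copy
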
@dataclass(frozen=True)
class TaskDatasetTypes:
    """An immutable struct that extracts and classifies the dataset types
    used by a `PipelineTask`.
    """

    initInputs: FrozenSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct this
    Task.

    Task-level `initInputs` may be classified as either
    `~PipelineDatasetTypes.initInputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    initOutputs: FrozenSet[DatasetType]
    """Dataset types that may be written after constructing this Task.

    Task-level `initOutputs` may be classified as either
    `~PipelineDatasetTypes.initOutputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    inputs: FrozenSet[DatasetType]
    """Dataset types that are regular inputs to this Task.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s) or produced by another Task in the Pipeline, that Quantum
    (and all dependent Quanta) will not be produced.

    Task-level `inputs` may be classified as either
    `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """

    prerequisites: FrozenSet[DatasetType]
    """Dataset types that are prerequisite inputs to this Task.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph; if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    outputs: FrozenSet[DatasetType]
    """Dataset types that are produced by this Task.

    Task-level `outputs` may be classified as either
    `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """

    @classmethod
    def fromTask(cls, taskClass: Type[PipelineTask], config: PipelineTaskConfig, *,
                 universe: DimensionUniverse) -> TaskDatasetTypes:
        """Extract and classify the dataset types from a single
        `PipelineTask`.

        Parameters
        ----------
        taskClass : `type`
            A concrete `PipelineTask` subclass.
        config : `PipelineTaskConfig`
            Configuration for the concrete `PipelineTask`.
        universe : `DimensionUniverse`
            Set of all known dimensions, used to construct normalized
            `DatasetType` objects.

        Returns
        -------
        types : `TaskDatasetTypes`
            The dataset types used by this task.
        """
        # TODO: there is both a bit too much repetition here and not quite
        # enough to make it worthwhile to refactor it (i.e. inputs and
        # prerequisites are special, so we can't use the same code for them
        # as we could for the others).  But other work on PipelineTask
        # interfaces will eventually make this moot.
        allInputsByArgName = {k: descr.makeDatasetType(universe)
                              for k, descr in taskClass.getInputDatasetTypes(config).items()}
        prerequisiteArgNames = taskClass.getPrerequisiteDatasetTypes(config)
        return cls(
            initInputs=frozenset(descr.makeDatasetType(universe)
                                 for descr in taskClass.getInitInputDatasetTypes(config).values()),
            initOutputs=frozenset(descr.makeDatasetType(universe)
                                  for descr in taskClass.getInitOutputDatasetTypes(config).values()),
            inputs=frozenset(v for k, v in allInputsByArgName.items() if k not in prerequisiteArgNames),
            prerequisites=frozenset(v for k, v in allInputsByArgName.items() if k in prerequisiteArgNames),
            outputs=frozenset(descr.makeDatasetType(universe)
                              for descr in taskClass.getOutputDatasetTypes(config).values()),
        )

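
# A hedged sketch of per-task extraction; ``MyTask`` is a hypothetical
# concrete PipelineTask, and the dimension universe is assumed (not shown
# in this module) to come from a butler registry:
#
#     universe = butler.registry.dimensions
#     types = TaskDatasetTypes.fromTask(MyTask, MyTask.ConfigClass(),
#                                       universe=universe)
#     # Regular inputs and prerequisites are split by argument name above,
#     # so the two sets are expected to be disjoint:
#     assert types.inputs.isdisjoint(types.prerequisites)
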
@dataclass(frozen=True)
class PipelineDatasetTypes:
    """An immutable struct that classifies the dataset types used in a
    `Pipeline`.
    """

    initInputs: FrozenSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct the
    Tasks in this Pipeline.

    This does not include dataset types that are produced when constructing
    other Tasks in the Pipeline (these are classified as
    `initIntermediates`).
    """

    initOutputs: FrozenSet[DatasetType]
    """Dataset types that may be written after constructing the Tasks in
    this Pipeline.

    This does not include dataset types that are also used as inputs when
    constructing other Tasks in the Pipeline (these are classified as
    `initIntermediates`).
    """

    initIntermediates: FrozenSet[DatasetType]
    """Dataset types that are both used when constructing one or more Tasks
    in the Pipeline and produced as a side-effect of constructing another
    Task in the Pipeline.
    """

    inputs: FrozenSet[DatasetType]
    """Dataset types that are regular inputs for the full pipeline.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s), that Quantum (and all dependent Quanta) will not be
    produced.
    """

    prerequisites: FrozenSet[DatasetType]
    """Dataset types that are prerequisite inputs for the full Pipeline.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph; if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    intermediates: FrozenSet[DatasetType]
    """Dataset types that are output by one Task in the Pipeline and
    consumed as inputs by one or more other Tasks in the Pipeline.
    """

    outputs: FrozenSet[DatasetType]
    """Dataset types that are output by a Task in the Pipeline and not
    consumed by any other Task in the Pipeline.
    """

    byTask: Mapping[str, TaskDatasetTypes]
    """Per-Task dataset types, keyed by label in the `Pipeline`.

    This is guaranteed to be zip-iterable with the `Pipeline` itself
    (assuming neither has been modified since the dataset types were
    extracted, of course).
    """

    @classmethod
    def fromPipeline(cls, pipeline: Pipeline, *, universe: DimensionUniverse) -> PipelineDatasetTypes:
        """Extract and classify the dataset types from all tasks in a
        `Pipeline`.

        Parameters
        ----------
        pipeline : `Pipeline`
            An ordered collection of tasks that can be run together.
        universe : `DimensionUniverse`
            Set of all known dimensions, used to construct normalized
            `DatasetType` objects.

        Returns
        -------
        types : `PipelineDatasetTypes`
            The dataset types used by this `Pipeline`.

        Raises
        ------
        ValueError
            Raised if Tasks are inconsistent about which datasets are marked
            prerequisite.  This indicates that the Tasks cannot be run as
            part of the same `Pipeline`.
        """
        allInputs = set()
        allOutputs = set()
        allInitInputs = set()
        allInitOutputs = set()
        prerequisites = set()
        byTask = dict()
        for taskDef in pipeline:
            thisTask = TaskDatasetTypes.fromTask(taskDef.taskClass, taskDef.config, universe=universe)
            allInitInputs.update(thisTask.initInputs)
            allInitOutputs.update(thisTask.initOutputs)
            allInputs.update(thisTask.inputs)
            prerequisites.update(thisTask.prerequisites)
            allOutputs.update(thisTask.outputs)
            byTask[taskDef.label] = thisTask
        if not prerequisites.isdisjoint(allInputs):
            raise ValueError("{} marked as both prerequisites and regular inputs".format(
                {dt.name for dt in allInputs & prerequisites}
            ))
        if not prerequisites.isdisjoint(allOutputs):
            raise ValueError("{} marked as both prerequisites and outputs".format(
                {dt.name for dt in allOutputs & prerequisites}
            ))
        return cls(
            initInputs=frozenset(allInitInputs - allInitOutputs),
            initIntermediates=frozenset(allInitInputs & allInitOutputs),
            initOutputs=frozenset(allInitOutputs - allInitInputs),
            inputs=frozenset(allInputs - allOutputs),
            intermediates=frozenset(allInputs & allOutputs),
            outputs=frozenset(allOutputs - allInputs),
            prerequisites=frozenset(prerequisites),
            byTask=MappingProxyType(byTask),  # MappingProxyType -> frozen view of dict for immutability
        )
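
# A hedged sketch of pipeline-level classification; ``pipeline`` is a
# hypothetical Pipeline whose TaskDefs carry a concrete ``taskClass``, and
# ``universe`` is assumed as in the sketch above.  The set algebra in
# fromPipeline makes inputs, intermediates, and outputs a partition of the
# union of all per-task inputs and outputs:
#
#     types = PipelineDatasetTypes.fromPipeline(pipeline, universe=universe)
#     assert types.intermediates.isdisjoint(types.inputs)
#     assert types.intermediates.isdisjoint(types.outputs)
#     # byTask preserves pipeline order, so it zips with the Pipeline:
#     for taskDef, (label, taskTypes) in zip(pipeline, types.byTask.items()):
#         assert taskDef.label == label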