lsst.pipe.base  18.1.0-7-g85d95c9+2
pipeline.py
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining the Pipeline class and related methods.
"""

__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes"]

# -------------------------------
# Imports of standard modules --
# -------------------------------
from dataclasses import dataclass
from types import MappingProxyType
from typing import FrozenSet, Mapping

# -----------------------------
# Imports for other modules --
# -----------------------------
from lsst.daf.butler import DatasetType, Registry, SkyPixDimension
from .connections import PipelineTaskConnections, iterConnections

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

# ------------------------
# Exported definitions --
# ------------------------


class TaskDef:
    """TaskDef is a collection of information about a task that is needed
    by a Pipeline.

    The information includes the task name, a configuration object, and an
    optional task class. This class is just a collection of attributes,
    and it exposes all of them so that they can be modified in place (e.g.
    if the configuration needs extra overrides).

    Attributes
    ----------
    taskName : `str`
        `PipelineTask` class name; currently it is not specified whether
        this is a fully-qualified or a partial name (e.g.
        ``module.TaskClass``). The framework should be prepared to handle
        both cases.
    config : `lsst.pex.config.Config`
        Instance of the configuration class corresponding to this task
        class, usually with all overrides applied.
    taskClass : `type` or ``None``
        `PipelineTask` class object; may be ``None``. If ``None``, the
        framework will have to locate and load the class.
    label : `str`, optional
        Task label, usually a short string unique within a pipeline.
    """
    def __init__(self, taskName, config, taskClass=None, label=""):
        self.taskName = taskName
        self.config = config
        self.taskClass = taskClass
        self.label = label
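        # Build the task's connections instance from its configuration so
        # that the dataset types it consumes and produces can be inspected.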
        self.connections = config.connections.ConnectionsClass(config=config)

    def __str__(self):
        rep = "TaskDef(" + self.taskName
        if self.label:
            rep += ", label=" + self.label
        rep += ")"
        return rep


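# Example usage (an illustrative sketch, not part of the original file).
# ``mypackage.MyTask`` and ``myConfig`` are hypothetical; ``myConfig`` must
# be a config instance whose ``connections`` field provides
# ``ConnectionsClass``, since ``TaskDef.__init__`` constructs the
# connections from it:
#
#     taskDef = TaskDef("mypackage.MyTask", config=myConfig, label="isr")
#     print(taskDef)   # TaskDef(mypackage.MyTask, label=isr)

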
class Pipeline(list):
    """Pipeline is a sequence of `TaskDef` objects.

    A Pipeline is given as one of the inputs to a supervising framework,
    which builds an execution graph out of it. A Pipeline contains a
    sequence of `TaskDef` instances.

    The main purpose of this class is to provide a mechanism for passing a
    pipeline definition from users to the supervising framework. That
    mechanism is implemented using simple serialization and
    de-serialization via `pickle`. Note that pipeline serialization is not
    guaranteed to be compatible between different versions or releases.

    In the current implementation a Pipeline is a list (it inherits from
    `list`), so all list methods can be used on a pipeline. The content of
    the pipeline can be modified; it is up to the client to verify that
    modifications leave the pipeline in a consistent state. The container
    can be modified directly by adding or removing its elements.

    Parameters
    ----------
    iterable : iterable of `TaskDef` instances, optional
        Initial sequence of tasks.
    """
    def __init__(self, iterable=None):
        list.__init__(self, iterable or [])

    def labelIndex(self, label):
        """Return the index of a task given its label.

        Parameters
        ----------
        label : `str`
            Task label.

        Returns
        -------
        index : `int`
            Task index, or -1 if the label is not found.
        """
        for idx, taskDef in enumerate(self):
            if taskDef.label == label:
                return idx
        return -1

    def __str__(self):
        infos = [str(tdef) for tdef in self]
        return "Pipeline({})".format(", ".join(infos))


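# Example usage (an illustrative sketch, not part of the original file).
# ``taskDefA`` and ``taskDefB`` are hypothetical TaskDef instances whose
# labels are "a" and "b"; because Pipeline inherits from list, the usual
# list methods are available as well:
#
#     pipeline = Pipeline([taskDefA, taskDefB])
#     assert pipeline.labelIndex("b") == 1
#     assert pipeline.labelIndex("missing") == -1

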
@dataclass(frozen=True)
class TaskDatasetTypes:
    """An immutable struct that extracts and classifies the dataset types
    used by a `PipelineTask`.
    """

    initInputs: FrozenSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct this
    Task.

    Task-level `initInputs` may be classified as either
    `~PipelineDatasetTypes.initInputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    initOutputs: FrozenSet[DatasetType]
    """Dataset types that may be written after constructing this Task.

    Task-level `initOutputs` may be classified as either
    `~PipelineDatasetTypes.initOutputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    inputs: FrozenSet[DatasetType]
    """Dataset types that are regular inputs to this Task.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s) or produced by another Task in the Pipeline, that Quantum
    (and all dependent Quanta) will not be produced.

    Task-level `inputs` may be classified as either
    `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """

    prerequisites: FrozenSet[DatasetType]
    """Dataset types that are prerequisite inputs to this Task.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph; if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    outputs: FrozenSet[DatasetType]
    """Dataset types that are produced by this Task.

    Task-level `outputs` may be classified as either
    `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """

    @classmethod
    def fromConnections(cls, connectionsInstance: PipelineTaskConnections, *,
                        registry: Registry) -> TaskDatasetTypes:
        """Extract and classify the dataset types from a single
        `PipelineTask`.

        Parameters
        ----------
        connectionsInstance : `PipelineTaskConnections`
            An instance of a `PipelineTaskConnections` class for a
            particular `PipelineTask`.
        registry : `Registry`
            Registry used to construct normalized `DatasetType` objects and
            retrieve those that are incomplete.

        Returns
        -------
        types : `TaskDatasetTypes`
            The dataset types used by this task.
        """
        def makeDatasetTypesSet(connectionType):
            """Construct a set of true `DatasetType` objects.

            Parameters
            ----------
            connectionType : `str`
                Name of the connection type to produce a set for;
                corresponds to an attribute of type `list` on the
                connection class instance.

            Returns
            -------
            datasetTypes : `frozenset`
                A set of all the dataset types which correspond to the
                given connection type in the connection class of this
                `PipelineTask`.

            Notes
            -----
            This function is a closure over the variables ``registry`` and
            ``connectionsInstance``.
            """
            datasetTypes = []
            for c in iterConnections(connectionsInstance, connectionType):
                dimensions = set(getattr(c, 'dimensions', set()))
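                # "skypix" is a placeholder for a concrete skypix dimension
                # that only the registry knows, so the dataset type must
                # already be registered; look it up and verify that its
                # non-skypix dimensions match the ones declared here.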
                if "skypix" in dimensions:
                    try:
                        datasetType = registry.getDatasetType(c.name)
                    except LookupError as err:
                        raise LookupError(
                            f"DatasetType '{c.name}' referenced by "
                            f"{type(connectionsInstance).__name__} uses 'skypix' as a dimension "
                            f"placeholder, but does not already exist in the registry. "
                            f"Note that reference catalog names are now used as the dataset "
                            f"type name instead of 'ref_cat'."
                        ) from err
                    rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names)
                    rest2 = set(dim.name for dim in datasetType.dimensions
                                if not isinstance(dim, SkyPixDimension))
                    if rest1 != rest2:
                        raise ValueError(f"Non-skypix dimensions for dataset type {c.name} declared in "
                                         f"connections ({rest1}) are inconsistent with those in "
                                         f"registry's version of this dataset ({rest2}).")
                else:
                    datasetType = DatasetType(c.name, registry.dimensions.extract(dimensions),
                                              c.storageClass)
                datasetTypes.append(datasetType)
            return frozenset(datasetTypes)

        return cls(
            initInputs=makeDatasetTypesSet("initInputs"),
            initOutputs=makeDatasetTypesSet("initOutputs"),
            inputs=makeDatasetTypesSet("inputs"),
            prerequisites=makeDatasetTypesSet("prerequisiteInputs"),
            outputs=makeDatasetTypesSet("outputs"),
        )


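# Example usage (an illustrative sketch, not part of the original file).
# ``connections`` is a hypothetical PipelineTaskConnections instance and
# ``registry`` a hypothetical lsst.daf.butler.Registry:
#
#     types = TaskDatasetTypes.fromConnections(connections, registry=registry)
#     for dsType in types.inputs:
#         print(dsType.name, dsType.dimensions)

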
@dataclass(frozen=True)
class PipelineDatasetTypes:
    """An immutable struct that classifies the dataset types used in a
    `Pipeline`.
    """

    initInputs: FrozenSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct the
    Tasks in this Pipeline.

    This does not include dataset types that are produced when constructing
    other Tasks in the Pipeline (these are classified as
    `initIntermediates`).
    """

    initOutputs: FrozenSet[DatasetType]
    """Dataset types that may be written after constructing the Tasks in
    this Pipeline.

    This does not include dataset types that are also used as inputs when
    constructing other Tasks in the Pipeline (these are classified as
    `initIntermediates`).
    """

    initIntermediates: FrozenSet[DatasetType]
    """Dataset types that are both used when constructing one or more Tasks
    in the Pipeline and produced as a side-effect of constructing another
    Task in the Pipeline.
    """

    inputs: FrozenSet[DatasetType]
    """Dataset types that are regular inputs for the full pipeline.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s), that Quantum (and all dependent Quanta) will not be
    produced.
    """

    prerequisites: FrozenSet[DatasetType]
    """Dataset types that are prerequisite inputs for the full Pipeline.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph; if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    intermediates: FrozenSet[DatasetType]
    """Dataset types that are output by one Task in the Pipeline and
    consumed as inputs by one or more other Tasks in the Pipeline.
    """

    outputs: FrozenSet[DatasetType]
    """Dataset types that are output by a Task in the Pipeline and not
    consumed by any other Task in the Pipeline.
    """

    byTask: Mapping[str, TaskDatasetTypes]
    """Per-Task dataset types, keyed by label in the `Pipeline`.

    This is guaranteed to be zip-iterable with the `Pipeline` itself
    (assuming neither has been modified since the dataset types were
    extracted, of course).
    """

    @classmethod
    def fromPipeline(cls, pipeline: Pipeline, *, registry: Registry) -> PipelineDatasetTypes:
        """Extract and classify the dataset types from all tasks in a
        `Pipeline`.

        Parameters
        ----------
        pipeline : `Pipeline`
            An ordered collection of tasks that can be run together.
        registry : `Registry`
            Registry used to construct normalized `DatasetType` objects and
            retrieve those that are incomplete.

        Returns
        -------
        types : `PipelineDatasetTypes`
            The dataset types used by this `Pipeline`.

        Raises
        ------
        ValueError
            Raised if Tasks are inconsistent about which datasets are marked
            prerequisite. This indicates that the Tasks cannot be run as
            part of the same `Pipeline`.
        """
        allInputs = set()
        allOutputs = set()
        allInitInputs = set()
        allInitOutputs = set()
        prerequisites = set()
        byTask = dict()
        for taskDef in pipeline:
            thisTask = TaskDatasetTypes.fromConnections(taskDef.connections, registry=registry)
            allInitInputs.update(thisTask.initInputs)
            allInitOutputs.update(thisTask.initOutputs)
            allInputs.update(thisTask.inputs)
            prerequisites.update(thisTask.prerequisites)
            allOutputs.update(thisTask.outputs)
            byTask[taskDef.label] = thisTask
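        # A dataset type may not be both a prerequisite and a regular input
        # or output anywhere in the Pipeline; tasks that disagree about this
        # cannot be run together.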
        if not prerequisites.isdisjoint(allInputs):
            raise ValueError("{} marked as both prerequisites and regular inputs".format(
                {dt.name for dt in allInputs & prerequisites}
            ))
        if not prerequisites.isdisjoint(allOutputs):
            raise ValueError("{} marked as both prerequisites and outputs".format(
                {dt.name for dt in allOutputs & prerequisites}
            ))
        # Make sure that components which are marked as inputs get treated as
        # intermediates if there is an output which produces the composite
        # containing the component.
        intermediateComponents = set()
        intermediateComposites = set()
        outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
        for dsType in allInputs:
            # Get the name of a possible component.
            name, component = dsType.nameAndComponent()
            # If there is a component name, this is a component DatasetType;
            # if there is also an output which produces the parent of this
            # component, treat this input as an intermediate.
            if component is not None:
                if name in outputNameMapping and outputNameMapping[name].dimensions == dsType.dimensions:
                    composite = DatasetType(name, dsType.dimensions, outputNameMapping[name].storageClass,
                                            universe=registry.dimensions)
                    intermediateComponents.add(dsType)
                    intermediateComposites.add(composite)
        return cls(
            initInputs=frozenset(allInitInputs - allInitOutputs),
            initIntermediates=frozenset(allInitInputs & allInitOutputs),
            initOutputs=frozenset(allInitOutputs - allInitInputs),
            inputs=frozenset(allInputs - allOutputs - intermediateComponents),
            intermediates=frozenset(allInputs & allOutputs | intermediateComponents),
            outputs=frozenset(allOutputs - allInputs - intermediateComposites),
            prerequisites=frozenset(prerequisites),
            byTask=MappingProxyType(byTask),  # MappingProxyType -> frozen view of dict for immutability
        )
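

# Worked example (illustrative, not part of the original file) of the set
# algebra used in ``fromPipeline`` above, with plain strings standing in for
# DatasetType objects. Suppose task A reads "raw" and writes "calexp", and
# task B reads "calexp" and writes "coadd":
#
#     allInputs = {"raw", "calexp"}
#     allOutputs = {"calexp", "coadd"}
#     allInputs - allOutputs    # pipeline inputs:        {"raw"}
#     allInputs & allOutputs    # pipeline intermediates: {"calexp"}
#     allOutputs - allInputs    # pipeline outputs:       {"coadd"}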