# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module defining connection types to be used within a
`PipelineTaskConnections` class.
"""

__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"]

import dataclasses
import typing
from typing import Callable, Iterable, Optional

from lsst.daf.butler import (
    CollectionSearch,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionUniverse,
    Registry,
    StorageClass,
)


@dataclasses.dataclass(frozen=True)
class BaseConnection:
    """Base class used for declaring PipelineTask connections.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`
        Documentation for this connection.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
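
    Examples
    --------
    Connections are declared as class attributes of a
    `PipelineTaskConnections` subclass, and accessing one on an instance
    of that subclass returns a copy specialized with any name override
    from the task configuration (see `__get__`). A minimal sketch, using
    illustrative dataset type and storage class names::

        class ExampleConnections(
            PipelineTaskConnections, dimensions=("instrument", "visit", "detector")
        ):
            calexp = Input(
                name="calexp",
                storageClass="ExposureF",
                doc="Calibrated exposure to process.",
                dimensions=("instrument", "visit", "detector"),
            )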
61 """
63 name: str
64 storageClass: str
65 doc: str = ""
66 multiple: bool = False

    def __get__(self, inst, klass):
        """Descriptor method.

        This method turns a connection into a descriptor. When a connection
        is added to a connection class, it is a class-level variable. This
        method makes accessing the connection on an instance of the
        connection class return a result specialized for that instance. For
        connections this specifically means that names specified in a config
        instance are visible instead of the default names for the connection.
        """
        # If inst is None, this is being accessed by the class and not an
        # instance; return this connection itself.
        if inst is None:
            return self
        # If no object cache exists, create one to track the instances this
        # connection has been accessed by.
        if not hasattr(inst, "_connectionCache"):
            object.__setattr__(inst, "_connectionCache", {})
        # Look up an existing cached instance.
        idSelf = id(self)
        if idSelf in inst._connectionCache:
            return inst._connectionCache[idSelf]
        # Accumulate the parameters that define this connection.
        params = {}
        for field in dataclasses.fields(self):
            params[field.name] = getattr(self, field.name)
        # Get the name override defined by the instance of the connection
        # class.
        params["name"] = inst._nameOverrides[self.varName]
        # Return a new instance of this connection specialized with the
        # information provided by the connection class instance.
        return inst._connectionCache.setdefault(idSelf, self.__class__(**params))

    def makeDatasetType(self, universe: DimensionUniverse, parentStorageClass: Optional[StorageClass] = None):
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass
        )


@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring PipelineTask connections that include
    dimensions.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type with the specified name.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False` (default)
        otherwise.
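
    Examples
    --------
    ``dimensions`` must be an iterable of `str`, not a single `str`; a
    common mistake, caught in ``__post_init__``, is omitting the trailing
    comma in a single-element tuple::

        dimensions=("visit")   # wrong: this is just the str "visit"
        dimensions=("visit",)  # correct: a one-element tuple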
148 """
150 dimensions: typing.Iterable[str] = ()
151 isCalibration: bool = False
153 def __post_init__(self):
154 if isinstance(self.dimensions, str): 154 ↛ 155line 154 didn't jump to line 155, because the condition on line 154 was never true
155 raise TypeError(
156 "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma"
157 )
158 if not isinstance(self.dimensions, typing.Iterable): 158 ↛ 159line 158 didn't jump to line 159, because the condition on line 158 was never true
159 raise TypeError("Dimensions must be iterable of dimensions")

    def makeDatasetType(self, universe: DimensionUniverse, parentStorageClass: Optional[StorageClass] = None):
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
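
        Examples
        --------
        A minimal sketch, assuming the default dimension universe can be
        constructed directly; the dataset type and storage class names are
        illustrative::

            from lsst.daf.butler import DimensionUniverse

            connection = Input(
                name="calexp",
                storageClass="ExposureF",
                dimensions=("instrument", "visit", "detector"),
            )
            datasetType = connection.makeDatasetType(DimensionUniverse())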
176 """
177 return DatasetType(
178 self.name,
179 universe.extract(self.dimensions),
180 self.storageClass,
181 isCalibration=self.isCalibration,
182 parentStorageClass=parentStorageClass,
183 )


@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Class used for declaring PipelineTask input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type with the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        handle to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero for a regular `Input` connection (the latter is
        not currently supported by our QuantumGraph generation algorithm).
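
    Examples
    --------
    A sketch of an input connection that accepts several datasets per
    quantum and defers loading them, using illustrative dataset type and
    storage class names::

        inputCatalogs = Input(
            name="src",
            storageClass="SourceCatalog",
            doc="Per-detector source catalogs for the visit.",
            dimensions=("instrument", "visit", "detector"),
            multiple=True,
            deferLoad=True,
            minimum=1,
        )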
230 """
232 deferLoad: bool = False
233 minimum: int = 1
235 def __post_init__(self) -> None:
236 super().__post_init__()
237 if self.minimum > 1 and not self.multiple: 237 ↛ 238line 237 didn't jump to line 238, because the condition on line 237 was never true
238 raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")
241@dataclasses.dataclass(frozen=True)
242class Input(BaseInput):
243 def __post_init__(self) -> None:
244 super().__post_init__()
245 if self.minimum == 0: 245 ↛ 246line 245 didn't jump to line 246, because the condition on line 245 was never true
246 raise TypeError(f"Cannot set minimum={self.minimum} for regular input.")


@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
    """Class used for declaring PipelineTask prerequisite connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type with the specified name.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises
        `FileNotFoundError` (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they are
        compatible with this minimum.
    lookupFunction : `typing.Callable`, optional
        An optional callable function that will look up PrerequisiteInputs
        using the DatasetType, registry, quantum data ID, and input
        collections passed to it. If no function is specified, the default
        temporal-spatial lookup will be used.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.

    Notes
    -----
    Prerequisite inputs are used for datasets that must exist in the data
    repository before a pipeline including this task is run; they cannot be
    produced by another task in the same pipeline.

    In exchange for this limitation, they have a number of advantages relative
    to regular `Input` connections:

    - The query used to find them during `QuantumGraph` generation can be
      fully customized by providing a ``lookupFunction`` (see the example
      below).
    - Failed searches for prerequisites during `QuantumGraph` generation will
      usually generate more helpful diagnostics than those for regular `Input`
      connections.
    - The default query for prerequisite inputs relates the quantum dimensions
      directly to the dimensions of its dataset type, without being
      constrained by any of the other dimensions in the pipeline. This allows
      them to be used for temporal calibration lookups (which regular `Input`
      connections cannot do at present) and to work around `QuantumGraph`
      generation limitations involving cases where naive spatial overlap
      relationships between dimensions are not desired (e.g. a task that wants
      all detectors in each visit for which the visit overlaps a tract, not
      just those where that detector+visit combination overlaps the tract).
    - Prerequisite inputs may be optional (regular inputs are never optional).
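
    Examples
    --------
    A sketch of a custom ``lookupFunction`` with the expected signature.
    The body shown merely delegates to a registry query; a real
    implementation would customize the search. The function name, dataset
    type name, and storage class are illustrative::

        def lookupByDataId(datasetType, registry, quantumDataId, collections):
            # Find all datasets of this type in the input collections whose
            # data IDs are compatible with the quantum data ID.
            return registry.queryDatasets(
                datasetType, collections=collections, dataId=quantumDataId
            )

        refCat = PrerequisiteInput(
            name="ref_cat",
            storageClass="SimpleCatalog",
            doc="Reference catalog shards overlapping the quantum region.",
            dimensions=("skypix",),
            multiple=True,
            lookupFunction=lookupByDataId,
        )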
315 """
317 lookupFunction: Optional[
318 Callable[[DatasetType, Registry, DataCoordinate, CollectionSearch], Iterable[DatasetRef]]
319 ] = None
322@dataclasses.dataclass(frozen=True)
323class Output(DimensionedConnection):
324 pass
327@dataclasses.dataclass(frozen=True)
328class InitInput(BaseConnection):
329 pass
332@dataclasses.dataclass(frozen=True)
333class InitOutput(BaseConnection):
334 pass