Coverage for python/lsst/pipe/base/connectionTypes.py: 69% (65 statements)
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module defining connection types to be used within a
`PipelineTaskConnections` class.
"""

__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"]

import dataclasses
import typing
from collections.abc import Callable, Iterable, Sequence
from typing import Optional, Union

from lsst.daf.butler import DataCoordinate, DatasetRef, DatasetType, DimensionUniverse, Registry, StorageClass


@dataclasses.dataclass(frozen=True)
class BaseConnection:
    """Base class used for declaring PipelineTask connections.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`
        A description of the dataset type for this connection.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    """

    name: str
    storageClass: str
    doc: str = ""
    multiple: bool = False

    def __get__(self, inst, klass):
        """Descriptor method.

        This is a method used to turn a connection into a descriptor.
        When a connection is added to a connection class, it is a class-level
        variable. This method makes accessing this connection, on the
        instance of the connection class owning this connection, return a
        result specialized for that instance. In the case of connections
        this specifically means names specified in a config instance will
        be visible instead of the default names for the connection.
        """
        # If inst is None, this is being accessed by the class and not an
        # instance; return this connection itself.
        if inst is None:
            return self
        # If no object cache exists, create one to track the instances this
        # connection has been accessed by.
        if not hasattr(inst, "_connectionCache"):
            object.__setattr__(inst, "_connectionCache", {})
        # Look up an existing cached instance.
        idSelf = id(self)
        if idSelf in inst._connectionCache:
            return inst._connectionCache[idSelf]
        # Accumulate the parameters that define this connection.
        params = {}
        for field in dataclasses.fields(self):
            params[field.name] = getattr(self, field.name)
        # Get the name override defined by the instance of the connection class.
        params["name"] = inst._nameOverrides[self.varName]
        # Return a new instance of this connection specialized with the
        # information provided by the connection class instance.
        return inst._connectionCache.setdefault(idSelf, self.__class__(**params))
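
    # For illustration only (hypothetical names, not part of this module):
    # given a connections class with a class attribute
    # ``calexp = Input(name="calexp", ...)``, accessing
    # ``MyConnections.calexp`` returns the Input itself, while
    # ``MyConnections(config=config).calexp`` returns a per-instance copy
    # whose ``name`` reflects any config override such as
    # ``config.connections.calexp = "calexp_subset"``.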

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: Optional[Union[StorageClass, str]] = None
    ) -> DatasetType:
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass
        )


@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring PipelineTask connections that include
    dimensions.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Butler` `lsst.daf.butler.Registry` dimensions
        used to identify the dataset type with the specified name.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False` (default)
        otherwise.
    """

    dimensions: typing.Iterable[str] = ()
    isCalibration: bool = False

    def __post_init__(self):
        if isinstance(self.dimensions, str):
            raise TypeError(
                "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma"
            )
        if not isinstance(self.dimensions, typing.Iterable):
            raise TypeError("Dimensions must be iterable of dimensions")
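
    # For example (hypothetical values), the check above guards against the
    # common trailing-comma mistake:
    #
    #     dimensions=("visit")   # a plain str -- raises TypeError above
    #     dimensions=("visit",)  # a one-element tuple -- accepted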

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: Optional[Union[StorageClass, str]] = None
    ) -> DatasetType:
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name,
            universe.extract(self.dimensions),
            self.storageClass,
            isCalibration=self.isCalibration,
            parentStorageClass=parentStorageClass,
        )
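
    # A usage sketch for ``makeDatasetType`` (the ``butler`` and
    # ``connection`` names here are assumptions for illustration): the
    # `DimensionUniverse` comes from the butler's registry, and the resulting
    # dataset type can then be registered.
    #
    #     datasetType = connection.makeDatasetType(butler.registry.dimensions)
    #     butler.registry.registerDatasetType(datasetType)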


@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Class used for declaring PipelineTask input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Butler` `lsst.daf.butler.Registry` dimensions
        used to identify the dataset type with the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero for a regular `Input` connection; the latter
        is not currently supported by our QuantumGraph generation algorithm.
    """

    deferLoad: bool = False
    minimum: int = 1

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum > 1 and not self.multiple:
            raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")


@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum == 0:
            raise TypeError(f"Cannot set minimum={self.minimum} for regular input.")


@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
    """Class used for declaring PipelineTask prerequisite connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Butler` `lsst.daf.butler.Registry` dimensions
        used to identify the dataset type with the specified name.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises
        `FileNotFoundError` (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they are
        compatible with this minimum.
    lookupFunction : `typing.Callable`, optional
        An optional callable function that will look up PrerequisiteInputs
        using the DatasetType, registry, quantum dataId, and input collections
        passed to it. If no function is specified, the default temporal/spatial
        lookup will be used.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.

    Notes
    -----
    Prerequisite inputs are used for datasets that must exist in the data
    repository before a pipeline including this task is run; they cannot be
    produced by another task in the same pipeline.

    In exchange for this limitation, they have a number of advantages relative
    to regular `Input` connections:

    - The query used to find them during `QuantumGraph` generation can be
      fully customized by providing a ``lookupFunction``.
    - Failed searches for prerequisites during `QuantumGraph` generation will
      usually generate more helpful diagnostics than those for regular `Input`
      connections.
    - The default query for prerequisite inputs relates the quantum dimensions
      directly to the dimensions of its dataset type, without being constrained
      by any of the other dimensions in the pipeline. This allows them to be
      used for temporal calibration lookups (which regular `Input` connections
      cannot do at present) and to work around `QuantumGraph` generation
      limitations involving cases where naive spatial overlap relationships
      between dimensions are not desired (e.g. a task that wants all detectors
      in each visit for which the visit overlaps a tract, not just those where
      that detector+visit combination overlaps the tract).
    - Prerequisite inputs may be optional (regular inputs are never optional).
    """

    lookupFunction: Optional[
        Callable[[DatasetType, Registry, DataCoordinate, Sequence[str]], Iterable[DatasetRef]]
    ] = None
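
    # A minimal sketch of a custom ``lookupFunction`` (the function and
    # connection below are hypothetical, not part of this module). It must
    # match the signature above: it receives the dataset type, a `Registry`,
    # the quantum data ID, and the ordered input collections, and returns the
    # matching `DatasetRef` objects.
    #
    #     def customLookup(
    #         datasetType: DatasetType,
    #         registry: Registry,
    #         quantumDataId: DataCoordinate,
    #         collections: Sequence[str],
    #     ) -> Iterable[DatasetRef]:
    #         return registry.queryDatasets(
    #             datasetType, collections=collections, dataId=quantumDataId, findFirst=True
    #         )
    #
    #     externalCatalog = PrerequisiteInput(
    #         name="preexisting_catalog",
    #         storageClass="SourceCatalog",
    #         doc="Catalog produced by an earlier processing campaign.",
    #         dimensions=("instrument", "visit", "detector"),
    #         lookupFunction=customLookup,
    #     )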


@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    pass


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    pass


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    pass
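

# A hedged end-to-end sketch of how these connection types are typically used
# (the class, dataset type names, and storage classes below are assumptions
# for illustration; `PipelineTaskConnections` lives in `lsst.pipe.base`):
#
#     from lsst.pipe.base import PipelineTaskConnections
#     from lsst.pipe.base import connectionTypes as cT
#
#     class ExampleConnections(
#         PipelineTaskConnections, dimensions=("instrument", "visit", "detector")
#     ):
#         exposure = cT.Input(
#             name="calexp",
#             storageClass="ExposureF",
#             doc="Calibrated exposure to measure.",
#             dimensions=("instrument", "visit", "detector"),
#         )
#         schema = cT.InitInput(
#             name="src_schema",
#             storageClass="SourceCatalog",
#             doc="Schema for the input catalogs.",
#         )
#         catalog = cT.Output(
#             name="exampleCatalog",
#             storageClass="SourceCatalog",
#             doc="Measured output catalog.",
#             dimensions=("instrument", "visit", "detector"),
#         )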