Coverage for python/lsst/pipe/base/connectionTypes.py: 69%
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Module defining connection types to be used within a
23`PipelineTaskConnections` class.
24"""
26__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"]
28import dataclasses
29import typing
30from typing import Callable, Iterable, Optional, Union
32from lsst.daf.butler import (
33 CollectionSearch,
34 DataCoordinate,
35 DatasetRef,
36 DatasetType,
37 DimensionUniverse,
38 Registry,
39 StorageClass,
40)


@dataclasses.dataclass(frozen=True)
class BaseConnection:
    """Base class used for declaring `PipelineTask` connections.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`
        Documentation for this connection.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
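
    Examples
    --------
    A minimal sketch of declaring a connection inside a
    `PipelineTaskConnections` subclass; the class and dataset type names here
    are hypothetical, for illustration only::

        class MyConnections(PipelineTaskConnections, dimensions=("visit",)):
            exposures = Input(
                doc="Input exposures to combine.",
                name="calexp",
                storageClass="ExposureF",
                dimensions=("instrument", "visit", "detector"),
                multiple=True,
            )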
61 """

    name: str
    storageClass: str
    doc: str = ""
    multiple: bool = False

    def __get__(self, inst, klass):
        """Descriptor method.

        This method turns a connection into a descriptor. When a connection
        is added to a connection class, it is a class-level variable. This
        method makes accessing the connection on an *instance* of the
        connection class return a result specialized for that instance; for
        connections this specifically means that names specified in a config
        instance will be visible instead of the default names for the
        connection.
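
        For example, with a hypothetical ``MyConnections`` class declaring an
        ``exposures`` connection (a sketch, not runnable as-is)::

            MyConnections.exposures.name      # default name from the declaration
            connections = MyConnections(config=config)
            connections.exposures.name        # name as overridden in ``config``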
78 """
        # If inst is None, this is being accessed by the class and not an
        # instance; return this connection itself.
        if inst is None:
            return self
        # If no object cache exists, create one to track the instances this
        # connection has been accessed by.
        if not hasattr(inst, "_connectionCache"):
            object.__setattr__(inst, "_connectionCache", {})
        # Look up an existing cached instance.
        idSelf = id(self)
        if idSelf in inst._connectionCache:
            return inst._connectionCache[idSelf]
        # Accumulate the parameters that define this connection.
        params = {}
        for field in dataclasses.fields(self):
            params[field.name] = getattr(self, field.name)
        # Get the name override defined by the instance of the connection
        # class.
        params["name"] = inst._nameOverrides[self.varName]
        # Return a new instance of this connection specialized with the
        # information provided by the connection class instance.
        return inst._connectionCache.setdefault(idSelf, self.__class__(**params))

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: Optional[Union[StorageClass, str]] = None
    ) -> DatasetType:
104 """Construct a true `DatasetType` instance with normalized dimensions.
106 Parameters
107 ----------
108 universe : `lsst.daf.butler.DimensionUniverse`
109 Set of all known dimensions to be used to normalize the dimension
110 names specified in config.
111 parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
112 Parent storage class for component datasets; `None` otherwise.
114 Returns
115 -------
116 datasetType : `DatasetType`
117 The `DatasetType` defined by this connection.
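
        Examples
        --------
        A minimal sketch, assuming a `~lsst.daf.butler.Butler` instance named
        ``butler`` and a connection instance named ``connection`` (both
        hypothetical)::

            datasetType = connection.makeDatasetType(butler.registry.dimensions)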
118 """
119 return DatasetType(
120 self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass
121 )


@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring `PipelineTask` connections that include
    dimensions.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type with the specified name.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False` (default)
        otherwise.
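
    Examples
    --------
    ``dimensions`` must be a true iterable of dimension names, not a bare
    string; note the trailing comma needed for a one-element tuple, since
    ``("visit")`` is just a `str` and is rejected in ``__post_init__``::

        dimensions=("instrument", "visit", "detector")
        dimensions=("tract",)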
150 """

    dimensions: typing.Iterable[str] = ()
    isCalibration: bool = False

    def __post_init__(self):
        # A bare str is iterable, so catch it explicitly; it usually means a
        # one-element tuple was written without its trailing comma.
        if isinstance(self.dimensions, str):
            raise TypeError(
                "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma"
            )
        if not isinstance(self.dimensions, typing.Iterable):
            raise TypeError("Dimensions must be iterable of dimensions")

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: Optional[Union[StorageClass, str]] = None
    ) -> DatasetType:
166 """Construct a true `DatasetType` instance with normalized dimensions.
168 Parameters
169 ----------
170 universe : `lsst.daf.butler.DimensionUniverse`
171 Set of all known dimensions to be used to normalize the dimension
172 names specified in config.
173 parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
174 Parent storage class for component datasets; `None` otherwise.
176 Returns
177 -------
178 datasetType : `DatasetType`
179 The `DatasetType` defined by this connection.
180 """
181 return DatasetType(
182 self.name,
183 universe.extract(self.dimensions),
184 self.storageClass,
185 isCalibration=self.isCalibration,
186 parentStorageClass=parentStorageClass,
187 )


@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Class used for declaring `PipelineTask` input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type with the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero for a regular `Input` connection; the latter
        is not currently supported by our QuantumGraph generation algorithm.
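
    Examples
    --------
    A minimal sketch of a deferred-load input; the connection and dataset
    type names here are hypothetical::

        backgrounds = Input(
            doc="Per-detector background models; at least one is required.",
            name="calexpBackground",
            storageClass="Background",
            dimensions=("instrument", "visit", "detector"),
            multiple=True,
            minimum=1,
            deferLoad=True,
        )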
234 """

    deferLoad: bool = False
    minimum: int = 1

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum > 1 and not self.multiple:
            raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")


@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum == 0:
            raise TypeError(f"Cannot set minimum={self.minimum} for regular input.")


@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
    """Class used for declaring `PipelineTask` prerequisite connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type with the specified name.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises
        `FileNotFoundError` (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they are
        compatible with this minimum.
    lookupFunction : `typing.Callable`, optional
        An optional callable that will look up prerequisite inputs, given the
        `DatasetType`, registry, quantum data ID, and input collections passed
        to it. If no function is specified, the default spatial/temporal
        lookup will be used (see the example below).

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.

    Notes
    -----
    Prerequisite inputs are used for datasets that must exist in the data
    repository before a pipeline including this task is run; they cannot be
    produced by another task in the same pipeline.

    In exchange for this limitation, they have a number of advantages relative
    to regular `Input` connections:

    - The query used to find them during `QuantumGraph` generation can be
      fully customized by providing a ``lookupFunction``.
    - Failed searches for prerequisites during `QuantumGraph` generation will
      usually generate more helpful diagnostics than those for regular `Input`
      connections.
    - The default query for prerequisite inputs relates the quantum dimensions
      directly to the dimensions of its dataset type, without being
      constrained by any of the other dimensions in the pipeline. This allows
      them to be used for temporal calibration lookups (which regular `Input`
      connections cannot do at present) and to work around `QuantumGraph`
      generation limitations involving cases where naive spatial overlap
      relationships between dimensions are not desired (e.g. a task that wants
      all detectors in each visit for which the visit overlaps a tract, not
      just those where that detector+visit combination overlaps the tract).
    - Prerequisite inputs may be optional (regular inputs are never optional).
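
    Examples
    --------
    A minimal sketch of a custom ``lookupFunction``; the function, connection,
    and dataset type names here are hypothetical, and the query is only one
    plausible strategy::

        def lookupSkyMap(datasetType, registry, quantumDataId, collections):
            # Match on the "skymap" dimension alone, ignoring the quantum's
            # tract/patch dimensions.
            dataId = quantumDataId.subset(registry.dimensions.extract(["skymap"]))
            return registry.queryDatasets(
                datasetType, collections=collections, dataId=dataId, findFirst=True
            )

        skyMap = PrerequisiteInput(
            doc="Definition of the tract/patch geometry.",
            name="skyMap",
            storageClass="SkyMap",
            dimensions=("skymap",),
            lookupFunction=lookupSkyMap,
        )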
319 """

    lookupFunction: Optional[
        Callable[[DatasetType, Registry, DataCoordinate, CollectionSearch], Iterable[DatasetRef]]
    ] = None


@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    pass


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    pass


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    pass