# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module defining connection types to be used within a
`PipelineTaskConnections` class.
"""

__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput",
           "Output", "BaseConnection"]

import dataclasses
import typing
from typing import Callable, Iterable, Optional

from lsst.daf.butler import (
    CollectionSearch,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionUniverse,
    Registry,
    StorageClass,
)


@dataclasses.dataclass(frozen=True)
class BaseConnection:
    """Base class used for declaring `PipelineTask` connections.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
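
    Examples
    --------
    A minimal sketch of declaring connections in a `PipelineTaskConnections`
    subclass (the class name and dataset type names here are illustrative)::

        class ExampleConnections(PipelineTaskConnections,
                                 dimensions=("tract", "patch", "band")):
            coadd = Input(doc="Coadded exposure to process",
                          name="deepCoadd_calexp",
                          storageClass="ExposureF",
                          dimensions=("tract", "patch", "band"))
            catalog = Output(doc="Measured source catalog",
                             name="exampleCoaddSources",
                             storageClass="SourceCatalog",
                             dimensions=("tract", "patch", "band"))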
62 """
63 name: str
64 storageClass: str
65 doc: str = ""
66 multiple: bool = False

    def __get__(self, inst, klass):
        """Descriptor method.

        This method turns a connection into a descriptor. When a connection
        is added to a connection class, it is a class-level variable. This
        method makes accessing this connection, on an instance of the
        connection class owning this connection, return a result specialized
        for that instance. In the case of connections this specifically means
        names specified in a config instance will be visible instead of the
        default names for the connection.
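
        Examples
        --------
        A sketch of the effect, assuming the hypothetical
        ``ExampleConnections`` class from the class docstring above and a
        matching ``config`` instance::

            config.connections.coadd = "goodSeeingCoadd"
            connections = ExampleConnections(config=config)
            # The class attribute still carries the default name ...
            assert ExampleConnections.coadd.name == "deepCoadd_calexp"
            # ... while instance access applies the config override.
            assert connections.coadd.name == "goodSeeingCoadd"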
78 """
79 # If inst is None, this is being accessed by the class and not an
80 # instance, return this connection itself
81 if inst is None:
82 return self
83 # If no object cache exists, create one to track the instances this
84 # connection has been accessed by
85 if not hasattr(inst, '_connectionCache'):
86 object.__setattr__(inst, '_connectionCache', {})
87 # Look up an existing cached instance
88 idSelf = id(self)
89 if idSelf in inst._connectionCache:
90 return inst._connectionCache[idSelf]
91 # Accumulate the parameters that define this connection
92 params = {}
93 for field in dataclasses.fields(self):
94 params[field.name] = getattr(self, field.name)
95 # Get the name override defined by the instance of the connection class
96 params['name'] = inst._nameOverrides[self.varName]
97 # Return a new instance of this connection specialized with the
98 # information provided by the connection class instance
99 return inst._connectionCache.setdefault(idSelf, self.__class__(**params))

    def makeDatasetType(self, universe: DimensionUniverse,
                        parentStorageClass: Optional[StorageClass] = None):
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
        """
        return DatasetType(self.name,
                           universe.empty,
                           self.storageClass,
                           parentStorageClass=parentStorageClass)


@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring `PipelineTask` connections that include
    dimensions.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type with the specified name.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False` (default)
        otherwise.
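
    Examples
    --------
    A sketch of a connection whose dataset type carries dimensions and may
    live in CALIBRATION-type collections (the names are illustrative)::

        flat = PrerequisiteInput(doc="Flat field frame to apply",
                                 name="flat",
                                 storageClass="ExposureF",
                                 dimensions=("instrument", "detector",
                                             "physical_filter"),
                                 isCalibration=True)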
150 """
151 dimensions: typing.Iterable[str] = ()
152 isCalibration: bool = False
154 def __post_init__(self):
155 if isinstance(self.dimensions, str): 155 ↛ 156line 155 didn't jump to line 156, because the condition on line 155 was never true
156 raise TypeError("Dimensions must be iterable of dimensions, got str,"
157 "possibly omitted trailing comma")
158 if not isinstance(self.dimensions, typing.Iterable): 158 ↛ 159line 158 didn't jump to line 159, because the condition on line 158 was never true
159 raise TypeError("Dimensions must be iterable of dimensions")

    def makeDatasetType(self, universe: DimensionUniverse,
                        parentStorageClass: Optional[StorageClass] = None):
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
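
        Examples
        --------
        A sketch of typical usage, assuming an `lsst.daf.butler.Butler`
        instance named ``butler`` and an illustrative dataset type name::

            connection = Output(doc="Example catalog",
                                name="exampleCatalog",
                                storageClass="SourceCatalog",
                                dimensions=("visit", "detector"))
            datasetType = connection.makeDatasetType(butler.registry.dimensions)
            butler.registry.registerDatasetType(datasetType)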
177 """
178 return DatasetType(self.name,
179 universe.extract(self.dimensions),
180 self.storageClass, isCalibration=self.isCalibration,
181 parentStorageClass=parentStorageClass)


@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Class used for declaring `PipelineTask` input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type with the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        handle to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero for a regular `Input` connection; the latter
        is not currently supported by our QuantumGraph generation algorithm.
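
    Examples
    --------
    A sketch of a deferred-load input that accepts multiple datasets per
    quantum (the connection and dataset type names are illustrative)::

        catalogs = Input(doc="Per-visit source tables to aggregate",
                         name="sourceTable_visit",
                         storageClass="DataFrame",
                         dimensions=("instrument", "visit"),
                         multiple=True,
                         deferLoad=True,
                         minimum=1)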
228 """
229 deferLoad: bool = False
230 minimum: int = 1
232 def __post_init__(self) -> None:
233 super().__post_init__()
234 if self.minimum > 1 and not self.multiple: 234 ↛ 235line 234 didn't jump to line 235, because the condition on line 234 was never true
235 raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")


@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
    """Class used for declaring regular `PipelineTask` input connections;
    unlike `PrerequisiteInput`, these may be produced by other tasks in the
    same pipeline. See `BaseInput` for the accepted parameters.
    """

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum == 0:
            raise TypeError(f"Cannot set minimum={self.minimum} for regular input.")


@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
    """Class used for declaring `PipelineTask` prerequisite input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type with the specified name.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises
        `FileNotFoundError` (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they are
        compatible with this minimum.
    lookupFunction : `typing.Callable`, optional
        An optional callable function that will look up PrerequisiteInputs
        using the DatasetType, registry, quantum dataId, and input collections
        passed to it. If no function is specified, the default temporal-spatial
        lookup will be used.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.

    Notes
    -----
    Prerequisite inputs are used for datasets that must exist in the data
    repository before a pipeline including this task is run; they cannot be
    produced by another task in the same pipeline.

    In exchange for this limitation, they have a number of advantages relative
    to regular `Input` connections:

    - The query used to find them during `QuantumGraph` generation can be
      fully customized by providing a ``lookupFunction``.
    - Failed searches for prerequisites during `QuantumGraph` generation will
      usually generate more helpful diagnostics than those for regular `Input`
      connections.
    - The default query for prerequisite inputs relates the quantum dimensions
      directly to the dimensions of its dataset type, without being
      constrained by any of the other dimensions in the pipeline. This allows
      them to be used for temporal calibration lookups (which regular `Input`
      connections cannot do at present) and to work around `QuantumGraph`
      generation limitations involving cases where naive spatial overlap
      relationships between dimensions are not desired (e.g. a task that wants
      all detectors in each visit for which the visit overlaps a tract, not
      just those where that detector+visit combination overlaps the tract).
    - Prerequisite inputs may be optional (regular inputs are never optional).
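
    Examples
    --------
    A minimal sketch of a custom ``lookupFunction`` matching the expected
    signature; the single-dataset query and all names are illustrative::

        def lookupRawHeader(datasetType: DatasetType, registry: Registry,
                            dataId: DataCoordinate,
                            collections: CollectionSearch
                            ) -> Iterable[DatasetRef]:
            # Ignore everything in the quantum data ID except the
            # instrument, and return at most one reference.
            ref = registry.findDataset(datasetType,
                                       instrument=dataId["instrument"],
                                       collections=collections)
            return [ref] if ref is not None else []

        header = PrerequisiteInput(doc="Header to use",
                                   name="raw_header",
                                   storageClass="PropertyList",
                                   dimensions=("instrument",),
                                   lookupFunction=lookupRawHeader)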
313 """
314 lookupFunction: Optional[Callable[[DatasetType, Registry, DataCoordinate, CollectionSearch],
315 Iterable[DatasetRef]]] = None


@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    """Class used for declaring `PipelineTask` output connections."""


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    """Class used for declaring `PipelineTask` init-input connections; these
    datasets are loaded when the task is constructed rather than per quantum,
    and hence carry no dimensions.
    """


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    """Class used for declaring `PipelineTask` init-output connections; these
    datasets are written when the task is constructed rather than per quantum,
    and hence carry no dimensions.
    """