# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module defining connection types to be used within a
`PipelineTaskConnections` class.
"""

__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"]

import dataclasses
from collections.abc import Callable, Iterable, Sequence
from typing import ClassVar

from lsst.daf.butler import DataCoordinate, DatasetRef, DatasetType, DimensionUniverse, Registry, StorageClass


@dataclasses.dataclass(frozen=True)
class BaseConnection:
    """Base class used for declaring PipelineTask connections.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`
        Documentation for this connection.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    """

    name: str
    storageClass: str
    doc: str = ""
    multiple: bool = False

    _connection_type_set: ClassVar[str]

    def __get__(self, inst, klass):
        """Descriptor access method.

        This method turns a connection into a descriptor. When a connection
        is added to a connection class, it is a class-level variable. This
        method makes accessing the connection on an instance of the
        connection class owning it return a result specialized for that
        instance. For connections this specifically means that names
        specified in a config instance will be visible instead of the default
        names for the connection, and that removed connections will not be
        accessible on the instance.
        """
        # If inst is None, this is being accessed by the class and not an
        # instance; return this connection itself.
        if inst is None:
            return self
        # Attempt to return the configured connection object from the
        # connections instance's allConnections mapping.
        try:
            return inst.allConnections[self.varName]
        except KeyError:
            raise AttributeError(
                f"Connection {self.varName!r} of {klass.__name__} has been removed."
            ) from None
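
    # A minimal sketch of the descriptor behavior, assuming a hypothetical
    # ``MyConnections`` class and a config whose ``connections`` sub-config
    # renames the dataset type; commented out, and all names are
    # illustrative:
    #
    #     class MyConnections(PipelineTaskConnections, dimensions=("visit",)):
    #         cat = Input(name="src", storageClass="SourceCatalog",
    #                     dimensions=("visit",), doc="Input catalog.")
    #
    #     MyConnections.cat          # class access returns the Input itself
    #     instance = MyConnections(config=config)
    #     instance.cat.name          # reflects any config override of "src"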

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None
    ) -> DatasetType:
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass
        )
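
# The base implementation above uses ``universe.empty``, so a plain
# `BaseConnection` (as used by `InitInput` and `InitOutput` below) always
# maps to a dataset type with no dimensions. A minimal sketch, assuming the
# default dimension universe can be constructed directly and using an
# illustrative dataset type name:
#
#     from lsst.daf.butler import DimensionUniverse
#
#     conn = InitOutput(name="someTask_schema", storageClass="SourceCatalog")
#     datasetType = conn.makeDatasetType(DimensionUniverse())
#     # datasetType.dimensions is empty.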


@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring PipelineTask connections that include
    dimensions.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Butler` `lsst.daf.butler.Registry` dimensions
        used to identify the dataset type with the given name.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False` (default)
        otherwise.
    """

    dimensions: Iterable[str] = ()
    isCalibration: bool = False

    def __post_init__(self):
        if isinstance(self.dimensions, str):
            raise TypeError(
                "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma"
            )
        if not isinstance(self.dimensions, Iterable):
            raise TypeError("Dimensions must be iterable of dimensions")

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None
    ) -> DatasetType:
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name,
            universe.extract(self.dimensions),
            self.storageClass,
            isCalibration=self.isCalibration,
            parentStorageClass=parentStorageClass,
        )
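
# A minimal sketch of ``makeDatasetType`` with dimensions, assuming the
# default dimension universe and illustrative dataset type and storage class
# names:
#
#     from lsst.daf.butler import DimensionUniverse
#
#     conn = DimensionedConnection(
#         name="calexp",
#         storageClass="ExposureF",
#         dimensions=("instrument", "visit", "detector"),
#     )
#     datasetType = conn.makeDatasetType(DimensionUniverse())
#     # datasetType.dimensions is now normalized against the universe.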


@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Class used for declaring PipelineTask input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Butler` `lsst.daf.butler.Registry` dimensions
        used to identify the dataset type with the given name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        handle to load the dataset at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero for a regular `Input` connection; the latter is
        not currently supported by our QuantumGraph generation algorithm.
    """

    deferLoad: bool = False
    minimum: int = 1

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum > 1 and not self.multiple:
            raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")


@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
    """Class used for declaring PipelineTask input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Butler` `lsst.daf.butler.Registry` dimensions
        used to identify the dataset type with the given name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        handle to load the dataset at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.
    deferGraphConstraint : `bool`, optional
        If `True`, do not include this dataset type's existence in the initial
        query that starts the QuantumGraph generation process. This can be
        used to make QuantumGraph generation faster by avoiding redundant
        datasets, and in certain cases it can (along with careful attention to
        which tasks are included in the same QuantumGraph) be used to work
        around the QuantumGraph generation algorithm's inflexible handling of
        spatial overlaps. This option has no effect when the connection is not
        an overall input of the pipeline (or subset thereof) for which a graph
        is being created, and it never affects the ordering of quanta.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero for a regular `Input` connection; the latter is
        not currently supported by our QuantumGraph generation algorithm.
    """

    deferGraphConstraint: bool = False

    _connection_type_set: ClassVar[str] = "inputs"

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum == 0:
            raise TypeError(f"Cannot set minimum={self.minimum} for regular input.")
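
# A sketch of an `Input` declaration exercising the options documented above;
# the dataset type and storage class names are illustrative:
#
#     backgrounds = Input(
#         doc="Per-detector background models.",
#         name="calexpBackground",
#         storageClass="Background",
#         dimensions=("instrument", "visit", "detector"),
#         multiple=True,
#         minimum=1,
#         deferLoad=True,
#         deferGraphConstraint=False,
#     )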


@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
    """Class used for declaring PipelineTask prerequisite connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Butler` `lsst.daf.butler.Registry` dimensions
        used to identify the dataset type with the given name.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises
        `FileNotFoundError` (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they are
        compatible with this minimum.
    lookupFunction : `typing.Callable`, optional
        An optional callable function that will look up PrerequisiteInputs
        using the DatasetType, registry, quantum dataId, and input collections
        passed to it. If no function is specified, the default temporal/spatial
        lookup will be used.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.

    Notes
    -----
    Prerequisite inputs are used for datasets that must exist in the data
    repository before a pipeline including this task is run; they cannot be
    produced by another task in the same pipeline.

    In exchange for this limitation, they have a number of advantages relative
    to regular `Input` connections:

    - The query used to find them during `QuantumGraph` generation can be
      fully customized by providing a ``lookupFunction``.
    - Failed searches for prerequisites during `QuantumGraph` generation will
      usually generate more helpful diagnostics than those for regular `Input`
      connections.
    - The default query for prerequisite inputs relates the quantum dimensions
      directly to the dimensions of its dataset type, without being constrained
      by any of the other dimensions in the pipeline. This allows them to be
      used for temporal calibration lookups (which regular `Input` connections
      cannot do at present) and to work around `QuantumGraph` generation
      limitations involving cases where naive spatial overlap relationships
      between dimensions are not desired (e.g. a task that wants all detectors
      in each visit for which the visit overlaps a tract, not just those where
      that detector+visit combination overlaps the tract).
    - Prerequisite inputs may be optional (regular inputs are never optional).
    """

    lookupFunction: Callable[
        [DatasetType, Registry, DataCoordinate, Sequence[str]], Iterable[DatasetRef]
    ] | None = None

    _connection_type_set: ClassVar[str] = "prerequisiteInputs"
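
# A sketch of a custom ``lookupFunction`` matching the signature above; the
# function name and the query in its body are illustrative, assuming a plain
# `Registry.queryDatasets` call is an acceptable way to resolve the
# references:
#
#     def _queryPrerequisite(datasetType, registry, quantumDataId, collections):
#         return registry.queryDatasets(
#             datasetType, collections=collections, dataId=quantumDataId,
#             findFirst=True,
#         )
#
#     refCat = PrerequisiteInput(
#         doc="Reference catalog shards.",
#         name="ref_cat",
#         storageClass="SimpleCatalog",
#         dimensions=("skypix",),
#         multiple=True,
#         lookupFunction=_queryPrerequisite,
#     )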


@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    _connection_type_set: ClassVar[str] = "outputs"


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    _connection_type_set: ClassVar[str] = "initInputs"


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    _connection_type_set: ClassVar[str] = "initOutputs"
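
# Putting it together: a sketch of how these connection types are typically
# combined in a `PipelineTaskConnections` subclass. The connections class and
# all dataset type and storage class names are illustrative:
#
#     from lsst.pipe.base import PipelineTaskConnections
#
#     class ExampleConnections(
#         PipelineTaskConnections, dimensions=("instrument", "visit", "detector")
#     ):
#         inputSchema = InitInput(
#             doc="Schema of the input catalog.",
#             name="src_schema",
#             storageClass="SourceCatalog",
#         )
#         exposure = Input(
#             doc="Exposure to measure.",
#             name="calexp",
#             storageClass="ExposureF",
#             dimensions=("instrument", "visit", "detector"),
#         )
#         outputCatalog = Output(
#             doc="Measured sources.",
#             name="exampleSrc",
#             storageClass="SourceCatalog",
#             dimensions=("instrument", "visit", "detector"),
#         )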