# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Module defining connection types to be used within a
29`PipelineTaskConnections` class.
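
For example, connection types are declared as class attributes of a
`PipelineTaskConnections` subclass. A minimal, illustrative sketch (the task
and dataset type names below are hypothetical)::

    from lsst.pipe.base import PipelineTaskConnections, connectionTypes

    class ExampleConnections(
        PipelineTaskConnections, dimensions=("instrument", "visit", "detector")
    ):
        exposure = connectionTypes.Input(
            doc="Input exposure to process.",
            name="calexp",
            storageClass="ExposureF",
            dimensions=("instrument", "visit", "detector"),
        )
        catalog = connectionTypes.Output(
            doc="Catalog produced by the task.",
            name="exampleCatalog",
            storageClass="SourceCatalog",
            dimensions=("instrument", "visit", "detector"),
        )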
30"""

__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"]

import dataclasses
from collections.abc import Callable, Iterable, Sequence
from typing import ClassVar

from lsst.daf.butler import DataCoordinate, DatasetRef, DatasetType, DimensionUniverse, Registry, StorageClass
from lsst.utils.introspection import find_outside_stacklevel


@dataclasses.dataclass(frozen=True)
class BaseConnection:
44 """Base class used for declaring `PipelineTask` connections.
46 Attributes
47 ----------
48 name : `str`
49 The name used to identify the dataset type.
50 storageClass : `str`
51 The storage class used when (un)/persisting the dataset type.
52 multiple : `bool`
53 Indicates if this connection should expect to contain multiple objects
54 of the given dataset type. Tasks with more than one connection with
55 ``multiple=True`` with the same dimensions may want to implement
56 `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
57 consistent (i.e. zip-iterable) in `PipelineTask.runQuantum()` and
58 notify the execution system as early as possible of outputs that will
59 not be produced because the corresponding input is missing.
60 deprecated : `str`, optional
61 A description of why this connection is deprecated, including the
62 version after which it may be removed.
64 If not `None`, the string is appended to the docstring for this
65 connection and the corresponding config Field.
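
    Examples
    --------
    A minimal, illustrative sketch (the dataset type and storage class names
    here are hypothetical):

    >>> conn = BaseConnection(
    ...     name="exampleDataset",
    ...     storageClass="StructuredDataDict",
    ...     doc="An example connection.",
    ... )
    >>> conn.name
    'exampleDataset'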
66 """

    name: str
    storageClass: str
    doc: str = ""
    multiple: bool = False
    deprecated: str | None = dataclasses.field(default=None, kw_only=True)

    _connection_type_set: ClassVar[str]
    _deprecation_context: str = ""

    def __post_init__(self):
        if self.deprecated and not self._deprecation_context:
            info = {}
            _ = find_outside_stacklevel("lsst.pipe.base", "dataclasses", stack_info=info)
            object.__setattr__(self, "_deprecation_context", f"{info['filename']}:{info['lineno']}")

    def __get__(self, inst, klass):
        """Descriptor access method.

        This method turns a connection into a descriptor.
        When a connection is added to a connection class, it is a class-level
        variable. This method makes accessing this connection, on an
        instance of the connection class owning this connection, return a
        result specialized for that instance. For connections this
        specifically means that names specified in a config instance will
        be visible instead of the default names for the connection, and that
        removed connections will not be accessible on the instance.
        """
        # If inst is None, this is being accessed by the class and not an
        # instance; return this connection itself.
        if inst is None:
            return self
        # Attempt to return the configured connection object from the
        # connections instance's allConnections mapping.
        try:
            return inst.allConnections[self.varName]
        except KeyError:
            raise AttributeError(
                f"Connection {self.varName!r} of {klass.__name__} has been removed."
            ) from None

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None
    ) -> DatasetType:
        """Construct a true `~lsst.daf.butler.DatasetType` instance with
        normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `~lsst.daf.butler.DatasetType`
            The `~lsst.daf.butler.DatasetType` defined by this connection.
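
        Examples
        --------
        A minimal sketch (the names are hypothetical). A plain
        `BaseConnection` carries no dimensions, so the resulting dataset type
        uses the empty dimension set:

        >>> from lsst.daf.butler import DimensionUniverse
        >>> conn = BaseConnection(name="exampleInit", storageClass="StructuredDataDict")
        >>> conn.makeDatasetType(DimensionUniverse()).name
        'exampleInit'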
126 """
127 return DatasetType(
128 self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass
129 )


@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
134 """Class used for declaring PipelineTask connections that includes
135 dimensions.
137 Attributes
138 ----------
139 name : `str`
140 The name used to identify the dataset type.
141 storageClass : `str`
142 The storage class used when (un)/persisting the dataset type.
143 multiple : `bool`
144 Indicates if this connection should expect to contain multiple objects
145 of the given dataset type. Tasks with more than one connection with
146 ``multiple=True`` with the same dimensions may want to implement
147 `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
148 consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
149 the execution system as early as possible of outputs that will not be
150 produced because the corresponding input is missing.
151 dimensions : iterable of `str`
152 The `lsst.daf.butler.Butler` `lsst.daf.butler.Registry` dimensions used
153 to identify the dataset type identified by the specified name.
154 isCalibration : `bool`, optional
155 `True` if this dataset type may be included in CALIBRATION-type
156 collections to associate it with a validity range, `False` (default)
157 otherwise.
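
    Examples
    --------
    An illustrative sketch (the dataset type name is hypothetical). Note that
    ``dimensions`` must be an iterable of dimension names; a bare string
    (e.g. from an omitted trailing comma) is rejected:

    >>> conn = DimensionedConnection(
    ...     name="exampleExposure",
    ...     storageClass="ExposureF",
    ...     dimensions=("instrument", "visit", "detector"),
    ... )
    >>> DimensionedConnection(
    ...     name="exampleExposure",
    ...     storageClass="ExposureF",
    ...     dimensions=("visit"),
    ... )
    Traceback (most recent call last):
        ...
    TypeError: Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma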
158 """

    dimensions: Iterable[str] = ()
    isCalibration: bool = False

    def __post_init__(self):
        super().__post_init__()
        if isinstance(self.dimensions, str):
            raise TypeError(
                "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma"
            )
        if not isinstance(self.dimensions, Iterable):
            raise TypeError("Dimensions must be iterable of dimensions")

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None
    ) -> DatasetType:
        """Construct a true `~lsst.daf.butler.DatasetType` instance with
        normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `~lsst.daf.butler.DatasetType`
            The `~lsst.daf.butler.DatasetType` defined by this connection.
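
        Examples
        --------
        A minimal sketch (the names are hypothetical); the declared dimension
        names are normalized against the given universe:

        >>> from lsst.daf.butler import DimensionUniverse
        >>> conn = DimensionedConnection(
        ...     name="exampleMap", storageClass="SkyMap", dimensions=("skymap",)
        ... )
        >>> dataset_type = conn.makeDatasetType(DimensionUniverse())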
190 """
191 return DatasetType(
192 self.name,
193 universe.conform(self.dimensions),
194 self.storageClass,
195 isCalibration=self.isCalibration,
196 parentStorageClass=parentStorageClass,
197 )


@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
202 """Class used for declaring PipelineTask input connections.
204 Attributes
205 ----------
206 name : `str`
207 The default name used to identify the dataset type.
208 storageClass : `str`
209 The storage class used when (un)/persisting the dataset type.
210 multiple : `bool`
211 Indicates if this connection should expect to contain multiple objects
212 of the given dataset type. Tasks with more than one connection with
213 ``multiple=True`` with the same dimensions may want to implement
214 `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
215 consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
216 the execution system as early as possible of outputs that will not be
217 produced because the corresponding input is missing.
218 dimensions : iterable of `str`
219 The `lsst.daf.butler.Butler` `lsst.daf.butler.Registry` dimensions used
220 to identify the dataset type identified by the specified name.
221 deferLoad : `bool`
222 Indicates that this dataset type will be loaded as a
223 `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
224 object to load the object at a later time.
225 minimum : `bool`
226 Minimum number of datasets required for this connection, per quantum.
227 This is checked in the base implementation of
228 `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
229 the minimum is not met for `Input` connections (causing the quantum to
230 be pruned, skipped, or never created, depending on the context), and
231 `FileNotFoundError` for `PrerequisiteInput` connections (causing
232 QuantumGraph generation to fail). `PipelineTask` implementations may
233 provide custom `~PipelineTaskConnections.adjustQuantum` implementations
234 for more fine-grained or configuration-driven constraints, as long as
235 they are compatible with this minium.
237 Raises
238 ------
239 TypeError
240 Raised if ``minimum`` is greater than one but ``multiple=False``.
241 NotImplementedError
242 Raised if ``minimum`` is zero for a regular `Input` connection; this
243 is not currently supported by our QuantumGraph generation algorithm.
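
    Examples
    --------
    An illustrative sketch of the ``minimum``/``multiple`` consistency check
    (the names are hypothetical):

    >>> BaseInput(name="src", storageClass="SourceCatalog", minimum=2)
    Traceback (most recent call last):
        ...
    TypeError: Cannot set minimum=2 if multiple=False.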
244 """

    deferLoad: bool = False
    minimum: int = 1

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum > 1 and not self.multiple:
            raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")


@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
257 """Class used for declaring PipelineTask input connections.
259 Attributes
260 ----------
261 name : `str`
262 The default name used to identify the dataset type.
263 storageClass : `str`
264 The storage class used when (un)/persisting the dataset type.
265 multiple : `bool`
266 Indicates if this connection should expect to contain multiple objects
267 of the given dataset type. Tasks with more than one connection with
268 ``multiple=True`` with the same dimensions may want to implement
269 `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
270 consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
271 the execution system as early as possible of outputs that will not be
272 produced because the corresponding input is missing.
273 dimensions : iterable of `str`
274 The `lsst.daf.butler.Butler` `lsst.daf.butler.Registry` dimensions used
275 to identify the dataset type identified by the specified name.
276 deferLoad : `bool`
277 Indicates that this dataset type will be loaded as a
278 `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
279 object to load the object at a later time.
280 minimum : `bool`
281 Minimum number of datasets required for this connection, per quantum.
282 This is checked in the base implementation of
283 `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
284 the minimum is not met for `Input` connections (causing the quantum to
285 be pruned, skipped, or never created, depending on the context), and
286 `FileNotFoundError` for `PrerequisiteInput` connections (causing
287 QuantumGraph generation to fail). `PipelineTask` implementations may
288 provide custom `~PipelineTaskConnections.adjustQuantum` implementations
289 for more fine-grained or configuration-driven constraints, as long as
290 they are compatible with this minium.
291 deferGraphConstraint : `bool`, optional
292 If `True`, do not include this dataset type's existence in the initial
293 query that starts the QuantumGraph generation process. This can be
294 used to make QuantumGraph generation faster by avoiding redundant
295 datasets, and in certain cases it can (along with careful attention to
296 which tasks are included in the same QuantumGraph) be used to work
297 around the QuantumGraph generation algorithm's inflexible handling of
298 spatial overlaps. This option has no effect when the connection is not
299 an overall input of the pipeline (or subset thereof) for which a graph
300 is being created, and it never affects the ordering of quanta.
301 deferBinding : `bool`, optional
302 If `True`, the dataset will not be automatically included in
303 the pipeline graph, ``deferGraphConstraint`` is implied.
304 The custom QuantumGraphBuilder is required to bind it and add a
305 corresponding edge to the pipeline graph.
306 This option allows to have the same dataset type as both
307 input and output of a quantum.
309 Raises
310 ------
311 TypeError
312 Raised if ``minimum`` is greater than one but ``multiple=False``.
313 NotImplementedError
314 Raised if ``minimum`` is zero for a regular `Input` connection; this
315 is not currently supported by our QuantumGraph generation algorithm.
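
    Examples
    --------
    A typical declaration inside a ``PipelineTaskConnections`` subclass (an
    illustrative sketch; the dataset type name is hypothetical):

    >>> exposures = Input(
    ...     doc="Calibrated exposures to combine.",
    ...     name="calexp",
    ...     storageClass="ExposureF",
    ...     dimensions=("instrument", "visit", "detector"),
    ...     multiple=True,
    ...     deferLoad=True,
    ... )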
316 """

    deferGraphConstraint: bool = False

    deferBinding: bool = False

    _connection_type_set: ClassVar[str] = "inputs"

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum == 0:
            raise TypeError(f"Cannot set minimum={self.minimum} for regular input.")


@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
332 """Class used for declaring PipelineTask prerequisite connections.
334 Attributes
335 ----------
336 name : `str`
337 The default name used to identify the dataset type.
338 storageClass : `str`
339 The storage class used when (un)/persisting the dataset type.
340 multiple : `bool`
341 Indicates if this connection should expect to contain multiple objects
342 of the given dataset type. Tasks with more than one connection with
343 ``multiple=True`` with the same dimensions may want to implement
344 `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
345 consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
346 the execution system as early as possible of outputs that will not be
347 produced because the corresponding input is missing.
348 dimensions : iterable of `str`
349 The `lsst.daf.butler.Butler` `lsst.daf.butler.Registry` dimensions used
350 to identify the dataset type identified by the specified name.
351 minimum : `bool`
352 Minimum number of datasets required for this connection, per quantum.
353 This is checked in the base implementation of
354 `PipelineTaskConnections.adjustQuantum`, which raises
355 `FileNotFoundError` (causing QuantumGraph generation to fail).
356 `PipelineTask` implementations may
357 provide custom `~PipelineTaskConnections.adjustQuantum` implementations
358 for more fine-grained or configuration-driven constraints, as long as
359 they are compatible with this minium.
360 lookupFunction : `typing.Callable`, optional
361 An optional callable function that will look up PrerequisiteInputs
362 using the DatasetType, registry, quantum dataId, and input collections
363 passed to it. If no function is specified, the default temporal spatial
364 lookup will be used.
366 Raises
367 ------
368 TypeError
369 Raised if ``minimum`` is greater than one but ``multiple=False``.
371 Notes
372 -----
373 Prerequisite inputs are used for datasets that must exist in the data
374 repository before a pipeline including this is run; they cannot be produced
375 by another task in the same pipeline.
377 In exchange for this limitation, they have a number of advantages relative
378 to regular `Input` connections:
380 - The query used to find them then during `QuantumGraph` generation can be
381 fully customized by providing a ``lookupFunction``.
382 - Failed searches for prerequisites during `QuantumGraph` generation will
383 usually generate more helpful diagnostics than those for regular `Input`
384 connections.
385 - The default query for prerequisite inputs relates the quantum dimensions
386 directly to the dimensions of its dataset type, without being constrained
387 by any of the other dimensions in the pipeline. This allows them to be
388 used for temporal calibration lookups (which regular `Input` connections
389 cannot do at present) and to work around `QuantumGraph` generation
390 limitations involving cases where naive spatial overlap relationships
391 between dimensions are not desired (e.g. a task that wants all detectors
392 in each visit for which the visit overlaps a tract, not just those where
393 that detector+visit combination overlaps the tract).
394 - Prerequisite inputs may be optional (regular inputs are never optional).
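
    Examples
    --------
    A sketch of a custom ``lookupFunction`` (the query shown is an
    illustrative assumption, not the default lookup logic; the dataset type
    name is hypothetical):

    >>> def lookupAllRefcats(datasetType, registry, quantumDataId, collections):
    ...     # Find all matching datasets, ignoring the quantum's dimensions
    ...     # entirely (an unusual choice, shown only for illustration).
    ...     return registry.queryDatasets(datasetType, collections=collections)
    >>> refCat = PrerequisiteInput(
    ...     doc="Reference catalog shards.",
    ...     name="exampleRefCat",
    ...     storageClass="SimpleCatalog",
    ...     dimensions=("htm7",),
    ...     multiple=True,
    ...     lookupFunction=lookupAllRefcats,
    ... )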
395 """

    lookupFunction: (
        Callable[[DatasetType, Registry, DataCoordinate, Sequence[str]], Iterable[DatasetRef]] | None
    ) = None

    _connection_type_set: ClassVar[str] = "prerequisiteInputs"


@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    """Connection for output dataset."""

    _connection_type_set: ClassVar[str] = "outputs"


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    """Connection for initInput dataset."""

    _connection_type_set: ClassVar[str] = "initInputs"


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    """Connection for initOutput dataset."""

    _connection_type_set: ClassVar[str] = "initOutputs"