# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Module defining connection types to be used within a
23`PipelineTaskConnections` class.
24"""
26__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"]
28import dataclasses
29from collections.abc import Callable, Iterable, Sequence
30from typing import ClassVar
32from lsst.daf.butler import DataCoordinate, DatasetRef, DatasetType, DimensionUniverse, Registry, StorageClass
33from lsst.utils.introspection import find_outside_stacklevel


@dataclasses.dataclass(frozen=True)
class BaseConnection:
    """Base class used for declaring `PipelineTask` connections.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`
        Documentation for this connection; also used as the documentation
        for the corresponding config field.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    deprecated : `str`, optional
        A description of why this connection is deprecated, including the
        version after which it may be removed.

        If not `None`, the string is appended to the docstring for this
        connection and the corresponding config Field.
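
    Examples
    --------
    A minimal sketch of declaring a connection on a connections class; the
    class name, dataset type name, and storage class here are illustrative
    assumptions, not prescriptions::

        from lsst.pipe.base import PipelineTaskConnections, connectionTypes

        class ExampleConnections(
            PipelineTaskConnections, dimensions=("instrument", "visit", "detector")
        ):
            exposure = connectionTypes.Input(
                doc="Input exposure to process.",
                name="calexp",
                storageClass="ExposureF",
                dimensions=("instrument", "visit", "detector"),
            )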
60 """

    name: str
    storageClass: str
    doc: str = ""
    multiple: bool = False
    deprecated: str | None = dataclasses.field(default=None, kw_only=True)

    _connection_type_set: ClassVar[str]
    _deprecation_context: str = ""

    def __post_init__(self):
        if self.deprecated and not self._deprecation_context:
            info = {}
            _ = find_outside_stacklevel("lsst.pipe.base", "dataclasses", stack_info=info)
            object.__setattr__(self, "_deprecation_context", f"{info['filename']}:{info['lineno']}")

    def __get__(self, inst, klass):
        """Descriptor access method.

        This is a method used to turn a connection into a descriptor.
        When a connection is added to a connection class, it is a class level
        variable. This method makes accessing this connection, on the
        instance of the connection class owning this connection, return a
        result specialized for that instance. In the case of connections
        this specifically means names specified in a config instance will
        be visible instead of the default names for the connection, and that
        removed connections will not be accessible on the instance.
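
        For example, given a hypothetical connections instance
        ``connections`` with a connection declared as ``exposure``::

            type(connections).exposure  # class access: the connection as declared
            connections.exposure        # instance access: reflects config overrides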
88 """
        # If inst is None, this is being accessed by the class and not an
        # instance, return this connection itself
        if inst is None:
            return self
        # Attempt to return the configured connection object from the
        # connections instance allConnections mapping.
        try:
            return inst.allConnections[self.varName]
        except KeyError:
            raise AttributeError(
                f"Connection {self.varName!r} of {klass.__name__} has been removed."
            ) from None

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None
    ) -> DatasetType:
        """Construct a true `~lsst.daf.butler.DatasetType` instance with
        normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `~lsst.daf.butler.DatasetType`
            The `~lsst.daf.butler.DatasetType` defined by this connection.
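
        Examples
        --------
        A sketch of registering this connection's dataset type with a butler;
        ``butler`` and ``connection`` here are assumed, pre-existing objects::

            dataset_type = connection.makeDatasetType(butler.dimensions)
            butler.registry.registerDatasetType(dataset_type)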
120 """
121 return DatasetType(
122 self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass
123 )


@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring `PipelineTask` connections that include
    dimensions.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type with the specified name.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False` (default)
        otherwise.
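
    Examples
    --------
    An illustrative declaration of a per-detector calibration output that
    could later be certified into a CALIBRATION collection; the dataset type
    and storage class names are assumptions::

        bias = Output(
            doc="Combined bias frame.",
            name="bias",
            storageClass="ExposureF",
            dimensions=("instrument", "detector"),
            isCalibration=True,
        )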
152 """
154 dimensions: Iterable[str] = ()
155 isCalibration: bool = False
157 def __post_init__(self):
158 super().__post_init__()
159 if isinstance(self.dimensions, str): 159 ↛ 160line 159 didn't jump to line 160, because the condition on line 159 was never true
160 raise TypeError(
161 "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma"
162 )
163 if not isinstance(self.dimensions, Iterable): 163 ↛ 164line 163 didn't jump to line 164, because the condition on line 163 was never true
164 raise TypeError("Dimensions must be iterable of dimensions")

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None
    ) -> DatasetType:
        """Construct a true `~lsst.daf.butler.DatasetType` instance with
        normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `~lsst.daf.butler.DatasetType`
            The `~lsst.daf.butler.DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name,
            universe.extract(self.dimensions),
            self.storageClass,
            isCalibration=self.isCalibration,
            parentStorageClass=parentStorageClass,
        )


@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Class used for declaring `PipelineTask` input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type with the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.
        Also raised if ``minimum`` is zero for a regular `Input` connection;
        this is not currently supported by our QuantumGraph generation
        algorithm.
    """

    deferLoad: bool = False
    minimum: int = 1

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum > 1 and not self.multiple:
            raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")


@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
    """Class used for declaring `PipelineTask` input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type with the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.
    deferGraphConstraint : `bool`, optional
        If `True`, do not include this dataset type's existence in the initial
        query that starts the QuantumGraph generation process. This can be
        used to make QuantumGraph generation faster by avoiding redundant
        datasets, and in certain cases it can (along with careful attention to
        which tasks are included in the same QuantumGraph) be used to work
        around the QuantumGraph generation algorithm's inflexible handling of
        spatial overlaps. This option has no effect when the connection is not
        an overall input of the pipeline (or subset thereof) for which a graph
        is being created, and it never affects the ordering of quanta.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.
        Also raised if ``minimum`` is zero for a regular `Input` connection;
        this is not currently supported by our QuantumGraph generation
        algorithm.
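
    Examples
    --------
    A sketch of a deferred-load input declaration (dataset type and storage
    class names are illustrative); in ``runQuantum`` the task then receives a
    `lsst.daf.butler.DeferredDatasetHandle` and calls its ``get()`` method
    only when the data are actually needed::

        exposure = Input(
            doc="Calibrated exposure, loaded on demand.",
            name="calexp",
            storageClass="ExposureF",
            dimensions=("instrument", "visit", "detector"),
            deferLoad=True,
        )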
303 """
305 deferGraphConstraint: bool = False
307 _connection_type_set: ClassVar[str] = "inputs"
309 def __post_init__(self) -> None:
310 super().__post_init__()
311 if self.minimum == 0: 311 ↛ 312line 311 didn't jump to line 312, because the condition on line 311 was never true
312 raise TypeError(f"Cannot set minimum={self.minimum} for regular input.")


@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
    """Class used for declaring `PipelineTask` prerequisite connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type with the specified name.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises
        `FileNotFoundError` (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they are
        compatible with this minimum.
    lookupFunction : `typing.Callable`, optional
        An optional callable that will look up prerequisite inputs using the
        dataset type, registry, quantum data ID, and input collections passed
        to it. If no function is specified, the default spatial and temporal
        lookup will be used.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.

    Notes
    -----
    Prerequisite inputs are used for datasets that must exist in the data
    repository before a pipeline including this task is run; they cannot be
    produced by another task in the same pipeline.

    In exchange for this limitation, they have a number of advantages relative
    to regular `Input` connections:

    - The query used to find them during `QuantumGraph` generation can be
      fully customized by providing a ``lookupFunction``.
    - Failed searches for prerequisites during `QuantumGraph` generation will
      usually generate more helpful diagnostics than those for regular `Input`
      connections.
    - The default query for prerequisite inputs relates the quantum dimensions
      directly to the dimensions of its dataset type, without being constrained
      by any of the other dimensions in the pipeline. This allows them to be
      used for temporal calibration lookups (which regular `Input` connections
      cannot do at present) and to work around `QuantumGraph` generation
      limitations involving cases where naive spatial overlap relationships
      between dimensions are not desired (e.g. a task that wants all detectors
      in each visit for which the visit overlaps a tract, not just those where
      that detector+visit combination overlaps the tract).
    - Prerequisite inputs may be optional (regular inputs are never optional).
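
    Examples
    --------
    A minimal sketch of a custom ``lookupFunction`` matching the expected
    signature; the name, dataset type, and the instrument-only dimension
    subsetting shown here are illustrative assumptions, not the default
    behavior::

        def lookupByInstrument(datasetType, registry, quantumDataId, collections):
            # Ignore the quantum's other dimensions and search for the
            # dataset by instrument alone.
            return registry.queryDatasets(
                datasetType,
                collections=collections,
                dataId=quantumDataId.subset(registry.dimensions.extract(["instrument"])),
                findFirst=True,
            )

        curve = PrerequisiteInput(
            doc="Externally produced calibration (illustrative).",
            name="transmission_optics",
            storageClass="TransmissionCurve",
            dimensions=("instrument",),
            lookupFunction=lookupByInstrument,
        )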
380 """
382 lookupFunction: Callable[
383 [DatasetType, Registry, DataCoordinate, Sequence[str]], Iterable[DatasetRef]
384 ] | None = None
386 _connection_type_set: ClassVar[str] = "prerequisiteInputs"


@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    """Connection for an output dataset."""

    _connection_type_set: ClassVar[str] = "outputs"


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    """Connection for an initInput dataset, consumed when the task is
    constructed rather than per-quantum.
    """

    _connection_type_set: ClassVar[str] = "initInputs"


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    """Connection for an initOutput dataset, produced when the task is
    constructed rather than per-quantum.
    """

    _connection_type_set: ClassVar[str] = "initOutputs"