Coverage for python/lsst/pipe/base/connectionTypes.py: 80%
63 statements
coverage.py v7.2.7, created at 2023-07-12 11:14 -0700
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
22"""Module defining connection types to be used within a
23`PipelineTaskConnections` class.
24"""
26__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"]
28import dataclasses
29from collections.abc import Callable, Iterable, Sequence
30from typing import ClassVar
32from lsst.daf.butler import DataCoordinate, DatasetRef, DatasetType, DimensionUniverse, Registry, StorageClass


@dataclasses.dataclass(frozen=True)
class BaseConnection:
    """Base class used for declaring `PipelineTask` connections.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`
        Documentation for this connection.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type.  Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and
        notify the execution system as early as possible of outputs that will
        not be produced because the corresponding input is missing.
    deprecated : `str`, optional
        A description of why this connection is deprecated, including the
        version after which it may be removed.

        If not `None`, the string is appended to the docstring for this
        connection and the corresponding config Field.
    """

    name: str
    storageClass: str
    doc: str = ""
    multiple: bool = False
    deprecated: str | None = dataclasses.field(default=None, kw_only=True)

    _connection_type_set: ClassVar[str]

    def __get__(self, inst, klass):
        """Descriptor access method.

        This is a method used to turn a connection into a descriptor.
        When a connection is added to a connection class, it is a class-level
        variable.  This method makes accessing this connection, on the
        instance of the connection class owning this connection, return a
        result specialized for that instance.  In the case of connections
        this specifically means names specified in a config instance will
        be visible instead of the default names for the connection, and that
        removed connections will not be accessible on the instance.
        """
        # If inst is None, this is being accessed by the class and not an
        # instance; return this connection itself.
        if inst is None:
            return self
        # Attempt to return the configured connection object from the
        # connection instance's allConnections mapping.
        try:
            return inst.allConnections[self.varName]
        except KeyError:
            raise AttributeError(
                f"Connection {self.varName!r} of {klass.__name__} has been removed."
            ) from None
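
    # A hedged illustration of the descriptor behavior (``MyConnections`` and
    # ``calexp`` are assumed names, not defined in this module):
    #
    #     MyConnections.calexp          # class access -> the declared connection
    #     MyConnections(config=config).calexp
    #                                   # instance access -> connection carrying
    #                                   # the name configured in ``config``
    #
    # Accessing a connection that was removed in the instance's __init__
    # raises AttributeError, as implemented above.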

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None
    ) -> DatasetType:
        """Construct a true `~lsst.daf.butler.DatasetType` instance with
        normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `~lsst.daf.butler.DatasetType`
            The `~lsst.daf.butler.DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass
        )
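
# A hedged sketch of how makeDatasetType might be exercised (``butler`` is a
# hypothetical, already-initialized lsst.daf.butler.Butler, and the dataset
# type name is illustrative):
#
#     conn = BaseConnection(name="packages", storageClass="Packages")
#     datasetType = conn.makeDatasetType(butler.registry.dimensions)
#
# Because BaseConnection declares no dimensions, the resulting DatasetType is
# constructed with the empty dimension set (``universe.empty``).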


@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring `PipelineTask` connections that include
    dimensions.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type.  Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type identified by the specified name.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False` (default)
        otherwise.
    """

    dimensions: Iterable[str] = ()
    isCalibration: bool = False

    def __post_init__(self):
        if isinstance(self.dimensions, str):
            raise TypeError(
                "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma"
            )
        if not isinstance(self.dimensions, Iterable):
            raise TypeError("Dimensions must be iterable of dimensions")
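
    # A common pitfall the str check above guards against (the dataset type
    # name is illustrative):
    #
    #     Output(name="coadd", storageClass="ExposureF", dimensions=("tract"))
    #         # ("tract") is just the str "tract" -> TypeError above
    #     Output(name="coadd", storageClass="ExposureF", dimensions=("tract",))
    #         # trailing comma makes a one-element tuple -> OK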

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None
    ) -> DatasetType:
        """Construct a true `~lsst.daf.butler.DatasetType` instance with
        normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `~lsst.daf.butler.DatasetType`
            The `~lsst.daf.butler.DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name,
            universe.extract(self.dimensions),
            self.storageClass,
            isCalibration=self.isCalibration,
            parentStorageClass=parentStorageClass,
        )


@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Class used for declaring `PipelineTask` input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type.  Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type identified by the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`.  PipelineTasks can use this
        handle to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail).  `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        (for regular `Input` connections) if ``minimum`` is zero, which is not
        currently supported by our QuantumGraph generation algorithm.
    """

    deferLoad: bool = False
    minimum: int = 1

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum > 1 and not self.multiple:
            raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")
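
# Illustrative values showing the consistency check above: a minimum greater
# than one only makes sense when the connection can hold multiple datasets.
#
#     BaseInput(name="srcs", storageClass="SourceCatalog",
#               dimensions=("visit",), multiple=True, minimum=2)   # OK
#     BaseInput(name="srcs", storageClass="SourceCatalog",
#               dimensions=("visit",), minimum=2)   # TypeError: multiple=False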


@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
    """Class used for declaring `PipelineTask` input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type.  Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type identified by the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`.  PipelineTasks can use this
        handle to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail).  `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.
    deferGraphConstraint : `bool`, optional
        If `True`, do not include this dataset type's existence in the initial
        query that starts the QuantumGraph generation process.  This can be
        used to make QuantumGraph generation faster by avoiding redundant
        datasets, and in certain cases it can (along with careful attention to
        which tasks are included in the same QuantumGraph) be used to work
        around the QuantumGraph generation algorithm's inflexible handling of
        spatial overlaps.  This option has no effect when the connection is
        not an overall input of the pipeline (or subset thereof) for which a
        graph is being created, and it never affects the ordering of quanta.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero; a minimum of zero is not currently supported
        by our QuantumGraph generation algorithm for regular inputs.
    """

    deferGraphConstraint: bool = False

    _connection_type_set: ClassVar[str] = "inputs"

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum == 0:
            raise TypeError(f"Cannot set minimum={self.minimum} for regular input.")
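
# A minimal usage sketch (hedged: PipelineTaskConnections lives in
# lsst.pipe.base rather than this module, and the class and dataset type
# names are illustrative):
#
#     from lsst.pipe.base import PipelineTaskConnections
#
#     class ExampleConnections(
#         PipelineTaskConnections, dimensions=("instrument", "visit", "detector")
#     ):
#         exposure = Input(
#             doc="Calibrated exposure to process.",
#             name="calexp",
#             storageClass="ExposureF",
#             dimensions=("instrument", "visit", "detector"),
#         )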


@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
    """Class used for declaring `PipelineTask` prerequisite input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type.  Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type identified by the specified name.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises
        `FileNotFoundError` (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they are
        compatible with this minimum.
    lookupFunction : `typing.Callable`, optional
        An optional callable that will look up prerequisite inputs using the
        dataset type, registry, quantum data ID, and input collections passed
        to it.  If no function is specified, the default spatial/temporal
        lookup will be used.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.

    Notes
    -----
    Prerequisite inputs are used for datasets that must exist in the data
    repository before a pipeline including this task is run; they cannot be
    produced by another task in the same pipeline.

    In exchange for this limitation, they have a number of advantages relative
    to regular `Input` connections:

    - The query used to find them during `QuantumGraph` generation can be
      fully customized by providing a ``lookupFunction``, as sketched after
      this class.
    - Failed searches for prerequisites during `QuantumGraph` generation will
      usually generate more helpful diagnostics than those for regular `Input`
      connections.
    - The default query for prerequisite inputs relates the quantum dimensions
      directly to the dimensions of its dataset type, without being
      constrained by any of the other dimensions in the pipeline.  This allows
      them to be used for temporal calibration lookups (which regular `Input`
      connections cannot do at present) and to work around `QuantumGraph`
      generation limitations involving cases where naive spatial overlap
      relationships between dimensions are not desired (e.g. a task that wants
      all detectors in each visit for which the visit overlaps a tract, not
      just those where that detector+visit combination overlaps the tract).
    - Prerequisite inputs may be optional (regular inputs are never optional).
    """

    lookupFunction: Callable[
        [DatasetType, Registry, DataCoordinate, Sequence[str]], Iterable[DatasetRef]
    ] | None = None

    _connection_type_set: ClassVar[str] = "prerequisiteInputs"
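
# A hedged sketch of a custom ``lookupFunction`` matching the signature
# declared above (the query is illustrative; the default lookup is more
# sophisticated):
#
#     def lookupByDataId(
#         datasetType: DatasetType,
#         registry: Registry,
#         dataId: DataCoordinate,
#         collections: Sequence[str],
#     ) -> Iterable[DatasetRef]:
#         return registry.queryDatasets(
#             datasetType, collections=collections, dataId=dataId
#         )
#
# which would then be passed as ``lookupFunction=lookupByDataId`` when
# declaring the PrerequisiteInput.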


@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    """Connection for output dataset."""

    _connection_type_set: ClassVar[str] = "outputs"


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    """Connection for initInput dataset."""

    _connection_type_set: ClassVar[str] = "initInputs"


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    """Connection for initOutput dataset."""

    _connection_type_set: ClassVar[str] = "initOutputs"
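
# An end-to-end sketch tying the connection types together (hedged: every
# class, dataset type, and collection name below is illustrative, and
# PipelineTaskConnections is defined in lsst.pipe.base, not here):
#
#     from lsst.pipe.base import PipelineTaskConnections
#
#     class CalibrateConnections(
#         PipelineTaskConnections, dimensions=("instrument", "visit", "detector")
#     ):
#         icSourceSchema = InitInput(
#             doc="Schema produced during image characterization.",
#             name="icSrc_schema",
#             storageClass="SourceCatalog",
#         )
#         astromRefCat = PrerequisiteInput(
#             doc="Reference catalog to match against.",
#             name="gaia_dr2",
#             storageClass="SimpleCatalog",
#             dimensions=("skypix",),
#             multiple=True,
#             deferLoad=True,
#         )
#         outputExposure = Output(
#             doc="Calibrated exposure.",
#             name="calexp",
#             storageClass="ExposureF",
#             dimensions=("instrument", "visit", "detector"),
#         )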