Coverage for python/lsst/pipe/base/connectionTypes.py: 80% (62 statements)
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Module defining connection types to be used within a
23`PipelineTaskConnections` class.
24"""
26__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"]
28import dataclasses
29from collections.abc import Callable, Iterable, Sequence
30from typing import ClassVar
32from lsst.daf.butler import DataCoordinate, DatasetRef, DatasetType, DimensionUniverse, Registry, StorageClass


@dataclasses.dataclass(frozen=True)
class BaseConnection:
    """Base class used for declaring `PipelineTask` connections.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`, optional
        A brief description of the purpose of this connection's dataset.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and
        notify the execution system as early as possible of outputs that will
        not be produced because the corresponding input is missing.
    """

    name: str
    storageClass: str
    doc: str = ""
    multiple: bool = False

    _connection_type_set: ClassVar[str]

    def __get__(self, inst, klass):
        """Descriptor access method.

        This method turns a connection into a descriptor. When a connection
        is added to a connection class, it is a class-level variable. This
        method makes accessing the connection on an instance of the owning
        connection class return a result specialized for that instance: names
        specified in a config instance are visible instead of the connection's
        default names, and removed connections are not accessible on the
        instance.
        """
        # If inst is None, this is being accessed by the class and not an
        # instance; return this connection itself.
        if inst is None:
            return self
        # Attempt to return the configured connection object from the
        # connection instance's allConnections mapping.
        try:
            return inst.allConnections[self.varName]
        except KeyError:
            raise AttributeError(
                f"Connection {self.varName!r} of {klass.__name__} has been removed."
            ) from None

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None
    ) -> DatasetType:
        """Construct a true `~lsst.daf.butler.DatasetType` instance with
        normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `~lsst.daf.butler.DatasetType`
            The `~lsst.daf.butler.DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass
        )
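
# Example (illustrative sketch, not part of the module): resolving a
# connection into a concrete dataset type. ``butler`` and the connection
# values below are hypothetical; any `~lsst.daf.butler.Butler` registry
# provides the required `~lsst.daf.butler.DimensionUniverse`::
#
#     connection = BaseConnection(name="summary_stats", storageClass="StructuredDataDict")
#     dataset_type = connection.makeDatasetType(butler.registry.dimensions)
#     assert dataset_type.name == "summary_stats"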


@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring `PipelineTask` connections that include
    dimensions.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The dimensions, as defined in the `lsst.daf.butler.Registry`, used to
        identify the dataset type with the specified name.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False` (default)
        otherwise.
    """

    dimensions: Iterable[str] = ()
    isCalibration: bool = False

    def __post_init__(self):
        if isinstance(self.dimensions, str):
            raise TypeError(
                "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma"
            )
        if not isinstance(self.dimensions, Iterable):
            raise TypeError("Dimensions must be iterable of dimensions")
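
# Note (illustrative sketch): ``dimensions`` must be an iterable of dimension
# *names*, so ``__post_init__`` rejects a bare string to catch the common
# missing-trailing-comma mistake::
#
#     DimensionedConnection(name="n", storageClass="s", dimensions=("visit"))   # str -> TypeError
#     DimensionedConnection(name="n", storageClass="s", dimensions=("visit",))  # 1-tuple -> OK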

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None
    ) -> DatasetType:
        """Construct a true `~lsst.daf.butler.DatasetType` instance with
        normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `~lsst.daf.butler.DatasetType`
            The `~lsst.daf.butler.DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name,
            universe.extract(self.dimensions),
            self.storageClass,
            isCalibration=self.isCalibration,
            parentStorageClass=parentStorageClass,
        )


@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Class used for declaring `PipelineTask` input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The dimensions, as defined in the `lsst.daf.butler.Registry`, used to
        identify the dataset type with the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero for a regular `Input` connection (not currently
        supported by our QuantumGraph generation algorithm).
    """

    deferLoad: bool = False
    minimum: int = 1

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum > 1 and not self.multiple:
            raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")


@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
    """Class used for declaring `PipelineTask` input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The dimensions, as defined in the `lsst.daf.butler.Registry`, used to
        identify the dataset type with the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.
    deferGraphConstraint : `bool`, optional
        If `True`, do not include this dataset type's existence in the initial
        query that starts the QuantumGraph generation process. This can be
        used to make QuantumGraph generation faster by avoiding redundant
        datasets, and in certain cases it can (along with careful attention to
        which tasks are included in the same QuantumGraph) be used to work
        around the QuantumGraph generation algorithm's inflexible handling of
        spatial overlaps. This option has no effect when the connection is not
        an overall input of the pipeline (or subset thereof) for which a graph
        is being created, and it never affects the ordering of quanta.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero (not currently supported by our QuantumGraph
        generation algorithm).
    """

    deferGraphConstraint: bool = False

    _connection_type_set: ClassVar[str] = "inputs"

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum == 0:
            raise TypeError(f"Cannot set minimum={self.minimum} for regular input.")
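
# Example (illustrative sketch): a typical ``Input`` declaration inside a
# `PipelineTaskConnections` subclass. The dataset type name, storage class,
# and dimensions below are common LSST values used purely for illustration::
#
#     class MyConnections(PipelineTaskConnections, dimensions=("instrument", "visit", "detector")):
#         exposure = Input(
#             doc="Calibrated exposure to process.",
#             name="calexp",
#             storageClass="ExposureF",
#             dimensions=("instrument", "visit", "detector"),
#         )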


@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
    """Class used for declaring `PipelineTask` prerequisite connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The dimensions, as defined in the `lsst.daf.butler.Registry`, used to
        identify the dataset type with the specified name.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises
        `FileNotFoundError` (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they are
        compatible with this minimum.
    lookupFunction : `typing.Callable`, optional
        An optional callable that looks up prerequisite inputs from the
        dataset type, registry, quantum data ID, and input collections passed
        to it. If no function is specified, the default spatial and temporal
        lookup is used.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.

    Notes
    -----
    Prerequisite inputs are used for datasets that must exist in the data
    repository before a pipeline including this task is run; they cannot be
    produced by another task in the same pipeline.

    In exchange for this limitation, they have a number of advantages relative
    to regular `Input` connections:

    - The query used to find them during `QuantumGraph` generation can be
      fully customized by providing a ``lookupFunction``.
    - Failed searches for prerequisites during `QuantumGraph` generation will
      usually generate more helpful diagnostics than those for regular `Input`
      connections.
    - The default query for prerequisite inputs relates the quantum dimensions
      directly to the dimensions of its dataset type, without being constrained
      by any of the other dimensions in the pipeline. This allows them to be
      used for temporal calibration lookups (which regular `Input` connections
      cannot do at present) and to work around `QuantumGraph` generation
      limitations involving cases where naive spatial overlap relationships
      between dimensions are not desired (e.g. a task that wants all detectors
      in each visit for which the visit overlaps a tract, not just those where
      that detector+visit combination overlaps the tract).
    - Prerequisite inputs may be optional (regular inputs are never optional).
    """

    lookupFunction: Callable[
        [DatasetType, Registry, DataCoordinate, Sequence[str]], Iterable[DatasetRef]
    ] | None = None

    _connection_type_set: ClassVar[str] = "prerequisiteInputs"
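
# Example (illustrative sketch): a custom ``lookupFunction`` receives the
# dataset type, a registry, the quantum data ID, and the input collections,
# and returns the matching dataset references. This hypothetical version
# simply delegates to `Registry.queryDatasets`::
#
#     def _lookup_refcats(datasetType, registry, dataId, collections):
#         return registry.queryDatasets(datasetType, collections=collections, dataId=dataId)
#
#     refCat = PrerequisiteInput(
#         doc="Reference catalog shards overlapping the quantum region.",
#         name="gaia_dr2_20200414",
#         storageClass="SimpleCatalog",
#         dimensions=("skypix",),
#         multiple=True,
#         lookupFunction=_lookup_refcats,
#     )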


@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    """Connection for an output dataset."""

    _connection_type_set: ClassVar[str] = "outputs"


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    """Connection for an initInput dataset."""

    _connection_type_set: ClassVar[str] = "initInputs"


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    """Connection for an initOutput dataset."""

    _connection_type_set: ClassVar[str] = "initOutputs"
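
# Example (illustrative sketch): init-connections carry no dimensions and are
# read or written once per task initialization rather than per quantum, e.g.
# catalog schemas. The connection names and dataset type names below are
# hypothetical::
#
#     inputSchema = InitInput(
#         doc="Schema of the source catalogs read by this task.",
#         name="src_schema",
#         storageClass="SourceCatalog",
#     )
#     outputSchema = InitOutput(
#         doc="Schema of the catalogs produced by this task.",
#         name="deepCoadd_meas_schema",
#         storageClass="SourceCatalog",
#     )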