Coverage for python/lsst/pipe/base/connectionTypes.py: 69%
66 statements
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module defining connection types to be used within a
`PipelineTaskConnections` class.
"""
__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"]

import dataclasses
import typing
from collections.abc import Callable, Iterable, Sequence
from typing import Optional, Union

from lsst.daf.butler import DataCoordinate, DatasetRef, DatasetType, DimensionUniverse, Registry, StorageClass


@dataclasses.dataclass(frozen=True)
class BaseConnection:
38 """Base class used for declaring PipelineTask connections
40 Parameters
41 ----------
42 name : `str`
43 The name used to identify the dataset type
44 storageClass : `str`
45 The storage class used when (un)/persisting the dataset type
46 multiple : `bool`
47 Indicates if this connection should expect to contain multiple objects
48 of the given dataset type. Tasks with more than one connection with
49 ``multiple=True`` with the same dimensions may want to implement
50 `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
51 consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
52 the execution system as early as possible of outputs that will not be
53 produced because the corresponding input is missing.
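
    Examples
    --------
    Connections are normally declared as class attributes of a
    `PipelineTaskConnections` subclass rather than constructed directly, but
    they are ordinary frozen dataclasses (a minimal sketch; the dataset type
    name below is illustrative):

    >>> conn = BaseConnection(name="deepCoadd", storageClass="ExposureF")
    >>> conn.multiple
    False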
54 """

    name: str
    storageClass: str
    doc: str = ""
    multiple: bool = False

    def __get__(self, inst, klass):
        """Descriptor access method.

        This method turns a connection into a descriptor. When a connection
        is added to a connection class, it is a class-level variable.
        Accessing the connection on an instance of the connection class that
        owns it therefore returns a result specialized for that instance; for
        connections this specifically means that names specified in a config
        instance are visible instead of the connection's default names.
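
        Examples
        --------
        A sketch of the resulting behavior, assuming a connections class
        ``MyConnections`` with an ``exposure`` connection and a matching
        config instance (all names here are illustrative):

        >>> MyConnections.exposure.name  # class access: the default name
        'postISRCCD'
        >>> config.connections.exposure = "calexp"
        >>> MyConnections(config=config).exposure.name  # instance access
        'calexp'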
71 """
        # If inst is None, this is being accessed by the class and not an
        # instance; return this connection itself.
        if inst is None:
            return self
        # If no object cache exists, create one to track the instances this
        # connection has been accessed by.
        if not hasattr(inst, "_connectionCache"):
            object.__setattr__(inst, "_connectionCache", {})
        # Look up an existing cached instance.
        idSelf = id(self)
        if idSelf in inst._connectionCache:
            return inst._connectionCache[idSelf]
        # Accumulate the parameters that define this connection.
        params = {}
        for field in dataclasses.fields(self):
            params[field.name] = getattr(self, field.name)
        # Get the name override defined by the instance of the connection
        # class.
        params["name"] = inst._nameOverrides[self.varName]
        # Return a new instance of this connection specialized with the
        # information provided by the connection class instance.
        return inst._connectionCache.setdefault(idSelf, self.__class__(**params))

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: Optional[Union[StorageClass, str]] = None
    ) -> DatasetType:
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass
        )


@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring PipelineTask connections that include
    dimensions.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type given by the specified name.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False` (default)
        otherwise.
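
    Examples
    --------
    ``dimensions`` must be an iterable of dimension names, not a bare string;
    a common pitfall is an omitted trailing comma in a single-element tuple
    (a sketch of the check performed in ``__post_init__``):

    >>> DimensionedConnection(
    ...     name="calexp", storageClass="ExposureF", dimensions=("visit")
    ... )
    Traceback (most recent call last):
        ...
    TypeError: Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma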
143 """

    dimensions: typing.Iterable[str] = ()
    isCalibration: bool = False

    def __post_init__(self):
        if isinstance(self.dimensions, str):
            raise TypeError(
                "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma"
            )
        if not isinstance(self.dimensions, typing.Iterable):
            raise TypeError("Dimensions must be iterable of dimensions")

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: Optional[Union[StorageClass, str]] = None
    ) -> DatasetType:
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
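
        Examples
        --------
        A sketch, assuming a `~lsst.daf.butler.Butler` instance named
        ``butler`` is available to supply the dimension universe:

        >>> connection = DimensionedConnection(
        ...     name="calexp",
        ...     storageClass="ExposureF",
        ...     dimensions=("instrument", "visit", "detector"),
        ... )
        >>> datasetType = connection.makeDatasetType(butler.registry.dimensions)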
173 """
        return DatasetType(
            self.name,
            universe.extract(self.dimensions),
            self.storageClass,
            isCalibration=self.isCalibration,
            parentStorageClass=parentStorageClass,
        )


@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Class used for declaring PipelineTask input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type given by the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero for a regular `Input` connection; the latter is
        not currently supported by our QuantumGraph generation algorithm.
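
    Examples
    --------
    A ``minimum`` greater than one is only consistent with ``multiple=True``
    (a sketch of the validation performed in ``__post_init__``):

    >>> BaseInput(name="src", storageClass="SourceCatalog", minimum=2)
    Traceback (most recent call last):
        ...
    TypeError: Cannot set minimum=2 if multiple=False.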
227 """

    deferLoad: bool = False
    minimum: int = 1

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum > 1 and not self.multiple:
            raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")


@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
    """Class used for declaring PipelineTask input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type given by the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.
    deferGraphConstraint : `bool`, optional
        If `True`, do not include this dataset type's existence in the initial
        query that starts the QuantumGraph generation process. This can be
        used to make QuantumGraph generation faster by avoiding redundant
        datasets, and in certain cases it can (along with careful attention to
        which tasks are included in the same QuantumGraph) be used to work
        around the QuantumGraph generation algorithm's inflexible handling of
        spatial overlaps. This option has no effect when the connection is not
        an overall input of the pipeline (or subset thereof) for which a graph
        is being created, and it never affects the ordering of quanta.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero; a zero minimum is not currently supported by
        our QuantumGraph generation algorithm for regular `Input` connections.
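
    Examples
    --------
    A typical declaration inside a `PipelineTaskConnections` subclass (an
    illustrative sketch; the class, dimension, and dataset type names are not
    part of this module):

    >>> import lsst.pipe.base as pipeBase
    >>> class CalibrateConnections(
    ...     pipeBase.PipelineTaskConnections,
    ...     dimensions=("instrument", "visit", "detector"),
    ... ):
    ...     exposure = Input(
    ...         doc="Exposure to calibrate.",
    ...         name="postISRCCD",
    ...         storageClass="ExposureF",
    ...         dimensions=("instrument", "visit", "detector"),
    ...     )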
292 """

    deferGraphConstraint: bool = False

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum == 0:
            raise TypeError(f"Cannot set minimum={self.minimum} for regular input.")


@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
    """Class used for declaring PipelineTask prerequisite connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type given by the specified name.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises
        `FileNotFoundError` (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they are
        compatible with this minimum.
    lookupFunction : `typing.Callable`, optional
        An optional callable function that will look up PrerequisiteInputs
        using the DatasetType, registry, quantum dataId, and input collections
        passed to it. If no function is specified, the default spatial and
        temporal lookup will be used.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.

    Notes
    -----
    Prerequisite inputs are used for datasets that must exist in the data
    repository before a pipeline including this task is run; they cannot be
    produced by another task in the same pipeline.

    In exchange for this limitation, they have a number of advantages relative
    to regular `Input` connections:

    - The query used to find them during `QuantumGraph` generation can be
      fully customized by providing a ``lookupFunction``.
    - Failed searches for prerequisites during `QuantumGraph` generation will
      usually generate more helpful diagnostics than those for regular `Input`
      connections.
    - The default query for prerequisite inputs relates the quantum dimensions
      directly to the dimensions of its dataset type, without being
      constrained by any of the other dimensions in the pipeline. This allows
      them to be used for temporal calibration lookups (which regular `Input`
      connections cannot do at present) and to work around `QuantumGraph`
      generation limitations involving cases where naive spatial overlap
      relationships between dimensions are not desired (e.g. a task that wants
      all detectors in each visit for which the visit overlaps a tract, not
      just those where that detector+visit combination overlaps the tract).
    - Prerequisite inputs may be optional (regular inputs are never optional).
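
    Examples
    --------
    A sketch of a custom ``lookupFunction`` and its use; the function body and
    all names here are illustrative, not a fixed recipe:

    >>> def lookupByQuantumDataId(datasetType, registry, quantumDataId, collections):
    ...     # Delegate to the registry, constraining only on the quantum
    ...     # data ID rather than the default spatial/temporal relationship.
    ...     return registry.queryDatasets(
    ...         datasetType, collections=collections, dataId=quantumDataId
    ...     )
    >>> refCat = PrerequisiteInput(
    ...     name="gaia_dr2",
    ...     storageClass="SimpleCatalog",
    ...     dimensions=("htm7",),
    ...     multiple=True,
    ...     lookupFunction=lookupByQuantumDataId,
    ... )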
368 """

    lookupFunction: Optional[
        Callable[[DatasetType, Registry, DataCoordinate, Sequence[str]], Iterable[DatasetRef]]
    ] = None


@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    """Class used for declaring PipelineTask output connections."""


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    """Class used for declaring PipelineTask init-input connections,
    loaded at task construction rather than per quantum."""


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    """Class used for declaring PipelineTask init-output connections,
    written at task construction rather than per quantum."""
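
# A minimal sketch of how the types above typically combine in a
# `PipelineTaskConnections` subclass (illustrative only; the class and
# dataset type names are not part of this module):
#
#     class CharacterizeConnections(
#         pipeBase.PipelineTaskConnections,
#         dimensions=("instrument", "visit", "detector"),
#     ):
#         exposure = Input(
#             doc="Exposure to characterize.",
#             name="postISRCCD",
#             storageClass="ExposureF",
#             dimensions=("instrument", "visit", "detector"),
#         )
#         characterized = Output(
#             doc="Characterized exposure.",
#             name="icExp",
#             storageClass="ExposureF",
#             dimensions=("instrument", "visit", "detector"),
#         )
#         outputSchema = InitOutput(
#             doc="Schema of the output catalog.",
#             name="icSrc_schema",
#             storageClass="SourceCatalog",
#         )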