# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module defining connection types to be used within a
`PipelineTaskConnections` class.
"""

__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"]

import dataclasses
import typing
from typing import Callable, Iterable, Optional, Union

from lsst.daf.butler import (
    CollectionSearch,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionUniverse,
    Registry,
    StorageClass,
)


@dataclasses.dataclass(frozen=True)
class BaseConnection:
    """Base class used for declaring `PipelineTask` connections.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`
        Documentation for this connection.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    """

    name: str
    storageClass: str
    doc: str = ""
    multiple: bool = False

    def __get__(self, inst, klass):
        """Descriptor access method.

        This method turns a connection into a descriptor. When a connection
        is added to a connection class, it is a class-level variable. This
        method makes accessing the connection on an instance of the
        connection class return a result specialized for that instance; for
        connections this specifically means that names specified in a config
        instance are visible instead of the connection's default names.
        """
        # If inst is None, this is being accessed by the class and not an
        # instance; return this connection itself.
        if inst is None:
            return self
        # If no object cache exists, create one to track the instances this
        # connection has been accessed by.
        if not hasattr(inst, "_connectionCache"):
            object.__setattr__(inst, "_connectionCache", {})
        # Look up an existing cached instance.
        idSelf = id(self)
        if idSelf in inst._connectionCache:
            return inst._connectionCache[idSelf]
        # Accumulate the parameters that define this connection.
        params = {}
        for field in dataclasses.fields(self):
            params[field.name] = getattr(self, field.name)
        # Get the name override defined by the instance of the connection
        # class.
        params["name"] = inst._nameOverrides[self.varName]
        # Return a new instance of this connection, specialized with the
        # information provided by the connection class instance.
        return inst._connectionCache.setdefault(idSelf, self.__class__(**params))
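
    # A minimal sketch of the descriptor behavior described above; the
    # connections class, connection, and config object are illustrative,
    # not part of this module:
    #
    #     class ExampleConnections(PipelineTaskConnections,
    #                              dimensions=("instrument", "visit")):
    #         calexp = Input(name="calexp", storageClass="ExposureF",
    #                        dimensions=("instrument", "visit", "detector"))
    #
    #     ExampleConnections.calexp.name   # "calexp": class access returns
    #                                      # the connection itself
    #     connections = ExampleConnections(config=config)
    #     connections.calexp.name          # name as overridden in ``config``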

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: Optional[Union[StorageClass, str]] = None
    ) -> DatasetType:
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass
        )
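

# ``BaseConnection.makeDatasetType`` produces a dataset type with empty
# dimensions, as used for init-input/init-output connections. A sketch,
# assuming a `~lsst.daf.butler.Butler` instance named ``butler`` and an
# illustrative dataset type name:
#
#     connection = InitOutput(name="exampleTask_schema",
#                             storageClass="SourceCatalog")
#     datasetType = connection.makeDatasetType(butler.registry.dimensions)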


@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring `PipelineTask` connections that include
    dimensions.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type with the given name.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False` (default)
        otherwise.
    """

    dimensions: typing.Iterable[str] = ()
    isCalibration: bool = False

    def __post_init__(self):
        if isinstance(self.dimensions, str):
            raise TypeError(
                "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma"
            )
        if not isinstance(self.dimensions, typing.Iterable):
            raise TypeError("Dimensions must be iterable of dimensions")
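
    # The check above guards against a common mistake: a one-element tuple
    # written without its trailing comma is just a parenthesized `str`. A
    # sketch with a hypothetical dimension name:
    #
    #     dimensions=("visit")    # a str; raises TypeError
    #     dimensions=("visit",)   # a one-element tuple; accepted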

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: Optional[Union[StorageClass, str]] = None
    ) -> DatasetType:
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name,
            universe.extract(self.dimensions),
            self.storageClass,
            isCalibration=self.isCalibration,
            parentStorageClass=parentStorageClass,
        )
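

# ``DimensionedConnection.makeDatasetType`` normalizes the configured
# dimension names via ``universe.extract``, which expands them into a full
# `~lsst.daf.butler.DimensionGraph` including required dependencies. A
# sketch, assuming ``butler`` is a `~lsst.daf.butler.Butler` and using
# illustrative values:
#
#     connection = DimensionedConnection(
#         name="calexp",
#         storageClass="ExposureF",
#         dimensions=("instrument", "visit", "detector"),
#     )
#     datasetType = connection.makeDatasetType(butler.registry.dimensions)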


@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Base class used for declaring `PipelineTask` input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type with the given name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero for a regular `Input` connection; the latter is
        not currently supported by our QuantumGraph generation algorithm.
    """

    deferLoad: bool = False
    minimum: int = 1

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum > 1 and not self.multiple:
            raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")
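

# A quick sketch of the constraint checked above (all values illustrative):
#
#     BaseInput(name="catalogs", storageClass="SourceCatalog",
#               dimensions=("visit",), multiple=True, minimum=3)   # accepted
#     BaseInput(name="catalogs", storageClass="SourceCatalog",
#               dimensions=("visit",), minimum=3)                  # TypeError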


@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
    """Class used for declaring `PipelineTask` input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type with the given name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.
    deferGraphConstraint : `bool`, optional
        If `True`, do not include this dataset type's existence in the initial
        query that starts the QuantumGraph generation process. This can be
        used to make QuantumGraph generation faster by avoiding redundant
        datasets, and in certain cases it can (along with careful attention to
        which tasks are included in the same QuantumGraph) be used to work
        around the QuantumGraph generation algorithm's inflexible handling of
        spatial overlaps. This option has no effect when the connection is not
        an overall input of the pipeline (or subset thereof) for which a graph
        is being created, and it never affects the ordering of quanta.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero; the latter is not currently supported by our
        QuantumGraph generation algorithm.
    """

    deferGraphConstraint: bool = False

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum == 0:
            raise TypeError(f"Cannot set minimum={self.minimum} for regular input.")
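

# A minimal sketch of declaring an `Input` in a connections class; the class,
# dataset, and dimension names here are illustrative, not part of this module:
#
#     from lsst.pipe.base import PipelineTaskConnections
#     from lsst.pipe.base import connectionTypes as cT
#
#     class ExampleConnections(PipelineTaskConnections,
#                              dimensions=("instrument", "visit", "detector")):
#         calexp = cT.Input(
#             name="calexp",
#             storageClass="ExposureF",
#             doc="Calibrated exposure to process.",
#             dimensions=("instrument", "visit", "detector"),
#             deferLoad=True,  # runQuantum receives a DeferredDatasetHandle
#         )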


@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
    """Class used for declaring `PipelineTask` prerequisite input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type with the given name.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises
        `FileNotFoundError` (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they are
        compatible with this minimum.
    lookupFunction : `typing.Callable`, optional
        An optional callable that will look up prerequisite inputs using the
        `DatasetType`, registry, quantum data ID, and input collections passed
        to it. If no function is specified, the default spatial and temporal
        lookup will be used.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.

    Notes
    -----
    Prerequisite inputs are used for datasets that must exist in the data
    repository before a pipeline including this task is run; they cannot be
    produced by another task in the same pipeline.

    In exchange for this limitation, they have a number of advantages relative
    to regular `Input` connections:

    - The query used to find them during `QuantumGraph` generation can be
      fully customized by providing a ``lookupFunction``.
    - Failed searches for prerequisites during `QuantumGraph` generation will
      usually generate more helpful diagnostics than those for regular `Input`
      connections.
    - The default query for prerequisite inputs relates the quantum dimensions
      directly to the dimensions of its dataset type, without being
      constrained by any of the other dimensions in the pipeline. This allows
      them to be used for temporal calibration lookups (which regular `Input`
      connections cannot do at present) and to work around `QuantumGraph`
      generation limitations involving cases where naive spatial overlap
      relationships between dimensions are not desired (e.g. a task that wants
      all detectors in each visit for which the visit overlaps a tract, not
      just those where that detector+visit combination overlaps the tract).
    - Prerequisite inputs may be optional (regular inputs are never optional).
    """

    lookupFunction: Optional[
        Callable[[DatasetType, Registry, DataCoordinate, CollectionSearch], Iterable[DatasetRef]]
    ] = None
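

# A minimal sketch of a custom ``lookupFunction``; the function name, dataset
# type, and choice of query are hypothetical, not part of this module:
#
#     def lookupByInstrumentOnly(datasetType, registry, quantumDataId,
#                                collections):
#         # Ignore the quantum's spatial/temporal dimensions and search by
#         # instrument alone.
#         return registry.queryDatasets(
#             datasetType,
#             collections=collections,
#             dataId=quantumDataId.subset(
#                 registry.dimensions.extract(["instrument"])
#             ),
#             findFirst=True,
#         )
#
#     calib = PrerequisiteInput(
#         name="exampleCalib",
#         storageClass="ExposureF",
#         dimensions=("instrument",),
#         isCalibration=True,
#         lookupFunction=lookupByInstrumentOnly,
#     )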


@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    """Class used for declaring `PipelineTask` output connections."""


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    """Class used for declaring `PipelineTask` init-input connections."""


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    """Class used for declaring `PipelineTask` init-output connections."""
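

# Outputs and init-outputs are declared the same way as inputs. A sketch with
# illustrative names:
#
#     class ExampleConnections(PipelineTaskConnections,
#                              dimensions=("instrument", "visit", "detector")):
#         outputCatalog = Output(
#             name="exampleSrc",
#             storageClass="SourceCatalog",
#             doc="Catalog produced by the task.",
#             dimensions=("instrument", "visit", "detector"),
#         )
#         outputSchema = InitOutput(
#             name="exampleSrc_schema",
#             storageClass="SourceCatalog",
#             doc="Schema for ``outputCatalog``.",
#         )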