Coverage for python/lsst/pipe/base/connectionTypes.py: 69%

65 statements  

coverage.py v6.5.0, created at 2023-01-27 09:57 +0000

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module defining connection types to be used within a
`PipelineTaskConnections` class.
"""

__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"]

import dataclasses
import typing
from collections.abc import Callable, Iterable, Sequence
from typing import Optional, Union

from lsst.daf.butler import DataCoordinate, DatasetRef, DatasetType, DimensionUniverse, Registry, StorageClass


@dataclasses.dataclass(frozen=True)
class BaseConnection:
    """Base class used for declaring PipelineTask connections.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    """


    name: str
    storageClass: str
    doc: str = ""
    multiple: bool = False

    def __get__(self, inst, klass):
        """Descriptor method

        This is a method used to turn a connection into a descriptor.
        When a connection is added to a connection class, it is a class level
        variable. This method makes accessing this connection, on the
        instance of the connection class owning this connection, return a
        result specialized for that instance. In the case of connections
        this specifically means names specified in a config instance will
        be visible instead of the default names for the connection.
        """
        # If inst is None, this is being accessed by the class and not an
        # instance, return this connection itself
        if inst is None:
            return self
        # If no object cache exists, create one to track the instances this
        # connection has been accessed by
        if not hasattr(inst, "_connectionCache"):
            object.__setattr__(inst, "_connectionCache", {})
        # Look up an existing cached instance
        idSelf = id(self)
        if idSelf in inst._connectionCache:
            return inst._connectionCache[idSelf]
        # Accumulate the parameters that define this connection
        params = {}
        for field in dataclasses.fields(self):
            params[field.name] = getattr(self, field.name)
        # Get the name override defined by the instance of the connection class
        params["name"] = inst._nameOverrides[self.varName]
        # Return a new instance of this connection specialized with the
        # information provided by the connection class instance
        return inst._connectionCache.setdefault(idSelf, self.__class__(**params))


    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: Optional[Union[StorageClass, str]] = None
    ) -> DatasetType:
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass
        )

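# Illustrative sketch (not part of the original module): how connections declared as
# class attributes are typically used, and how the descriptor above surfaces config
# name overrides. ``PipelineTaskConnections``, ``PipelineTaskConfig``, and the
# ``connections`` sub-config are part of pipe_base; the task, dataset names, and
# dimensions below are hypothetical.
#
#     import lsst.pipe.base as pipeBase
#     import lsst.pipe.base.connectionTypes as cT
#
#     class ExampleConnections(
#         pipeBase.PipelineTaskConnections, dimensions=("instrument", "visit", "detector")
#     ):
#         exposure = cT.Input(
#             doc="Calibrated exposure to process.",
#             name="calexp",
#             storageClass="ExposureF",
#             dimensions=("instrument", "visit", "detector"),
#         )
#
#     class ExampleConfig(pipeBase.PipelineTaskConfig, pipelineConnections=ExampleConnections):
#         pass
#
#     config = ExampleConfig()
#     config.connections.exposure = "calexp_alternate"  # override the dataset type name
#     connections = ExampleConnections(config=config)
#     assert connections.exposure.name == "calexp_alternate"  # __get__ applies the override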

@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring PipelineTask connections that include
    dimensions.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The dimensions (as defined in the `lsst.daf.butler.Registry`) used to
        identify the dataset type given by the specified name.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False` (default)
        otherwise.
    """

    dimensions: typing.Iterable[str] = ()
    isCalibration: bool = False


    def __post_init__(self):
        if isinstance(self.dimensions, str):
            raise TypeError(
                "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma"
            )
        if not isinstance(self.dimensions, typing.Iterable):
            raise TypeError("Dimensions must be iterable of dimensions")
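# Illustrative sketch (not part of the original module): the check above guards against
# a common mistake where a one-element tuple loses its trailing comma; the dataset name
# and storage class below are hypothetical.
#
#     DimensionedConnection(name="calexp", storageClass="ExposureF", dimensions=("visit",))  # ok
#     DimensionedConnection(name="calexp", storageClass="ExposureF", dimensions=("visit"))   # TypeError: got str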


    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: Optional[Union[StorageClass, str]] = None
    ) -> DatasetType:
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name,
            universe.extract(self.dimensions),
            self.storageClass,
            isCalibration=self.isCalibration,
            parentStorageClass=parentStorageClass,
        )

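# Illustrative sketch (not part of the original module): turning a dimensioned connection
# into a `DatasetType` against a real dimension universe. The repository path and dataset
# name are hypothetical.
#
#     from lsst.daf.butler import Butler
#     import lsst.pipe.base.connectionTypes as cT
#
#     connection = cT.Output(
#         doc="Example coadded image.",
#         name="exampleCoadd",
#         storageClass="ExposureF",
#         dimensions=("tract", "patch", "band", "skymap"),
#     )
#     butler = Butler("/path/to/repo")
#     datasetType = connection.makeDatasetType(butler.registry.dimensions)
#     # datasetType.dimensions is now normalized against the repository's universe.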

@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Class used for declaring PipelineTask input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The dimensions (as defined in the `lsst.daf.butler.Registry`) used to
        identify the dataset type given by the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.
    NotImplementedError
        Raised if ``minimum`` is zero for a regular `Input` connection; this
        is not currently supported by our QuantumGraph generation algorithm.
    """

    deferLoad: bool = False
    minimum: int = 1


    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum > 1 and not self.multiple:
            raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")


@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum == 0:
            raise TypeError(f"Cannot set minimum={self.minimum} for regular input.")

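# Illustrative sketch (not part of the original module): the ``minimum``/``multiple``
# constraints enforced by the two ``__post_init__`` checks above. Dataset names,
# storage classes, and dimensions are hypothetical.
#
#     import lsst.pipe.base.connectionTypes as cT
#
#     # Fine: a multiple input that requires at least two datasets per quantum.
#     coadds = cT.Input(
#         doc="Input coadds.",
#         name="deepCoadd",
#         storageClass="ExposureF",
#         dimensions=("tract", "patch", "band", "skymap"),
#         multiple=True,
#         minimum=2,
#     )
#
#     # TypeError: cannot require more than one dataset when multiple=False.
#     cT.Input(doc="", name="calexp", storageClass="ExposureF", multiple=False, minimum=2)
#
#     # TypeError: minimum=0 (an optional input) is not supported for regular Input connections.
#     cT.Input(doc="", name="calexp", storageClass="ExposureF", minimum=0)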

@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
    """Class used for declaring PipelineTask prerequisite connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The dimensions (as defined in the `lsst.daf.butler.Registry`) used to
        identify the dataset type given by the specified name.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises
        `FileNotFoundError` (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they are
        compatible with this minimum.
    lookupFunction : `typing.Callable`, optional
        An optional callable that will look up prerequisite inputs using the
        `DatasetType`, registry, quantum data ID, and input collections passed
        to it. If no function is specified, the default temporal-spatial
        lookup will be used.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.

    Notes
    -----
    Prerequisite inputs are used for datasets that must exist in the data
    repository before a pipeline including this task is run; they cannot be
    produced by another task in the same pipeline.

    In exchange for this limitation, they have a number of advantages relative
    to regular `Input` connections:

    - The query used to find them during `QuantumGraph` generation can be
      fully customized by providing a ``lookupFunction``.
    - Failed searches for prerequisites during `QuantumGraph` generation will
      usually generate more helpful diagnostics than those for regular `Input`
      connections.
    - The default query for prerequisite inputs relates the quantum dimensions
      directly to the dimensions of its dataset type, without being constrained
      by any of the other dimensions in the pipeline. This allows them to be
      used for temporal calibration lookups (which regular `Input` connections
      cannot do at present) and to work around `QuantumGraph` generation
      limitations involving cases where naive spatial overlap relationships
      between dimensions are not desired (e.g. a task that wants all detectors
      in each visit for which the visit overlaps a tract, not just those where
      that detector+visit combination overlaps the tract).
    - Prerequisite inputs may be optional (regular inputs are never optional).
    """

    lookupFunction: Optional[
        Callable[[DatasetType, Registry, DataCoordinate, Sequence[str]], Iterable[DatasetRef]]
    ] = None

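# Illustrative sketch (not part of the original module): a custom lookup function matching
# the ``lookupFunction`` signature above. The reference-catalog dataset name is hypothetical
# and the query shown is only one possible lookup strategy.
#
#     import lsst.pipe.base.connectionTypes as cT
#
#     def lookupReferenceCatalog(datasetType, registry, dataId, collections):
#         # Find all catalog shards consistent with this quantum's data ID.
#         return registry.queryDatasets(datasetType, collections=collections, dataId=dataId)
#
#     refCat = cT.PrerequisiteInput(
#         doc="Reference catalog shards overlapping the quantum region.",
#         name="example_refcat",
#         storageClass="SimpleCatalog",
#         dimensions=("skypix",),
#         deferLoad=True,
#         multiple=True,
#         lookupFunction=lookupReferenceCatalog,
#     )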

@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    pass


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    pass


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    pass
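# Illustrative sketch (not part of the original module): init-level connections are
# exchanged when the task is constructed rather than per quantum (e.g. catalog schemas),
# while Output connections are written per quantum. Names and storage classes below are
# hypothetical.
#
#     import lsst.pipe.base.connectionTypes as cT
#
#     inputSchema = cT.InitInput(
#         doc="Schema of the input source catalogs.",
#         name="example_src_schema",
#         storageClass="SourceCatalog",
#     )
#     outputCatalog = cT.Output(
#         doc="Per-detector output catalog.",
#         name="exampleOutputCatalog",
#         storageClass="SourceCatalog",
#         dimensions=("instrument", "visit", "detector"),
#     )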