Coverage for python/lsst/pipe/base/connectionTypes.py: 69%

64 statements  

coverage.py v6.4.1, created at 2022-06-29 10:30 +0000

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Module defining connection types to be used within a 

23`PipelineTaskConnections` class. 

24""" 

25 

26__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"] 

27 

28import dataclasses 

29import typing 

30from typing import Callable, Iterable, Optional, Union 

31 

32from lsst.daf.butler import ( 

33 CollectionSearch, 

34 DataCoordinate, 

35 DatasetRef, 

36 DatasetType, 

37 DimensionUniverse, 

38 Registry, 

39 StorageClass, 

40) 

41 

42 

43@dataclasses.dataclass(frozen=True) 

44class BaseConnection: 

45 """Base class used for declaring PipelineTask connections 

46 

47 Parameters 

48 ---------- 

49 name : `str` 

50 The name used to identify the dataset type 

51 storageClass : `str` 

52 The storage class used when (un)/persisting the dataset type 

53 multiple : `bool` 

54 Indicates if this connection should expect to contain multiple objects 

55 of the given dataset type. Tasks with more than one connection with 

56 ``multiple=True`` with the same dimensions may want to implement 

57 `PipelineTaskConnections.adjustQuantum` to ensure those datasets are 

58 consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify 

59 the execution system as early as possible of outputs that will not be 

60 produced because the corresponding input is missing. 

61 """ 

62 

63 name: str 

64 storageClass: str 

65 doc: str = "" 

66 multiple: bool = False 

67 

68 def __get__(self, inst, klass): 

69 """Descriptor method 

70 

71 This is a method used to turn a connection into a descriptor. 

72 When a connection is added to a connection class, it is a class level 

73 variable. This method makes accessing this connection, on the 

74 instance of the connection class owning this connection, return a 

75 result specialized for that instance. In the case of connections 

76 this specifically means names specified in a config instance will 

77 be visible instead of the default names for the connection. 

78 """ 

79 # If inst is None, this is being accessed by the class and not an 

80 # instance, return this connection itself 

81 if inst is None: 

82 return self 

83 # If no object cache exists, create one to track the instances this 

84 # connection has been accessed by 

85 if not hasattr(inst, "_connectionCache"): 

86 object.__setattr__(inst, "_connectionCache", {}) 

87 # Look up an existing cached instance 

88 idSelf = id(self) 

89 if idSelf in inst._connectionCache: 

90 return inst._connectionCache[idSelf] 

91 # Accumulate the parameters that define this connection 

92 params = {} 

93 for field in dataclasses.fields(self): 

94 params[field.name] = getattr(self, field.name) 

95 # Get the name override defined by the instance of the connection class 

96 params["name"] = inst._nameOverrides[self.varName] 

97 # Return a new instance of this connection specialized with the 

98 # information provided by the connection class instance 

99 return inst._connectionCache.setdefault(idSelf, self.__class__(**params)) 

100 

101 def makeDatasetType( 

102 self, universe: DimensionUniverse, parentStorageClass: Optional[Union[StorageClass, str]] = None 

103 ) -> DatasetType: 

104 """Construct a true `DatasetType` instance with normalized dimensions. 

105 

106 Parameters 

107 ---------- 

108 universe : `lsst.daf.butler.DimensionUniverse` 

109 Set of all known dimensions to be used to normalize the dimension 

110 names specified in config. 

111 parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional 

112 Parent storage class for component datasets; `None` otherwise. 

113 

114 Returns 

115 ------- 

116 datasetType : `DatasetType` 

117 The `DatasetType` defined by this connection. 

118 """ 

119 return DatasetType( 

120 self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass 

121 ) 
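# A hedged, illustrative sketch of the descriptor behaviour implemented in
# ``__get__`` above. The connections class, config object, and dataset-type
# names here are hypothetical, not part of this module:
#
#     from lsst.pipe.base import PipelineTaskConnections
#
#     class ExampleConnections(PipelineTaskConnections, dimensions=()):
#         catalog = Input(name="src", storageClass="SourceCatalog",
#                         dimensions=(), doc="Example input catalog.")
#
#     # Access on the class returns the declaration itself:
#     ExampleConnections.catalog.name      # "src"
#
#     # Access on an instance returns a cached copy whose ``name`` reflects
#     # any override taken from the task configuration:
#     connections = ExampleConnections(config=exampleConfig)
#     connections.catalog.name             # e.g. "src_renamed"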

122 

123 

124@dataclasses.dataclass(frozen=True) 

125class DimensionedConnection(BaseConnection): 

126 """Class used for declaring PipelineTask connections that includes 

127 dimensions 

128 

129 Parameters 

130 ---------- 

131 name : `str` 

132 The name used to identify the dataset type 

133 storageClass : `str` 

134 The storage class used when (un)/persisting the dataset type 

135 multiple : `bool` 

136 Indicates if this connection should expect to contain multiple objects 

137 of the given dataset type. Tasks with more than one connection with 

138 ``multiple=True`` with the same dimensions may want to implement 

139 `PipelineTaskConnections.adjustQuantum` to ensure those datasets are 

140 consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify 

141 the execution system as early as possible of outputs that will not be 

142 produced because the corresponding input is missing. 

143 dimensions : iterable of `str` 

144 The `lsst.daf.butler.Registry` dimensions used to identify the dataset 

145 type given by the specified name 

146 isCalibration : `bool`, optional 

147 `True` if this dataset type may be included in CALIBRATION-type 

148 collections to associate it with a validity range, `False` (default) 

149 otherwise. 

150 """ 

151 

152 dimensions: typing.Iterable[str] = () 

153 isCalibration: bool = False 

154 

155 def __post_init__(self): 

156 if isinstance(self.dimensions, str):    156 ↛ 157 (line 156 didn't jump to line 157, because the condition on line 156 was never true)

157 raise TypeError( 

158 "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma" 

159 ) 

160 if not isinstance(self.dimensions, typing.Iterable):    160 ↛ 161 (line 160 didn't jump to line 161, because the condition on line 160 was never true)

161 raise TypeError("Dimensions must be iterable of dimensions") 

162 

163 def makeDatasetType( 

164 self, universe: DimensionUniverse, parentStorageClass: Optional[Union[StorageClass, str]] = None 

165 ) -> DatasetType: 

166 """Construct a true `DatasetType` instance with normalized dimensions. 

167 

168 Parameters 

169 ---------- 

170 universe : `lsst.daf.butler.DimensionUniverse` 

171 Set of all known dimensions to be used to normalize the dimension 

172 names specified in config. 

173 parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional 

174 Parent storage class for component datasets; `None` otherwise. 

175 

176 Returns 

177 ------- 

178 datasetType : `DatasetType` 

179 The `DatasetType` defined by this connection. 

180 """ 

181 return DatasetType( 

182 self.name, 

183 universe.extract(self.dimensions), 

184 self.storageClass, 

185 isCalibration=self.isCalibration, 

186 parentStorageClass=parentStorageClass, 

187 ) 
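# A minimal sketch of building a normalized `DatasetType` from a dimensioned
# connection. The dimension and storage-class names are assumptions chosen for
# illustration, and constructing a default `DimensionUniverse` assumes a
# standard daf_butler configuration is available:
#
#     universe = DimensionUniverse()
#     coadd = DimensionedConnection(
#         name="deepCoadd",
#         storageClass="ExposureF",
#         doc="Example coadded image.",
#         dimensions=("tract", "patch", "band"),
#     )
#     datasetType = coadd.makeDatasetType(universe)
#
# Note that ``dimensions=("tract")`` without the trailing comma is a plain
# `str` and is rejected by ``__post_init__`` above; write ``("tract",)``.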

188 

189 

190@dataclasses.dataclass(frozen=True) 

191class BaseInput(DimensionedConnection): 

192 """Class used for declaring PipelineTask input connections 

193 

194 Parameters 

195 ---------- 

196 name : `str` 

197 The default name used to identify the dataset type 

198 storageClass : `str` 

199 The storage class used when (un)/persisting the dataset type 

200 multiple : `bool` 

201 Indicates if this connection should expect to contain multiple objects 

202 of the given dataset type. Tasks with more than one connection with 

203 ``multiple=True`` with the same dimensions may want to implement 

204 `PipelineTaskConnections.adjustQuantum` to ensure those datasets are 

205 consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify 

206 the execution system as early as possible of outputs that will not be 

207 produced because the corresponding input is missing. 

208 dimensions : iterable of `str` 

209 The `lsst.daf.butler.Registry` dimensions used to identify the dataset 

210 type given by the specified name 

211 deferLoad : `bool` 

212 Indicates that this dataset type will be loaded as a 

213 `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this 

214 object to load the object at a later time. 

215 minimum : `int` 

216 Minimum number of datasets required for this connection, per quantum. 

217 This is checked in the base implementation of 

218 `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if 

219 the minimum is not met for `Input` connections (causing the quantum to 

220 be pruned, skipped, or never created, depending on the context), and 

221 `FileNotFoundError` for `PrerequisiteInput` connections (causing 

222 QuantumGraph generation to fail). `PipelineTask` implementations may 

223 provide custom `~PipelineTaskConnections.adjustQuantum` implementations 

224 for more fine-grained or configuration-driven constraints, as long as 

225 they are compatible with this minimum. 

226 

227 Raises 

228 ------ 

229 TypeError 

230 Raised if ``minimum`` is greater than one but ``multiple=False``. 

231 NotImplementedError 

232 Raised if ``minimum`` is zero for a regular `Input` connection; this 

233 is not currently supported by our QuantumGraph generation algorithm. 

234 """ 

235 

236 deferLoad: bool = False 

237 minimum: int = 1 

238 

239 def __post_init__(self) -> None: 

240 super().__post_init__() 

241 if self.minimum > 1 and not self.multiple:    241 ↛ 242 (line 241 didn't jump to line 242, because the condition on line 241 was never true)

242 raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.") 

243 

244 

245@dataclasses.dataclass(frozen=True) 

246class Input(BaseInput): 

247 def __post_init__(self) -> None: 

248 super().__post_init__() 

249 if self.minimum == 0:    249 ↛ 250 (line 249 didn't jump to line 250, because the condition on line 249 was never true)

250 raise TypeError(f"Cannot set minimum={self.minimum} for regular input.") 
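# A hedged sketch of the ``minimum``/``multiple`` validation above; the
# dataset-type name, storage class, and dimensions are illustrative
# assumptions:
#
#     Input(name="calexp", storageClass="ExposureF",
#           dimensions=("visit", "detector"),
#           multiple=True, minimum=2)   # accepted
#     Input(name="calexp", storageClass="ExposureF",
#           dimensions=("visit", "detector"),
#           minimum=2)                  # rejected: minimum > 1 with multiple=False
#     Input(name="calexp", storageClass="ExposureF",
#           dimensions=("visit", "detector"),
#           minimum=0)                  # rejected: regular inputs may not be optional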

251 

252 

253@dataclasses.dataclass(frozen=True) 

254class PrerequisiteInput(BaseInput): 

255 """Class used for declaring PipelineTask prerequisite connections 

256 

257 Parameters 

258 ---------- 

259 name : `str` 

260 The default name used to identify the dataset type 

261 storageClass : `str` 

262 The storage class used when (un)/persisting the dataset type 

263 multiple : `bool` 

264 Indicates if this connection should expect to contain multiple objects 

265 of the given dataset type. Tasks with more than one connection with 

266 ``multiple=True`` with the same dimensions may want to implement 

267 `PipelineTaskConnections.adjustQuantum` to ensure those datasets are 

268 consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify 

269 the execution system as early as possible of outputs that will not be 

270 produced because the corresponding input is missing. 

271 dimensions : iterable of `str` 

272 The `lsst.daf.butler.Registry` dimensions used to identify the dataset 

273 type given by the specified name 

274 minimum : `int` 

275 Minimum number of datasets required for this connection, per quantum. 

276 This is checked in the base implementation of 

277 `PipelineTaskConnections.adjustQuantum`, which raises 

278 `FileNotFoundError` (causing QuantumGraph generation to fail). 

279 `PipelineTask` implementations may 

280 provide custom `~PipelineTaskConnections.adjustQuantum` implementations 

281 for more fine-grained or configuration-driven constraints, as long as 

282 they are compatible with this minimum. 

283 lookupFunction : `typing.Callable`, optional 

284 An optional callable function that will look up PrerequisiteInputs 

285 using the DatasetType, registry, quantum dataId, and input collections 

286 passed to it. If no function is specified, the default temporal spatial 

287 lookup will be used. 

288 

289 Raises 

290 ------ 

291 TypeError 

292 Raised if ``minimum`` is greater than one but ``multiple=False``. 

293 

294 Notes 

295 ----- 

296 Prerequisite inputs are used for datasets that must exist in the data 

297 repository before a pipeline including this task is run; they cannot be produced 

298 by another task in the same pipeline. 

299 

300 In exchange for this limitation, they have a number of advantages relative 

301 to regular `Input` connections: 

302 

303 - The query used to find them during `QuantumGraph` generation can be 

304 fully customized by providing a ``lookupFunction``. 

305 - Failed searches for prerequisites during `QuantumGraph` generation will 

306 usually generate more helpful diagnostics than those for regular `Input` 

307 connections. 

308 - The default query for prerequisite inputs relates the quantum dimensions 

309 directly to the dimensions of its dataset type, without being constrained 

310 by any of the other dimensions in the pipeline. This allows them to be 

311 used for temporal calibration lookups (which regular `Input` connections 

312 cannot do at present) and to work around `QuantumGraph` generation 

313 limitations involving cases where naive spatial overlap relationships 

314 between dimensions are not desired (e.g. a task that wants all detectors 

315 in each visit for which the visit overlaps a tract, not just those where 

316 that detector+visit combination overlaps the tract). 

317 - Prerequisite inputs may be optional (regular inputs are never optional). 

318 

319 """ 

320 

321 lookupFunction: Optional[ 

322 Callable[[DatasetType, Registry, DataCoordinate, CollectionSearch], Iterable[DatasetRef]] 

323 ] = None 
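# A hedged sketch of a custom ``lookupFunction`` matching the signature
# declared above. The Registry query shown is only one plausible
# implementation; real lookup functions would encode task-specific logic:
#
#     def exampleLookup(datasetType: DatasetType, registry: Registry,
#                       quantumDataId: DataCoordinate,
#                       collections: CollectionSearch) -> Iterable[DatasetRef]:
#         # Ignore the quantum data ID and return all matching datasets,
#         # resolving duplicates by collection order (findFirst=True).
#         return registry.queryDatasets(datasetType, collections=collections,
#                                       findFirst=True)
#
#     flat = PrerequisiteInput(name="flat", storageClass="ExposureF",
#                              dimensions=("instrument", "detector", "physical_filter"),
#                              isCalibration=True,
#                              lookupFunction=exampleLookup,
#                              doc="Flat field to apply.")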

324 

325 

326@dataclasses.dataclass(frozen=True) 

327class Output(DimensionedConnection): 

328 pass 

329 

330 

331@dataclasses.dataclass(frozen=True) 

332class InitInput(BaseConnection): 

333 pass 

334 

335 

336@dataclasses.dataclass(frozen=True) 

337class InitOutput(BaseConnection): 

338 pass
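# End-to-end illustrative sketch: a hypothetical connections class combining
# the connection types defined in this module. All dataset-type names, storage
# classes, and dimensions below are assumptions chosen for illustration;
# `PipelineTaskConnections` itself is defined in lsst.pipe.base.
#
#     from lsst.pipe.base import PipelineTaskConnections
#
#     class ExampleTaskConnections(PipelineTaskConnections,
#                                  dimensions=("instrument", "visit", "detector")):
#         inputSchema = InitInput(name="src_schema", storageClass="SourceCatalog",
#                                 doc="Schema of the input catalogs.")
#         outputSchema = InitOutput(name="measured_src_schema",
#                                   storageClass="SourceCatalog",
#                                   doc="Schema of the output catalogs.")
#         exposure = Input(name="calexp", storageClass="ExposureF",
#                          dimensions=("instrument", "visit", "detector"),
#                          doc="Calibrated exposure to measure.")
#         camera = PrerequisiteInput(name="camera", storageClass="Camera",
#                                    dimensions=("instrument",),
#                                    isCalibration=True,
#                                    doc="Camera geometry.")
#         catalog = Output(name="measured_src", storageClass="SourceCatalog",
#                          dimensions=("instrument", "visit", "detector"),
#                          doc="Output measurement catalog.")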