Coverage for python/lsst/pipe/base/connectionTypes.py: 69%


# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module defining connection types to be used within a
`PipelineTaskConnections` class.
"""

__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"]

import dataclasses
import typing
from typing import Callable, Iterable, Optional

from lsst.daf.butler import (
    CollectionSearch,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionUniverse,
    Registry,
    StorageClass,
)


@dataclasses.dataclass(frozen=True)
class BaseConnection:
    """Base class used for declaring `PipelineTask` connections.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`
        A description of the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple
        objects of the given dataset type.  Tasks with more than one
        connection with ``multiple=True`` with the same dimensions may want
        to implement `PipelineTaskConnections.adjustQuantum` to ensure those
        datasets are consistent (i.e. zip-iterable) in
        `PipelineTask.runQuantum` and notify the execution system as early
        as possible of outputs that will not be produced because the
        corresponding input is missing.
    """

    name: str
    storageClass: str
    doc: str = ""
    multiple: bool = False

    def __get__(self, inst, klass):
        """Descriptor access method.

        This method turns a connection into a descriptor.  When a connection
        is added to a connection class, it is a class-level variable.  This
        method makes accessing the connection on an instance of the
        connection class owning it return a result specialized for that
        instance.  For connections this specifically means that names
        specified in a config instance will be visible instead of the
        default names for the connection.  A sketch of this behavior appears
        after the class definition below.
        """
        # If inst is None, this is being accessed by the class and not an
        # instance; return this connection itself.
        if inst is None:
            return self
        # If no object cache exists, create one to track the instances this
        # connection has been accessed by.
        if not hasattr(inst, "_connectionCache"):
            object.__setattr__(inst, "_connectionCache", {})
        # Look up an existing cached instance.
        idSelf = id(self)
        if idSelf in inst._connectionCache:
            return inst._connectionCache[idSelf]
        # Accumulate the parameters that define this connection.
        params = {}
        for field in dataclasses.fields(self):
            params[field.name] = getattr(self, field.name)
        # Get the name override defined by the instance of the connection
        # class.
        params["name"] = inst._nameOverrides[self.varName]
        # Return a new instance of this connection specialized with the
        # information provided by the connection class instance.
        return inst._connectionCache.setdefault(idSelf, self.__class__(**params))

    def makeDatasetType(self, universe: DimensionUniverse, parentStorageClass: Optional[StorageClass] = None):
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass
        )
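
# Illustrative sketch of the descriptor behavior implemented in
# `BaseConnection.__get__` above.  The connection and config classes and the
# dataset names here are hypothetical, but the pattern follows the usual
# PipelineTaskConnections/PipelineTaskConfig pairing:
#
#     class ExampleConnections(PipelineTaskConnections,
#                              dimensions=("instrument", "visit", "detector")):
#         calexp = Input(
#             doc="Input calibrated exposure.",
#             name="calexp",
#             storageClass="ExposureF",
#             dimensions=("instrument", "visit", "detector"),
#         )
#
#     class ExampleConfig(PipelineTaskConfig,
#                         pipelineConnections=ExampleConnections):
#         pass
#
#     config = ExampleConfig()
#     config.connections.calexp = "calexp_subset"  # override the default name
#     connections = ExampleConnections(config=config)
#     ExampleConnections.calexp.name  # "calexp": class access returns self
#     connections.calexp.name         # "calexp_subset": specialized copy
#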

@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring `PipelineTask` connections that include
    dimensions.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple
        objects of the given dataset type.  Tasks with more than one
        connection with ``multiple=True`` with the same dimensions may want
        to implement `PipelineTaskConnections.adjustQuantum` to ensure those
        datasets are consistent (i.e. zip-iterable) in
        `PipelineTask.runQuantum` and notify the execution system as early
        as possible of outputs that will not be produced because the
        corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type given by ``name``.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False` (default)
        otherwise.
    """

    dimensions: typing.Iterable[str] = ()
    isCalibration: bool = False

    def __post_init__(self):
        if isinstance(self.dimensions, str):
            raise TypeError(
                "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma"
            )
        if not isinstance(self.dimensions, typing.Iterable):
            raise TypeError("Dimensions must be iterable of dimensions")
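
    # The str check above guards against a common pitfall (hypothetical
    # example): a one-element tuple written without its trailing comma is
    # just a parenthesized str, so
    #
    #     Output(name="coadd", storageClass="ExposureF", dimensions=("tract"))
    #
    # passes dimensions="tract" and raises TypeError here, while
    # dimensions=("tract",) is the intended one-element iterable.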

    def makeDatasetType(self, universe: DimensionUniverse, parentStorageClass: Optional[StorageClass] = None):
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name,
            universe.extract(self.dimensions),
            self.storageClass,
            isCalibration=self.isCalibration,
            parentStorageClass=parentStorageClass,
        )
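
# Sketch of typical `makeDatasetType` usage (hypothetical repo path and
# connection; the DimensionUniverse would normally come from a butler
# registry):
#
#     from lsst.daf.butler import Butler
#
#     butler = Butler("/repo/example")
#     coadd = Output(
#         name="deepCoadd",
#         storageClass="ExposureF",
#         dimensions=("tract", "patch", "band", "skymap"),
#     )
#     datasetType = coadd.makeDatasetType(butler.registry.dimensions)
#     butler.registry.registerDatasetType(datasetType)
#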

@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Class used for declaring `PipelineTask` input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple
        objects of the given dataset type.  Tasks with more than one
        connection with ``multiple=True`` with the same dimensions may want
        to implement `PipelineTaskConnections.adjustQuantum` to ensure those
        datasets are consistent (i.e. zip-iterable) in
        `PipelineTask.runQuantum` and notify the execution system as early
        as possible of outputs that will not be produced because the
        corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type given by ``name``.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`.  PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound`
        if the minimum is not met for `Input` connections (causing the
        quantum to be pruned, skipped, or never created, depending on the
        context), and `FileNotFoundError` for `PrerequisiteInput` connections
        (causing QuantumGraph generation to fail).  `PipelineTask`
        implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they
        are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero for a regular `Input` connection; the latter
        is not currently supported by our QuantumGraph generation algorithm.
    """

    deferLoad: bool = False
    minimum: int = 1

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum > 1 and not self.multiple:
            raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")
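
# Hypothetical illustrations of the constraint enforced above: requiring at
# least two datasets only makes sense when the connection can hold more than
# one.
#
#     Input(name="warp", storageClass="ExposureF",
#           dimensions=("visit",), multiple=True, minimum=2)   # accepted
#     Input(name="warp", storageClass="ExposureF",
#           dimensions=("visit",), multiple=False, minimum=2)  # TypeError
#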

@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
    """Class used for declaring regular `PipelineTask` input connections."""

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum == 0:
            raise TypeError(f"Cannot set minimum={self.minimum} for regular input.")


@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
    """Class used for declaring `PipelineTask` prerequisite connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple
        objects of the given dataset type.  Tasks with more than one
        connection with ``multiple=True`` with the same dimensions may want
        to implement `PipelineTaskConnections.adjustQuantum` to ensure those
        datasets are consistent (i.e. zip-iterable) in
        `PipelineTask.runQuantum` and notify the execution system as early
        as possible of outputs that will not be produced because the
        corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type given by ``name``.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises
        `FileNotFoundError` (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they
        are compatible with this minimum.
    lookupFunction : `typing.Callable`, optional
        An optional callable function that will look up PrerequisiteInputs
        using the DatasetType, registry, quantum dataId, and input
        collections passed to it.  If no function is specified, the default
        spatial and temporal lookup will be used.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.

    Notes
    -----
    Prerequisite inputs are used for datasets that must exist in the data
    repository before a pipeline including this task is run; they cannot be
    produced by another task in the same pipeline.

    In exchange for this limitation, they have a number of advantages
    relative to regular `Input` connections:

    - The query used to find them during `QuantumGraph` generation can be
      fully customized by providing a ``lookupFunction``; a sketch of one
      follows the class definition below.
    - Failed searches for prerequisites during `QuantumGraph` generation
      will usually generate more helpful diagnostics than those for regular
      `Input` connections.
    - The default query for prerequisite inputs relates the quantum
      dimensions directly to the dimensions of its dataset type, without
      being constrained by any of the other dimensions in the pipeline.
      This allows them to be used for temporal calibration lookups (which
      regular `Input` connections cannot do at present) and to work around
      `QuantumGraph` generation limitations involving cases where naive
      spatial overlap relationships between dimensions are not desired
      (e.g. a task that wants all detectors in each visit for which the
      visit overlaps a tract, not just those where that detector+visit
      combination overlaps the tract).
    - Prerequisite inputs may be optional (regular inputs are never
      optional).
    """

    lookupFunction: Optional[
        Callable[[DatasetType, Registry, DataCoordinate, CollectionSearch], Iterable[DatasetRef]]
    ] = None
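
# Sketch of a custom lookup function matching the ``lookupFunction``
# signature above (hypothetical policy: find the dataset by instrument
# alone, ignoring the quantum's temporal extent):
#
#     def lookupStaticCalibration(datasetType, registry, quantumDataId,
#                                 collections):
#         return registry.queryDatasets(
#             datasetType,
#             collections=collections,
#             dataId=quantumDataId.subset(
#                 registry.dimensions.extract(["instrument"])
#             ),
#             findFirst=True,
#         )
#
#     camera = PrerequisiteInput(
#         name="camera",
#         storageClass="Camera",
#         dimensions=("instrument",),
#         lookupFunction=lookupStaticCalibration,
#     )
#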

@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    """Class used for declaring `PipelineTask` output connections."""


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    """Class used for declaring `PipelineTask` initialization input
    connections."""


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    """Class used for declaring `PipelineTask` initialization output
    connections."""
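
# Sketch of how init connections are typically declared (hypothetical
# names): unlike Input/Output, InitInput/InitOutput connections carry no
# dimensions and are read or written once per task initialization (catalog
# schemas are the common case), not once per quantum:
#
#     class ExampleConnections(PipelineTaskConnections, dimensions=()):
#         inputSchema = InitInput(
#             doc="Schema of the input source catalogs.",
#             name="src_schema",
#             storageClass="SourceCatalog",
#         )
#         outputSchema = InitOutput(
#             doc="Schema of the output deblended catalogs.",
#             name="deblendedCatalog_schema",
#             storageClass="SourceCatalog",
#         )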