Coverage for python/lsst/pipe/base/connectionTypes.py: 69%


# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module defining connection types to be used within a
`PipelineTaskConnections` class.
"""

__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput",
           "Output", "BaseConnection"]

import dataclasses
import typing
from typing import Callable, Iterable, Optional

from lsst.daf.butler import (
    CollectionSearch,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionUniverse,
    Registry,
    StorageClass,
)

@dataclasses.dataclass(frozen=True)
class BaseConnection:
    """Base class used for declaring `PipelineTask` connections.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`
        Documentation for this connection.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type.  Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    """
    name: str
    storageClass: str
    doc: str = ""
    multiple: bool = False

    def __get__(self, inst, klass):
        """Descriptor access method.

        This method turns a connection into a descriptor.  When a connection
        is added to a connection class, it is a class-level variable.  This
        method makes accessing the connection on an instance of the
        connection class return a result specialized for that instance.  For
        connections this specifically means that names specified in a config
        instance will be visible instead of the default names for the
        connection.
        """
        # If inst is None, this is being accessed by the class and not an
        # instance; return this connection itself.
        if inst is None:
            return self
        # If no object cache exists, create one to track the instances this
        # connection has been accessed by.
        if not hasattr(inst, '_connectionCache'):
            object.__setattr__(inst, '_connectionCache', {})
        # Look up an existing cached instance.
        idSelf = id(self)
        if idSelf in inst._connectionCache:
            return inst._connectionCache[idSelf]
        # Accumulate the parameters that define this connection.
        params = {}
        for field in dataclasses.fields(self):
            params[field.name] = getattr(self, field.name)
        # Get the name override defined by the instance of the connection
        # class.
        params['name'] = inst._nameOverrides[self.varName]
        # Return a new instance of this connection specialized with the
        # information provided by the connection class instance.
        return inst._connectionCache.setdefault(idSelf, self.__class__(**params))

    def makeDatasetType(self, universe: DimensionUniverse,
                        parentStorageClass: Optional[StorageClass] = None):
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
        """
        return DatasetType(self.name,
                           universe.empty,
                           self.storageClass,
                           parentStorageClass=parentStorageClass)
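
# Illustrative sketch (not part of this module): because `BaseConnection`
# implements `__get__`, a connection declared as a class attribute of a
# `PipelineTaskConnections` subclass acts as a descriptor.  Accessed on the
# class it returns the connection itself; accessed on an instance it returns
# a copy specialized with any dataset type name override from that instance's
# config.  The class, attribute, and config names below are hypothetical:
#
#     class ExampleConnections(PipelineTaskConnections, dimensions=()):
#         schema = InitInput(name="src_schema", storageClass="SourceCatalog",
#                            doc="Schema of the input catalogs.")
#
#     ExampleConnections.schema.name   # always the default, "src_schema"
#     connections = ExampleConnections(config=config)  # a task config instance
#     connections.schema.name          # reflects any override made in config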

@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring `PipelineTask` connections that include
    dimensions.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type.  Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type identified by the specified name.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False` (default)
        otherwise.
    """
    dimensions: typing.Iterable[str] = ()
    isCalibration: bool = False

    def __post_init__(self):
        if isinstance(self.dimensions, str):
            raise TypeError("Dimensions must be iterable of dimensions, got str, "
                            "possibly an omitted trailing comma")
        if not isinstance(self.dimensions, typing.Iterable):
            raise TypeError("Dimensions must be iterable of dimensions")

    def makeDatasetType(self, universe: DimensionUniverse,
                        parentStorageClass: Optional[StorageClass] = None):
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
        """
        return DatasetType(self.name,
                           universe.extract(self.dimensions),
                           self.storageClass, isCalibration=self.isCalibration,
                           parentStorageClass=parentStorageClass)
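
# Illustrative sketch (not part of this module): `makeDatasetType` normalizes
# the connection's string dimension names against a `DimensionUniverse`,
# typically obtained from a butler registry.  The dataset type name and
# dimensions below are hypothetical:
#
#     connection = DimensionedConnection(
#         name="calexp",
#         storageClass="ExposureF",
#         dimensions=("instrument", "visit", "detector"),
#     )
#     datasetType = connection.makeDatasetType(butler.registry.dimensions)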

@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Class used for declaring `PipelineTask` input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type.  Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type identified by the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`.  PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail).  `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero for a regular `Input` connection; the latter is
        not currently supported by our QuantumGraph generation algorithm.
    """
    deferLoad: bool = False
    minimum: int = 1

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum > 1 and not self.multiple:
            raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")
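
# Illustrative sketch (not part of this module): the consistency check above
# means a minimum greater than one is only accepted for connections that can
# hold multiple datasets.  The names below are hypothetical:
#
#     Input(name="srcCatalogs", storageClass="SourceCatalog",
#           dimensions=("instrument", "visit"), multiple=True, minimum=2)  # ok
#     Input(name="srcCatalog", storageClass="SourceCatalog",
#           dimensions=("instrument", "visit"), minimum=2)  # raises TypeError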

@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
    """Class used for declaring regular `PipelineTask` input connections.

    See `BaseInput` for parameter documentation.
    """

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum == 0:
            raise TypeError(f"Cannot set minimum={self.minimum} for regular input.")

@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
    """Class used for declaring `PipelineTask` prerequisite connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type.  Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type identified by the specified name.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises
        `FileNotFoundError` (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they are
        compatible with this minimum.
    lookupFunction : `typing.Callable`, optional
        An optional callable function that will look up PrerequisiteInputs
        using the DatasetType, registry, quantum dataId, and input collections
        passed to it.  If no function is specified, the default temporal-spatial
        lookup will be used.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.

    Notes
    -----
    Prerequisite inputs are used for datasets that must exist in the data
    repository before a pipeline including this connection is run; they cannot
    be produced by another task in the same pipeline.

    In exchange for this limitation, they have a number of advantages relative
    to regular `Input` connections:

    - The query used to find them during `QuantumGraph` generation can be
      fully customized by providing a ``lookupFunction``.
    - Failed searches for prerequisites during `QuantumGraph` generation will
      usually generate more helpful diagnostics than those for regular `Input`
      connections.
    - The default query for prerequisite inputs relates the quantum dimensions
      directly to the dimensions of its dataset type, without being constrained
      by any of the other dimensions in the pipeline.  This allows them to be
      used for temporal calibration lookups (which regular `Input` connections
      cannot do at present) and to work around `QuantumGraph` generation
      limitations involving cases where naive spatial overlap relationships
      between dimensions are not desired (e.g. a task that wants all detectors
      in each visit for which the visit overlaps a tract, not just those where
      that detector+visit combination overlaps the tract).
    - Prerequisite inputs may be optional (regular inputs are never optional).
    """
    lookupFunction: Optional[Callable[[DatasetType, Registry, DataCoordinate, CollectionSearch],
                                      Iterable[DatasetRef]]] = None
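
# Illustrative sketch (not part of this module): a custom ``lookupFunction``
# receives the dataset type, a registry, the quantum data ID, and the input
# collections, and returns the `DatasetRef`s to use for the quantum.  The
# function and dataset type names below are hypothetical, and the registry
# query shown is only one plausible implementation:
#
#     def lookupRefCats(datasetType: DatasetType, registry: Registry,
#                       dataId: DataCoordinate,
#                       collections: CollectionSearch) -> Iterable[DatasetRef]:
#         # Replace the default temporal/spatial lookup with a plain
#         # registry query against the given collections.
#         return registry.queryDatasets(datasetType, collections=collections,
#                                       dataId=dataId)
#
#     refCat = PrerequisiteInput(name="ref_cat", storageClass="SimpleCatalog",
#                                dimensions=("htm7",), multiple=True,
#                                lookupFunction=lookupRefCats)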

@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    """Class used for declaring `PipelineTask` output connections.

    See `DimensionedConnection` for parameter documentation.
    """


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    """Class used for declaring `PipelineTask` connections consumed at task
    initialization, before any quanta are processed.

    See `BaseConnection` for parameter documentation.
    """


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    """Class used for declaring `PipelineTask` connections produced at task
    initialization.

    See `BaseConnection` for parameter documentation.
    """
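
# Illustrative sketch (not part of this module): a connections class combining
# several of the connection types defined above.  The task, dataset type, and
# dimension choices below are hypothetical:
#
#     from lsst.pipe.base import PipelineTaskConnections
#     from lsst.pipe.base import connectionTypes as cT
#
#     class CharacterizeConnections(
#             PipelineTaskConnections,
#             dimensions=("instrument", "visit", "detector")):
#         exposure = cT.Input(
#             name="calexp",
#             storageClass="ExposureF",
#             doc="Calibrated exposure to characterize.",
#             dimensions=("instrument", "visit", "detector"),
#         )
#         camera = cT.PrerequisiteInput(
#             name="camera",
#             storageClass="Camera",
#             doc="Camera geometry; must already exist in the repository.",
#             dimensions=("instrument",),
#             isCalibration=True,
#         )
#         outputCatalog = cT.Output(
#             name="characterized_src",
#             storageClass="SourceCatalog",
#             doc="Catalog of measured sources.",
#             dimensions=("instrument", "visit", "detector"),
#         )
#         outputSchema = cT.InitOutput(
#             name="characterized_src_schema",
#             storageClass="SourceCatalog",
#             doc="Schema of the output catalog.",
#         )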