Coverage for python/lsst/pipe/base/connectionTypes.py: 69%

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module defining connection types to be used within a
`PipelineTaskConnections` class.
"""

__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"]

import dataclasses
from collections.abc import Callable, Iterable, Sequence
from typing import Optional, Union

from lsst.daf.butler import DataCoordinate, DatasetRef, DatasetType, DimensionUniverse, Registry, StorageClass


@dataclasses.dataclass(frozen=True)
class BaseConnection:
    """Base class used for declaring PipelineTask connections.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`
        A brief description of the connection, for documentation purposes.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple
        objects of the given dataset type. Tasks with more than one
        connection with ``multiple=True`` with the same dimensions may want
        to implement `PipelineTaskConnections.adjustQuantum` to ensure those
        datasets are consistent (i.e. zip-iterable) in
        `PipelineTask.runQuantum` and notify the execution system as early
        as possible of outputs that will not be produced because the
        corresponding input is missing.
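
    Examples
    --------
    A minimal sketch of declaring a connection inside a
    `PipelineTaskConnections` subclass (defined in `lsst.pipe.base`); the
    connections class and dataset type names here are hypothetical:

    >>> class CoaddConnections(
    ...     PipelineTaskConnections, dimensions=("tract", "patch", "band")
    ... ):
    ...     inputImages = Input(
    ...         doc="Input exposures to coadd.",
    ...         name="calexp",
    ...         storageClass="ExposureF",
    ...         dimensions=("instrument", "visit", "detector"),
    ...         multiple=True,
    ...     )

    Accessing ``inputImages`` through an *instance* of the connections
    class returns a copy of the connection specialized with any dataset
    type name override set via ``config.connections.inputImages``.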

    """

    name: str
    storageClass: str
    doc: str = ""
    multiple: bool = False

    def __get__(self, inst, klass):
        """Descriptor access method.

        This method turns a connection into a descriptor. When a
        connection is added to a connection class, it is a class-level
        variable; this method makes accessing the connection on an
        instance of the connection class return a result specialized for
        that instance. For connections this specifically means that names
        specified in a config instance will be visible instead of the
        default names for the connection.
        """
        # If inst is None, this is being accessed by the class and not an
        # instance; return this connection itself.
        if inst is None:
            return self
        # If no object cache exists, create one to track the instances this
        # connection has been accessed by.
        if not hasattr(inst, "_connectionCache"):
            object.__setattr__(inst, "_connectionCache", {})
        # Look up an existing cached instance.
        idSelf = id(self)
        if idSelf in inst._connectionCache:
            return inst._connectionCache[idSelf]
        # Accumulate the parameters that define this connection.
        params = {}
        for field in dataclasses.fields(self):
            params[field.name] = getattr(self, field.name)
        # Get the name override defined by the instance of the connection
        # class.
        params["name"] = inst._nameOverrides[self.varName]
        # Return a new instance of this connection specialized with the
        # information provided by the connection class instance.
        return inst._connectionCache.setdefault(idSelf, self.__class__(**params))

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: Optional[Union[StorageClass, str]] = None
    ) -> DatasetType:
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the
            dimension names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
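
        Examples
        --------
        A sketch of typical use, assuming a connection instance named
        ``connection`` and a `lsst.daf.butler.Butler` named ``butler``
        whose registry the dataset type should be registered with:

        >>> datasetType = connection.makeDatasetType(butler.registry.dimensions)
        >>> butler.registry.registerDatasetType(datasetType)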

        """
        return DatasetType(
            self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass
        )


@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring PipelineTask connections that include
    dimensions.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`
        A brief description of the connection, for documentation purposes.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple
        objects of the given dataset type. Tasks with more than one
        connection with ``multiple=True`` with the same dimensions may want
        to implement `PipelineTaskConnections.adjustQuantum` to ensure those
        datasets are consistent (i.e. zip-iterable) in
        `PipelineTask.runQuantum` and notify the execution system as early
        as possible of outputs that will not be produced because the
        corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type named by this connection.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False`
        (default) otherwise.
    """

    dimensions: Iterable[str] = ()
    isCalibration: bool = False

    def __post_init__(self):
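        # A bare string is itself iterable, so check for it explicitly to
        # catch the common mistake of writing dimensions=("visit") (a str,
        # because the trailing comma was omitted) instead of ("visit",).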

        if isinstance(self.dimensions, str):
            raise TypeError(
                "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma"
            )
        if not isinstance(self.dimensions, Iterable):
            raise TypeError("Dimensions must be iterable of dimensions")

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: Optional[Union[StorageClass, str]] = None
    ) -> DatasetType:
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the
            dimension names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name,
            universe.extract(self.dimensions),
            self.storageClass,
            isCalibration=self.isCalibration,
            parentStorageClass=parentStorageClass,
        )


@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Class used for declaring PipelineTask input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`
        A brief description of the connection, for documentation purposes.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple
        objects of the given dataset type. Tasks with more than one
        connection with ``multiple=True`` with the same dimensions may want
        to implement `PipelineTaskConnections.adjustQuantum` to ensure those
        datasets are consistent (i.e. zip-iterable) in
        `PipelineTask.runQuantum` and notify the execution system as early
        as possible of outputs that will not be produced because the
        corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type named by this connection.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use
        this handle to defer loading the dataset until it is needed.
    minimum : `int`
        Minimum number of datasets required for this connection, per
        quantum. This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound`
        if the minimum is not met for `Input` connections (causing the
        quantum to be pruned, skipped, or never created, depending on the
        context), and `FileNotFoundError` for `PrerequisiteInput`
        connections (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they
        are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``,
        or if ``minimum`` is zero for a regular `Input` connection; the
        latter is not currently supported by our QuantumGraph generation
        algorithm.
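
    Examples
    --------
    With ``deferLoad=True`` the task receives
    `lsst.daf.butler.DeferredDatasetHandle` objects rather than the
    datasets themselves, so loading can be postponed until the data are
    actually needed; a sketch inside a hypothetical ``run`` method:

    >>> def run(self, inputHandles):
    ...     for handle in inputHandles:
    ...         exposure = handle.get()  # loads the dataset on demand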

    """

    deferLoad: bool = False
    minimum: int = 1

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum > 1 and not self.multiple:
            raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")


@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
    """Class used for declaring PipelineTask input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`
        A brief description of the connection, for documentation purposes.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple
        objects of the given dataset type. Tasks with more than one
        connection with ``multiple=True`` with the same dimensions may want
        to implement `PipelineTaskConnections.adjustQuantum` to ensure those
        datasets are consistent (i.e. zip-iterable) in
        `PipelineTask.runQuantum` and notify the execution system as early
        as possible of outputs that will not be produced because the
        corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type named by this connection.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use
        this handle to defer loading the dataset until it is needed.
    minimum : `int`
        Minimum number of datasets required for this connection, per
        quantum. This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound`
        if the minimum is not met for `Input` connections (causing the
        quantum to be pruned, skipped, or never created, depending on the
        context), and `FileNotFoundError` for `PrerequisiteInput`
        connections (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they
        are compatible with this minimum.
    deferGraphConstraint : `bool`, optional
        If `True`, do not include this dataset type's existence in the
        initial query that starts the QuantumGraph generation process.
        This can be used to make QuantumGraph generation faster by
        avoiding redundant datasets, and in certain cases it can (along
        with careful attention to which tasks are included in the same
        QuantumGraph) be used to work around the QuantumGraph generation
        algorithm's inflexible handling of spatial overlaps. This option
        has no effect when the connection is not an overall input of the
        pipeline (or subset thereof) for which a graph is being created,
        and it never affects the ordering of quanta.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``,
        or if ``minimum`` is zero for a regular `Input` connection; the
        latter is not currently supported by our QuantumGraph generation
        algorithm.
    """

    deferGraphConstraint: bool = False

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum == 0:
            raise TypeError(f"Cannot set minimum={self.minimum} for regular input.")


@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
    """Class used for declaring PipelineTask prerequisite connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`
        A brief description of the connection, for documentation purposes.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple
        objects of the given dataset type. Tasks with more than one
        connection with ``multiple=True`` with the same dimensions may want
        to implement `PipelineTaskConnections.adjustQuantum` to ensure those
        datasets are consistent (i.e. zip-iterable) in
        `PipelineTask.runQuantum` and notify the execution system as early
        as possible of outputs that will not be produced because the
        corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type named by this connection.
    minimum : `int`
        Minimum number of datasets required for this connection, per
        quantum. This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises
        `FileNotFoundError` (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they
        are compatible with this minimum.
    lookupFunction : `~collections.abc.Callable`, optional
        An optional callable that will look up prerequisite inputs using
        the `DatasetType`, registry, quantum data ID, and input collections
        passed to it. If no function is specified, the default
        spatial/temporal lookup will be used.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.

    Notes
    -----
    Prerequisite inputs are used for datasets that must exist in the data
    repository before a pipeline including this task is run; they cannot
    be produced by another task in the same pipeline.

    In exchange for this limitation, they have a number of advantages
    relative to regular `Input` connections:

    - The query used to find them during `QuantumGraph` generation can be
      fully customized by providing a ``lookupFunction``.
    - Failed searches for prerequisites during `QuantumGraph` generation
      will usually generate more helpful diagnostics than those for
      regular `Input` connections.
    - The default query for prerequisite inputs relates the quantum
      dimensions directly to the dimensions of its dataset type, without
      being constrained by any of the other dimensions in the pipeline.
      This allows them to be used for temporal calibration lookups (which
      regular `Input` connections cannot do at present) and to work around
      `QuantumGraph` generation limitations involving cases where naive
      spatial overlap relationships between dimensions are not desired
      (e.g. a task that wants all detectors in each visit for which the
      visit overlaps a tract, not just those where that detector+visit
      combination overlaps the tract).
    - Prerequisite inputs may be optional (regular inputs are never
      optional).
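
    Examples
    --------
    A sketch of a custom ``lookupFunction`` matching the expected
    signature; the find-first query shown here is illustrative, not the
    default lookup behavior:

    >>> def customLookup(datasetType, registry, quantumDataId, collections):
    ...     # Return the first matching dataset per data ID, in collection
    ...     # search order, constrained by the quantum data ID.
    ...     return registry.queryDatasets(
    ...         datasetType,
    ...         collections=collections,
    ...         dataId=quantumDataId,
    ...         findFirst=True,
    ...     )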

    """

    lookupFunction: Optional[
        Callable[[DatasetType, Registry, DataCoordinate, Sequence[str]], Iterable[DatasetRef]]
    ] = None


@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    """Class used for declaring PipelineTask output connections."""


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    """Class used for declaring PipelineTask init-input connections.

    Init-input datasets are consumed when the task itself is constructed,
    and hence have no dimensions.
    """


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    """Class used for declaring PipelineTask init-output connections.

    Init-output datasets are produced during task construction and written
    once per run, and hence have no dimensions.
    """