Coverage for python/lsst/pipe/base/connectionTypes.py: 80%

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module defining connection types to be used within a
`PipelineTaskConnections` class.
"""

__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"]

import dataclasses
from collections.abc import Callable, Iterable, Sequence
from typing import ClassVar

from lsst.daf.butler import DataCoordinate, DatasetRef, DatasetType, DimensionUniverse, Registry, StorageClass



@dataclasses.dataclass(frozen=True)
class BaseConnection:
    """Base class used for declaring PipelineTask connections.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`, optional
        A description of the purpose of this connection.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple
        objects of the given dataset type. Tasks with more than one
        connection with ``multiple=True`` with the same dimensions may want
        to implement `PipelineTaskConnections.adjustQuantum` to ensure those
        datasets are consistent (i.e. zip-iterable) in
        `PipelineTask.runQuantum` and notify the execution system as early
        as possible of outputs that will not be produced because the
        corresponding input is missing.
    """

    name: str
    storageClass: str
    doc: str = ""
    multiple: bool = False

    _connection_type_set: ClassVar[str]

    def __get__(self, inst, klass):
        """Descriptor access method.

        This method turns a connection into a descriptor. When a connection
        is added to a connection class, it is a class-level variable. This
        method makes accessing the connection on an instance of the owning
        connection class return a result specialized for that instance:
        names specified in a config instance are visible instead of the
        default names for the connection, and removed connections are not
        accessible on the instance.
        """
        # If inst is None, this is being accessed by the class and not an
        # instance; return this connection itself.
        if inst is None:
            return self
        # Attempt to return the configured connection object from the
        # connections instance's allConnections mapping.
        try:
            return inst.allConnections[self.varName]
        except KeyError:
            raise AttributeError(
                f"Connection {self.varName!r} of {klass.__name__} has been removed."
            ) from None
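
    # Illustrative sketch (class, config, and dataset names here are
    # hypothetical) of the descriptor behavior described above: class access
    # returns the connection itself, while instance access returns the
    # configured copy, so a rename applied through the task config becomes
    # visible on the instance.
    #
    #     class MyConnections(PipelineTaskConnections, dimensions=("visit",)):
    #         calexp = Input(
    #             name="calexp",
    #             storageClass="ExposureF",
    #             doc="Input calibrated exposure.",
    #             dimensions=("instrument", "visit", "detector"),
    #         )
    #
    #     config.connections.calexp = "goodSeeingCalexp"
    #     connections = MyConnections(config=config)
    #     MyConnections.calexp.name  # "calexp": class access returns self
    #     connections.calexp.name    # "goodSeeingCalexp": configured copy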


    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None
    ) -> DatasetType:
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass
        )
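
    # Usage sketch (dataset name is illustrative): a connection without
    # dimensions always yields a DatasetType with empty dimensions.
    # DimensionUniverse() is the default universe; real code would usually
    # take it from a butler or registry.
    #
    #     init_output = InitOutput(name="outputSchema", storageClass="SourceCatalog")
    #     dataset_type = init_output.makeDatasetType(DimensionUniverse())
    #     dataset_type.dimensions  # empty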



@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring PipelineTask connections that include
    dimensions.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple
        objects of the given dataset type. Tasks with more than one
        connection with ``multiple=True`` with the same dimensions may want
        to implement `PipelineTaskConnections.adjustQuantum` to ensure those
        datasets are consistent (i.e. zip-iterable) in
        `PipelineTask.runQuantum` and notify the execution system as early
        as possible of outputs that will not be produced because the
        corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type with the given name.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False` (default)
        otherwise.
    """

    dimensions: Iterable[str] = ()
    isCalibration: bool = False

    def __post_init__(self):
        if isinstance(self.dimensions, str):
            raise TypeError(
                "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma"
            )
        if not isinstance(self.dimensions, Iterable):
            raise TypeError("Dimensions must be iterable of dimensions")
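
    # The checks above guard against a common pitfall (a sketch; names are
    # illustrative): omitting the trailing comma turns an intended
    # one-element tuple into a plain str, which is itself an iterable of
    # characters and would otherwise pass silently.
    #
    #     Output(name="coadd", storageClass="ExposureF", dimensions=("tract"))   # str! raises
    #     Output(name="coadd", storageClass="ExposureF", dimensions=("tract",))  # correct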


    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None
    ) -> DatasetType:
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name,
            universe.extract(self.dimensions),
            self.storageClass,
            isCalibration=self.isCalibration,
            parentStorageClass=parentStorageClass,
        )
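
    # Usage sketch (dataset and dimension names are illustrative): the
    # string dimension names declared on the connection are normalized
    # against the universe by ``universe.extract`` above.
    #
    #     conn = Output(
    #         name="deepCoadd",
    #         storageClass="ExposureF",
    #         dimensions=("tract", "patch", "band"),
    #     )
    #     dataset_type = conn.makeDatasetType(DimensionUniverse())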



@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Class used for declaring PipelineTask input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple
        objects of the given dataset type. Tasks with more than one
        connection with ``multiple=True`` with the same dimensions may want
        to implement `PipelineTaskConnections.adjustQuantum` to ensure those
        datasets are consistent (i.e. zip-iterable) in
        `PipelineTask.runQuantum` and notify the execution system as early
        as possible of outputs that will not be produced because the
        corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type with the given name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound`
        if the minimum is not met for `Input` connections (causing the
        quantum to be pruned, skipped, or never created, depending on the
        context), and `FileNotFoundError` for `PrerequisiteInput` connections
        (causing QuantumGraph generation to fail). `PipelineTask`
        implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they
        are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        (for regular `Input` connections) if ``minimum`` is zero, which is
        not currently supported by our QuantumGraph generation algorithm.
    """

    deferLoad: bool = False
    minimum: int = 1

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum > 1 and not self.multiple:
            raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")
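
    # For example, this declaration raises TypeError at construction time,
    # because a connection holding a single dataset cannot require two of
    # them (dataset name is illustrative):
    #
    #     BaseInput(
    #         name="calexp",
    #         storageClass="ExposureF",
    #         dimensions=("instrument", "visit", "detector"),
    #         minimum=2,  # multiple defaults to False -> TypeError
    #     )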



@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
    """Class used for declaring PipelineTask input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple
        objects of the given dataset type. Tasks with more than one
        connection with ``multiple=True`` with the same dimensions may want
        to implement `PipelineTaskConnections.adjustQuantum` to ensure those
        datasets are consistent (i.e. zip-iterable) in
        `PipelineTask.runQuantum` and notify the execution system as early
        as possible of outputs that will not be produced because the
        corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type with the given name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound`
        if the minimum is not met for `Input` connections (causing the
        quantum to be pruned, skipped, or never created, depending on the
        context), and `FileNotFoundError` for `PrerequisiteInput` connections
        (causing QuantumGraph generation to fail). `PipelineTask`
        implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they
        are compatible with this minimum.
    deferGraphConstraint : `bool`, optional
        If `True`, do not include this dataset type's existence in the
        initial query that starts the QuantumGraph generation process. This
        can be used to make QuantumGraph generation faster by avoiding
        redundant datasets, and in certain cases it can (along with careful
        attention to which tasks are included in the same QuantumGraph) be
        used to work around the QuantumGraph generation algorithm's
        inflexible handling of spatial overlaps. This option has no effect
        when the connection is not an overall input of the pipeline (or
        subset thereof) for which a graph is being created, and it never
        affects the ordering of quanta.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero, which is not currently supported for regular
        `Input` connections by our QuantumGraph generation algorithm.
    """

    deferGraphConstraint: bool = False

    _connection_type_set: ClassVar[str] = "inputs"

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum == 0:
            raise TypeError(f"Cannot set minimum={self.minimum} for regular input.")
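
    # Sketch of the deferLoad behavior documented above: inside
    # PipelineTask.runQuantum, a deferLoad=True connection yields
    # DeferredDatasetHandle objects whose .get() performs the actual read
    # (connection and variable names are illustrative):
    #
    #     handles = butlerQC.get(inputRefs.calexps)  # handles, not exposures
    #     for handle in handles:
    #         exposure = handle.get()  # I/O happens here, on demand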

295 

296 

297@dataclasses.dataclass(frozen=True) 

298class PrerequisiteInput(BaseInput): 

299 """Class used for declaring PipelineTask prerequisite connections 

300 

301 Parameters 

302 ---------- 

303 name : `str` 

304 The default name used to identify the dataset type 

305 storageClass : `str` 

306 The storage class used when (un)/persisting the dataset type 

307 multiple : `bool` 

308 Indicates if this connection should expect to contain multiple objects 

309 of the given dataset type. Tasks with more than one connection with 

310 ``multiple=True`` with the same dimensions may want to implement 

311 `PipelineTaskConnections.adjustQuantum` to ensure those datasets are 

312 consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify 

313 the execution system as early as possible of outputs that will not be 

314 produced because the corresponding input is missing. 

315 dimensions : iterable of `str` 

316 The `lsst.daf.butler.Butler` `lsst.daf.butler.Registry` dimensions used 

317 to identify the dataset type identified by the specified name 

318 minimum : `bool` 

319 Minimum number of datasets required for this connection, per quantum. 

320 This is checked in the base implementation of 

321 `PipelineTaskConnections.adjustQuantum`, which raises 

322 `FileNotFoundError` (causing QuantumGraph generation to fail). 

323 `PipelineTask` implementations may 

324 provide custom `~PipelineTaskConnections.adjustQuantum` implementations 

325 for more fine-grained or configuration-driven constraints, as long as 

326 they are compatible with this minium. 

327 lookupFunction: `typing.Callable`, optional 

328 An optional callable function that will look up PrerequisiteInputs 

329 using the DatasetType, registry, quantum dataId, and input collections 

330 passed to it. If no function is specified, the default temporal spatial 

331 lookup will be used. 

332 

333 Raises 

334 ------ 

335 TypeError 

336 Raised if ``minimum`` is greater than one but ``multiple=False``. 

337 

338 Notes 

339 ----- 

340 Prerequisite inputs are used for datasets that must exist in the data 

341 repository before a pipeline including this is run; they cannot be produced 

342 by another task in the same pipeline. 

343 

344 In exchange for this limitation, they have a number of advantages relative 

345 to regular `Input` connections: 

346 

347 - The query used to find them then during `QuantumGraph` generation can be 

348 fully customized by providing a ``lookupFunction``. 

349 - Failed searches for prerequisites during `QuantumGraph` generation will 

350 usually generate more helpful diagnostics than those for regular `Input` 

351 connections. 

352 - The default query for prerequisite inputs relates the quantum dimensions 

353 directly to the dimensions of its dataset type, without being constrained 

354 by any of the other dimensions in the pipeline. This allows them to be 

355 used for temporal calibration lookups (which regular `Input` connections 

356 cannot do at present) and to work around `QuantumGraph` generation 

357 limitations involving cases where naive spatial overlap relationships 

358 between dimensions are not desired (e.g. a task that wants all detectors 

359 in each visit for which the visit overlaps a tract, not just those where 

360 that detector+visit combination overlaps the tract). 

361 - Prerequisite inputs may be optional (regular inputs are never optional). 

362 

363 """ 

364 

365 lookupFunction: Callable[ 

366 [DatasetType, Registry, DataCoordinate, Sequence[str]], Iterable[DatasetRef] 

367 ] | None = None 
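
    # A minimal sketch of a custom lookup function matching the signature
    # declared above; the query logic and all names here are illustrative,
    # not the default implementation.
    #
    #     def findRefcats(
    #         datasetType: DatasetType,
    #         registry: Registry,
    #         dataId: DataCoordinate,
    #         collections: Sequence[str],
    #     ) -> Iterable[DatasetRef]:
    #         return registry.queryDatasets(datasetType, collections=collections, dataId=dataId)
    #
    #     refCat = PrerequisiteInput(
    #         name="gaia_dr2",
    #         storageClass="SimpleCatalog",
    #         dimensions=("htm7",),
    #         multiple=True,
    #         lookupFunction=findRefcats,
    #     )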


    _connection_type_set: ClassVar[str] = "prerequisiteInputs"


@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    """Class used for declaring PipelineTask output connections."""

    _connection_type_set: ClassVar[str] = "outputs"


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    """Class used for declaring PipelineTask init-input connections; these
    datasets are consumed when the task is constructed rather than in
    `PipelineTask.runQuantum`, and have no dimensions.
    """

    _connection_type_set: ClassVar[str] = "initInputs"


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    """Class used for declaring PipelineTask init-output connections; these
    datasets are written when the task is constructed rather than in
    `PipelineTask.runQuantum`, and have no dimensions.
    """

    _connection_type_set: ClassVar[str] = "initOutputs"