Coverage for python/lsst/pipe/base/connectionTypes.py: 78%


# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Module defining connection types to be used within a
`PipelineTaskConnections` class.
"""

__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"]

import dataclasses
from collections.abc import Callable, Iterable, Sequence
from typing import ClassVar

from lsst.daf.butler import DataCoordinate, DatasetRef, DatasetType, DimensionUniverse, Registry, StorageClass
from lsst.utils.introspection import find_outside_stacklevel


@dataclasses.dataclass(frozen=True)
class BaseConnection:
    """Base class used for declaring `PipelineTask` connections.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`
        Documentation for this connection.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and
        notify the execution system as early as possible of outputs that will
        not be produced because the corresponding input is missing.
    deprecated : `str`, optional
        A description of why this connection is deprecated, including the
        version after which it may be removed.

        If not `None`, the string is appended to the docstring for this
        connection and the corresponding config Field.
    """

    name: str
    storageClass: str
    doc: str = ""
    multiple: bool = False
    deprecated: str | None = dataclasses.field(default=None, kw_only=True)

    _connection_type_set: ClassVar[str]
    _deprecation_context: str = ""

    def __post_init__(self):
        if self.deprecated and not self._deprecation_context:
            info = {}
            _ = find_outside_stacklevel("lsst.pipe.base", "dataclasses", stack_info=info)
            object.__setattr__(self, "_deprecation_context", f"{info['filename']}:{info['lineno']}")

    def __get__(self, inst, klass):
        """Descriptor access method.

        This method turns a connection into a descriptor. When a connection
        is added to a connection class, it is a class-level variable. This
        method makes accessing the connection on an instance of the
        connection class return a result specialized for that instance: names
        specified in a config instance are visible instead of the
        connection's default names, and removed connections are not
        accessible on the instance.
        """
        # If inst is None, this is being accessed by the class and not an
        # instance; return this connection itself.
        if inst is None:
            return self
        # Attempt to return the configured connection object from the
        # connections instance's allConnections mapping.
        try:
            return inst.allConnections[self.varName]
        except KeyError:
            raise AttributeError(
                f"Connection {self.varName!r} of {klass.__name__} has been removed."
            ) from None

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None
    ) -> DatasetType:
        """Construct a true `~lsst.daf.butler.DatasetType` instance with
        normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `~lsst.daf.butler.DatasetType`
            The `~lsst.daf.butler.DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass
        )
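# A minimal usage sketch, assuming a `~lsst.daf.butler.Butler` instance named
# ``butler`` is in scope (its ``dimensions`` property provides the
# `DimensionUniverse`); the dataset type name is illustrative. Because
# `BaseConnection` carries no dimensions, `makeDatasetType` produces a
# dataset type with an empty dimension set, as used for init-level datasets:
#
#     conn = InitOutput(name="someTask_schema", storageClass="SourceCatalog")
#     dataset_type = conn.makeDatasetType(butler.dimensions)
#     assert not dataset_type.dimensions  # init-level datasets are dimensionless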

@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring `PipelineTask` connections that include
    dimensions.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type with the specified name.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False` (default)
        otherwise.
    """

    dimensions: Iterable[str] = ()
    isCalibration: bool = False

    def __post_init__(self):
        super().__post_init__()
        if isinstance(self.dimensions, str):
            raise TypeError(
                "Dimensions must be an iterable of dimension names, got str; "
                "a trailing comma may have been omitted from a single-element tuple"
            )
        if not isinstance(self.dimensions, Iterable):
            raise TypeError("Dimensions must be an iterable of dimension names")

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None
    ) -> DatasetType:
        """Construct a true `~lsst.daf.butler.DatasetType` instance with
        normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `~lsst.daf.butler.DatasetType`
            The `~lsst.daf.butler.DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name,
            universe.extract(self.dimensions),
            self.storageClass,
            isCalibration=self.isCalibration,
            parentStorageClass=parentStorageClass,
        )
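# A minimal sketch of dimension normalization, assuming ``universe`` is a
# `DimensionUniverse` (e.g. from ``butler.dimensions``); the dataset type
# name and dimensions below are illustrative:
#
#     conn = Output(
#         name="calexp",
#         storageClass="ExposureF",
#         dimensions=("instrument", "visit", "detector"),
#     )
#     dataset_type = conn.makeDatasetType(universe)
#     # The str dimension names are normalized against the universe, and
#     # isCalibration is carried through to the resulting DatasetType.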

@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Class used for declaring `PipelineTask` input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type with the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum`
        implementations for more fine-grained or configuration-driven
        constraints, as long as they are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero for a regular `Input` connection; the latter
        is not currently supported by our QuantumGraph generation algorithm.
    """

    deferLoad: bool = False
    minimum: int = 1

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum > 1 and not self.multiple:
            raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")


@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
    """Class used for declaring `PipelineTask` input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type with the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum`
        implementations for more fine-grained or configuration-driven
        constraints, as long as they are compatible with this minimum.
    deferGraphConstraint : `bool`, optional
        If `True`, do not include this dataset type's existence in the initial
        query that starts the QuantumGraph generation process. This can be
        used to make QuantumGraph generation faster by avoiding redundant
        datasets, and in certain cases it can (along with careful attention to
        which tasks are included in the same QuantumGraph) be used to work
        around the QuantumGraph generation algorithm's inflexible handling of
        spatial overlaps. This option has no effect when the connection is
        not an overall input of the pipeline (or subset thereof) for which a
        graph is being created, and it never affects the ordering of quanta.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero; the latter is not currently supported by our
        QuantumGraph generation algorithm.
    """

    deferGraphConstraint: bool = False

    _connection_type_set: ClassVar[str] = "inputs"

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum == 0:
            raise TypeError(f"Cannot set minimum={self.minimum} for regular input.")
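# A minimal declaration sketch, assuming a `PipelineTaskConnections` subclass;
# the class name and dataset type names are illustrative:
#
#     class CalibrateConnections(
#         PipelineTaskConnections, dimensions=("instrument", "visit", "detector")
#     ):
#         exposure = Input(
#             doc="Exposure to process.",
#             name="postISRCCD",
#             storageClass="Exposure",
#             dimensions=("instrument", "visit", "detector"),
#         )
#         background = Input(
#             doc="Background model; loaded lazily via a DeferredDatasetHandle.",
#             name="calexpBackground",
#             storageClass="Background",
#             dimensions=("instrument", "visit", "detector"),
#             deferLoad=True,
#         )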

@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
    """Class used for declaring `PipelineTask` prerequisite connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type with the specified name.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises
        `FileNotFoundError` (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they are
        compatible with this minimum.
    lookupFunction : `typing.Callable`, optional
        An optional callable that looks up prerequisite inputs using the
        `~lsst.daf.butler.DatasetType`, registry, quantum data ID, and input
        collections passed to it. If no function is specified, the default
        temporal/spatial lookup will be used.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.

    Notes
    -----
    Prerequisite inputs are used for datasets that must exist in the data
    repository before a pipeline including this task is run; they cannot be
    produced by another task in the same pipeline.

    In exchange for this limitation, they have a number of advantages relative
    to regular `Input` connections:

    - The query used to find them during `QuantumGraph` generation can be
      fully customized by providing a ``lookupFunction``.
    - Failed searches for prerequisites during `QuantumGraph` generation will
      usually generate more helpful diagnostics than those for regular `Input`
      connections.
    - The default query for prerequisite inputs relates the quantum dimensions
      directly to the dimensions of its dataset type, without being
      constrained by any of the other dimensions in the pipeline. This allows
      them to be used for temporal calibration lookups (which regular `Input`
      connections cannot do at present) and to work around `QuantumGraph`
      generation limitations involving cases where naive spatial overlap
      relationships between dimensions are not desired (e.g. a task that wants
      all detectors in each visit for which the visit overlaps a tract, not
      just those where that detector+visit combination overlaps the tract).
    - Prerequisite inputs may be optional (regular inputs are never optional).
    """

    lookupFunction: Callable[
        [DatasetType, Registry, DataCoordinate, Sequence[str]], Iterable[DatasetRef]
    ] | None = None

    _connection_type_set: ClassVar[str] = "prerequisiteInputs"
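# A minimal sketch of a custom ``lookupFunction`` matching the signature of
# the field above; the function name, dataset type names, and query strategy
# are illustrative, not the registry default:
#
#     def find_refcats(
#         datasetType: DatasetType,
#         registry: Registry,
#         dataId: DataCoordinate,
#         collections: Sequence[str],
#     ) -> Iterable[DatasetRef]:
#         # Ignore the quantum's dimensions entirely and return every matching
#         # dataset in the input collections, resolved in find-first order.
#         return registry.queryDatasets(datasetType, collections=collections, findFirst=True)
#
#     refCat = PrerequisiteInput(
#         doc="Reference catalog shards.",
#         name="gaia_dr2",
#         storageClass="SimpleCatalog",
#         dimensions=("htm7",),
#         multiple=True,
#         lookupFunction=find_refcats,
#     )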

@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    """Connection for output dataset."""

    _connection_type_set: ClassVar[str] = "outputs"


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    """Connection for initInput dataset."""

    _connection_type_set: ClassVar[str] = "initInputs"


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    """Connection for initOutput dataset."""

    _connection_type_set: ClassVar[str] = "initOutputs"
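# A minimal sketch of init-level connections, which are read or written when
# a task is constructed rather than once per quantum; the class and dataset
# type names are illustrative:
#
#     class DetectConnections(PipelineTaskConnections, dimensions=("visit",)):
#         inputSchema = InitInput(
#             doc="Schema produced by the upstream task.",
#             name="characterize_schema",
#             storageClass="SourceCatalog",
#         )
#         outputSchema = InitOutput(
#             doc="Schema of this task's output catalogs.",
#             name="detect_schema",
#             storageClass="SourceCatalog",
#         )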