Coverage for python/lsst/pipe/base/connectionTypes.py: 78%

72 statements  

coverage.py v7.4.4, created at 2024-04-06 04:05 -0700

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

28"""Module defining connection types to be used within a 

29`PipelineTaskConnections` class. 

30""" 

31 

32__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"] 

33 

34import dataclasses 

35from collections.abc import Callable, Iterable, Sequence 

36from typing import ClassVar 

37 

38from lsst.daf.butler import DataCoordinate, DatasetRef, DatasetType, DimensionUniverse, Registry, StorageClass 

39from lsst.utils.introspection import find_outside_stacklevel 

40 

41 

@dataclasses.dataclass(frozen=True)
class BaseConnection:
    """Base class used for declaring `PipelineTask` connections.

    Attributes
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`
        Documentation for this connection.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and
        notify the execution system as early as possible of outputs that will
        not be produced because the corresponding input is missing.
    deprecated : `str`, optional
        A description of why this connection is deprecated, including the
        version after which it may be removed.

        If not `None`, the string is appended to the docstring for this
        connection and the corresponding config Field.
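
    Examples
    --------
    Connections are normally declared as class attributes of a
    `PipelineTaskConnections` subclass rather than constructed directly, but
    a standalone instance can still be turned into a
    `~lsst.daf.butler.DatasetType`. A minimal sketch, assuming the default
    dimension configuration; the dataset type name and storage class here
    are purely illustrative::

        from lsst.daf.butler import DimensionUniverse

        conn = BaseConnection(name="packages", storageClass="Packages")
        dataset_type = conn.makeDatasetType(DimensionUniverse())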

66 """ 

67 

68 name: str 

69 storageClass: str 

70 doc: str = "" 

71 multiple: bool = False 

72 deprecated: str | None = dataclasses.field(default=None, kw_only=True) 

73 

74 _connection_type_set: ClassVar[str] 

75 _deprecation_context: str = "" 

76 

77 def __post_init__(self): 

78 if self.deprecated and not self._deprecation_context: 78 ↛ 79line 78 didn't jump to line 79, because the condition on line 78 was never true

79 info = {} 

80 _ = find_outside_stacklevel("lsst.pipe.base", "dataclasses", stack_info=info) 

81 object.__setattr__(self, "_deprecation_context", f"{info['filename']}:{info['lineno']}") 

82 

83 def __get__(self, inst, klass): 

84 """Descriptor access method. 

85 

86 This is a method used to turn a connection into a descriptor. 

87 When a connection is added to a connection class, it is a class level 

88 variable. This method makes accessing this connection, on the 

89 instance of the connection class owning this connection, return a 

90 result specialized for that instance. In the case of connections 

91 this specifically means names specified in a config instance will 

92 be visible instead of the default names for the connection, and that 

93 removed connections will not be accessible on the instance. 

94 """ 

95 # If inst is None, this is being accessed by the class and not an 

96 # instance, return this connection itself 

97 if inst is None: 

98 return self 

99 # Attempt to return the configured connection object from the 

100 # connections instance allConnections mapping. 

101 try: 

102 return inst.allConnections[self.varName] 

103 except KeyError: 

104 raise AttributeError( 

105 f"Connection {self.varName!r} of {klass.__name__} has been removed." 

106 ) from None 

107 

108 def makeDatasetType( 

109 self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None 

110 ) -> DatasetType: 

111 """Construct a true `~lsst.daf.butler.DatasetType` instance with 

112 normalized dimensions. 

113 

114 Parameters 

115 ---------- 

116 universe : `lsst.daf.butler.DimensionUniverse` 

117 Set of all known dimensions to be used to normalize the dimension 

118 names specified in config. 

119 parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional 

120 Parent storage class for component datasets; `None` otherwise. 

121 

122 Returns 

123 ------- 

124 datasetType : `~lsst.daf.butler.DatasetType` 

125 The `~lsst.daf.butler.DatasetType` defined by this connection. 

126 """ 

127 return DatasetType( 

128 self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass 

129 ) 

130 

131 

@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring PipelineTask connections that include
    dimensions.

    Attributes
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type given by the specified name.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False` (default)
        otherwise.
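
    Examples
    --------
    A minimal sketch of building a dimensioned dataset type, assuming the
    default dimension configuration; the dataset type name, storage class,
    and dimensions are illustrative::

        from lsst.daf.butler import DimensionUniverse

        conn = DimensionedConnection(
            name="calexp",
            storageClass="ExposureF",
            dimensions=("instrument", "visit", "detector"),
        )
        dataset_type = conn.makeDatasetType(DimensionUniverse())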

158 """ 

159 

160 dimensions: Iterable[str] = () 

161 isCalibration: bool = False 

162 

163 def __post_init__(self): 

164 super().__post_init__() 

165 if isinstance(self.dimensions, str): 165 ↛ 166line 165 didn't jump to line 166, because the condition on line 165 was never true

166 raise TypeError( 

167 "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma" 

168 ) 

169 if not isinstance(self.dimensions, Iterable): 169 ↛ 170line 169 didn't jump to line 170, because the condition on line 169 was never true

170 raise TypeError("Dimensions must be iterable of dimensions") 

171 

172 def makeDatasetType( 

173 self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None 

174 ) -> DatasetType: 

175 """Construct a true `~lsst.daf.butler.DatasetType` instance with 

176 normalized dimensions. 

177 

178 Parameters 

179 ---------- 

180 universe : `lsst.daf.butler.DimensionUniverse` 

181 Set of all known dimensions to be used to normalize the dimension 

182 names specified in config. 

183 parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional 

184 Parent storage class for component datasets; `None` otherwise. 

185 

186 Returns 

187 ------- 

188 datasetType : `~lsst.daf.butler.DatasetType` 

189 The `~lsst.daf.butler.DatasetType` defined by this connection. 

190 """ 

191 return DatasetType( 

192 self.name, 

193 universe.conform(self.dimensions), 

194 self.storageClass, 

195 isCalibration=self.isCalibration, 

196 parentStorageClass=parentStorageClass, 

197 ) 

198 

199 

@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Class used for declaring PipelineTask input connections.

    Attributes
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type given by the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero for a regular `Input` connection; the latter
        is not currently supported by our QuantumGraph generation algorithm.
    """

    deferLoad: bool = False
    minimum: int = 1

    def __post_init__(self) -> None:
        super().__post_init__()
        # coverage: this branch was never entered during the measured test
        # run.
        if self.minimum > 1 and not self.multiple:
            raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")


@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
    """Class used for declaring PipelineTask input connections.

    Attributes
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type given by the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.
    deferGraphConstraint : `bool`, optional
        If `True`, do not include this dataset type's existence in the initial
        query that starts the QuantumGraph generation process. This can be
        used to make QuantumGraph generation faster by avoiding redundant
        datasets, and in certain cases it can (along with careful attention to
        which tasks are included in the same QuantumGraph) be used to work
        around the QuantumGraph generation algorithm's inflexible handling of
        spatial overlaps. This option has no effect when the connection is not
        an overall input of the pipeline (or subset thereof) for which a graph
        is being created, and it never affects the ordering of quanta.
    deferBinding : `bool`, optional
        If `True`, the dataset will not be automatically included in the
        pipeline graph, and ``deferGraphConstraint`` is implied. A custom
        QuantumGraphBuilder is then required to bind it and add a
        corresponding edge to the pipeline graph. This option allows the same
        dataset type to be used as both an input and an output of a quantum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero for a regular `Input` connection; the latter
        is not currently supported by our QuantumGraph generation algorithm.
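
    Examples
    --------
    A minimal, illustrative declaration inside a `PipelineTaskConnections`
    subclass; the dataset type name, storage class, and dimensions below are
    placeholders, not requirements::

        import lsst.pipe.base.connectionTypes as cT
        from lsst.pipe.base import PipelineTaskConnections

        class MyConnections(
            PipelineTaskConnections, dimensions=("instrument", "visit", "detector")
        ):
            exposure = cT.Input(
                doc="Input exposure to process.",
                name="calexp",
                storageClass="ExposureF",
                dimensions=("instrument", "visit", "detector"),
            )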

316 """ 

317 

318 deferGraphConstraint: bool = False 

319 

320 deferBinding: bool = False 

321 

322 _connection_type_set: ClassVar[str] = "inputs" 

323 

324 def __post_init__(self) -> None: 

325 super().__post_init__() 

326 if self.minimum == 0: 326 ↛ 327line 326 didn't jump to line 327, because the condition on line 326 was never true

327 raise TypeError(f"Cannot set minimum={self.minimum} for regular input.") 

328 

329 

@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
    """Class used for declaring PipelineTask prerequisite connections.

    Attributes
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify
        the execution system as early as possible of outputs that will not be
        produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type given by the specified name.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises
        `FileNotFoundError` (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they are
        compatible with this minimum.
    lookupFunction : `typing.Callable`, optional
        An optional callable that will look up prerequisite inputs using the
        DatasetType, registry, quantum data ID, and input collections passed
        to it. If no function is specified, the default temporal/spatial
        lookup will be used.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.

    Notes
    -----
    Prerequisite inputs are used for datasets that must exist in the data
    repository before a pipeline including this task is run; they cannot be
    produced by another task in the same pipeline.

    In exchange for this limitation, they have a number of advantages relative
    to regular `Input` connections:

    - The query used to find them during `QuantumGraph` generation can be
      fully customized by providing a ``lookupFunction``.
    - Failed searches for prerequisites during `QuantumGraph` generation will
      usually generate more helpful diagnostics than those for regular `Input`
      connections.
    - The default query for prerequisite inputs relates the quantum dimensions
      directly to the dimensions of its dataset type, without being constrained
      by any of the other dimensions in the pipeline. This allows them to be
      used for temporal calibration lookups (which regular `Input` connections
      cannot do at present) and to work around `QuantumGraph` generation
      limitations involving cases where naive spatial overlap relationships
      between dimensions are not desired (e.g. a task that wants all detectors
      in each visit for which the visit overlaps a tract, not just those where
      that detector+visit combination overlaps the tract).
    - Prerequisite inputs may be optional (regular inputs are never optional).
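
    Examples
    --------
    A schematic custom lookup function matching the ``lookupFunction``
    signature, paired with an illustrative declaration; the dataset type
    name and dimensions are placeholders, and a real lookup function would
    normally apply more specific spatial or temporal constraints::

        from collections.abc import Iterable, Sequence

        from lsst.daf.butler import DataCoordinate, DatasetRef, DatasetType, Registry

        def lookup_shards(
            datasetType: DatasetType,
            registry: Registry,
            dataId: DataCoordinate,
            collections: Sequence[str],
        ) -> Iterable[DatasetRef]:
            # Plain registry query; returns all matching datasets.
            return registry.queryDatasets(datasetType, collections=collections, dataId=dataId)

        refCat = PrerequisiteInput(
            doc="Reference catalog shards.",
            name="ref_cat",
            storageClass="SimpleCatalog",
            dimensions=("htm7",),
            multiple=True,
            lookupFunction=lookup_shards,
        )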

395 """ 

396 

397 lookupFunction: ( 

398 Callable[[DatasetType, Registry, DataCoordinate, Sequence[str]], Iterable[DatasetRef]] | None 

399 ) = None 

400 

401 _connection_type_set: ClassVar[str] = "prerequisiteInputs" 

402 

403 

@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    """Connection for an output dataset.

    _connection_type_set: ClassVar[str] = "outputs"


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    """Connection for an initInput dataset."""

    _connection_type_set: ClassVar[str] = "initInputs"


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    """Connection for an initOutput dataset."""

    _connection_type_set: ClassVar[str] = "initOutputs"