Coverage for python/lsst/pipe/base/connectionTypes.py: 79%

75 statements  

coverage.py v7.4.4, created at 2024-04-17 02:45 -0700

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Module defining connection types to be used within a 

29`PipelineTaskConnections` class. 

30""" 

31 

32__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"] 

33 

34import dataclasses 

35from collections.abc import Callable, Iterable, Sequence 

36from typing import ClassVar 

37 

38from deprecated.sphinx import deprecated as deprecated_sphinx # avoid clash with BaseConnection.deprecated 

39from lsst.daf.butler import DataCoordinate, DatasetRef, DatasetType, DimensionUniverse, Registry, StorageClass 

40from lsst.utils.introspection import find_outside_stacklevel 

41 

42 

43@dataclasses.dataclass(frozen=True) 

44class BaseConnection: 

45 """Base class used for declaring `PipelineTask` connections. 

46 

47 Attributes 

48 ---------- 

49 name : `str` 

50 The name used to identify the dataset type. 

51 storageClass : `str` 

52 The storage class used when (un)/persisting the dataset type. 

53 multiple : `bool` 

54 Indicates if this connection should expect to contain multiple objects 

55 of the given dataset type. Tasks with more than one connection with 

56 ``multiple=True`` with the same dimensions may want to implement 

57 `PipelineTaskConnections.adjustQuantum` to ensure those datasets are 

58 consistent (i.e. zip-iterable) in `PipelineTask.runQuantum()` and 

59 notify the execution system as early as possible of outputs that will 

60 not be produced because the corresponding input is missing. 

61 deprecated : `str`, optional 

62 A description of why this connection is deprecated, including the 

63 version after which it may be removed. 

64 

65 If not `None`, the string is appended to the docstring for this 

66 connection and the corresponding config Field. 

67 """ 

68 

69 name: str 

70 storageClass: str 

71 doc: str = "" 

72 multiple: bool = False 

73 deprecated: str | None = dataclasses.field(default=None, kw_only=True) 

74 

75 _connection_type_set: ClassVar[str] 

76 _deprecation_context: str = "" 

77 

78 def __post_init__(self): 

79 if self.deprecated and not self._deprecation_context:  (79 ↛ 80: line 79 didn't jump to line 80 because the condition on line 79 was never true)

80 info = {} 

81 _ = find_outside_stacklevel("lsst.pipe.base", "dataclasses", stack_info=info) 

82 object.__setattr__(self, "_deprecation_context", f"{info['filename']}:{info['lineno']}") 

83 

84 def __get__(self, inst, klass): 

85 """Descriptor access method. 

86 

87 This is a method used to turn a connection into a descriptor. 

88 When a connection is added to a connection class, it is a class level 

89 variable. This method makes accessing this connection, on the 

90 instance of the connection class owning this connection, return a 

91 result specialized for that instance. In the case of connections 

92 this specifically means names specified in a config instance will 

93 be visible instead of the default names for the connection, and that 

94 removed connections will not be accessible on the instance. 

95 """ 

96 # If inst is None, this is being accessed by the class and not an 

97 # instance, return this connection itself 

98 if inst is None: 

99 return self 

100 # Attempt to return the configured connection object from the 

101 # connections instance allConnections mapping. 

102 try: 

103 return inst.allConnections[self.varName] 

104 except KeyError: 

105 raise AttributeError( 

106 f"Connection {self.varName!r} of {klass.__name__} has been removed." 

107 ) from None 

108 

109 # TODO: remove on DM-40443. 

110 @deprecated_sphinx( 

111 reason="Deprecated in favor of PipelineGraph, and will be removed after v27.", 

112 version="27.0", 

113 category=FutureWarning, 

114 ) 

115 def makeDatasetType( 

116 self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None 

117 ) -> DatasetType: 

118 """Construct a true `~lsst.daf.butler.DatasetType` instance with 

119 normalized dimensions. 

120 

121 Parameters 

122 ---------- 

123 universe : `lsst.daf.butler.DimensionUniverse` 

124 Set of all known dimensions to be used to normalize the dimension 

125 names specified in config. 

126 parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional 

127 Parent storage class for component datasets; `None` otherwise. 

128 

129 Returns 

130 ------- 

131 datasetType : `~lsst.daf.butler.DatasetType` 

132 The `~lsst.daf.butler.DatasetType` defined by this connection. 

133 """ 

134 return DatasetType( 

135 self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass 

136 ) 

137 

138 
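As a minimal sketch of the descriptor behavior described in ``__get__`` above, assuming the lsst.pipe.base stack is importable (the connections class, dataset type name, and storage class below are illustrative, not part of this module): accessing a connection on the connections class returns the declaration itself, while accessing it on an instance returns the config-specialized copy held in ``allConnections``.

from lsst.pipe.base import PipelineTaskConnections, connectionTypes

class ExampleConnections(PipelineTaskConnections, dimensions=()):
    schema = connectionTypes.InitInput(
        doc="Schema describing the input catalogs.",  # illustrative
        name="sourceSchema",                          # illustrative default name
        storageClass="SourceCatalog",
    )

# Class-level access returns the declaration itself:
assert ExampleConnections.schema.name == "sourceSchema"

# Instance-level access reflects config overrides (sketch only):
#   class ExampleConfig(PipelineTaskConfig, pipelineConnections=ExampleConnections): ...
#   config = ExampleConfig()
#   config.connections.schema = "altSchema"
#   connections = ExampleConnections(config=config)
#   connections.schema.name  ->  "altSchema"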

139@dataclasses.dataclass(frozen=True) 

140class DimensionedConnection(BaseConnection): 

141 """Class used for declaring PipelineTask connections that include

142 dimensions. 

143 

144 Attributes 

145 ---------- 

146 name : `str` 

147 The name used to identify the dataset type. 

148 storageClass : `str` 

149 The storage class used when (un)/persisting the dataset type. 

150 multiple : `bool` 

151 Indicates if this connection should expect to contain multiple objects 

152 of the given dataset type. Tasks with more than one connection with 

153 ``multiple=True`` with the same dimensions may want to implement 

154 `PipelineTaskConnections.adjustQuantum` to ensure those datasets are 

155 consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify 

156 the execution system as early as possible of outputs that will not be 

157 produced because the corresponding input is missing. 

158 dimensions : iterable of `str` 

159 The `lsst.daf.butler.Registry` dimensions used to identify the

160 dataset type named by this connection.

161 isCalibration : `bool`, optional 

162 `True` if this dataset type may be included in CALIBRATION-type 

163 collections to associate it with a validity range, `False` (default) 

164 otherwise. 

165 """ 

166 

167 dimensions: Iterable[str] = () 

168 isCalibration: bool = False 

169 

170 def __post_init__(self): 

171 super().__post_init__() 

172 if isinstance(self.dimensions, str):  (172 ↛ 173: line 172 didn't jump to line 173 because the condition on line 172 was never true)

173 raise TypeError( 

174 "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma" 

175 ) 

176 if not isinstance(self.dimensions, Iterable):  (176 ↛ 177: line 176 didn't jump to line 177 because the condition on line 176 was never true)

177 raise TypeError("Dimensions must be iterable of dimensions") 

178 

179 # TODO: remove on DM-40443. 

180 @deprecated_sphinx( 

181 reason="Deprecated in favor of PipelineGraph, and will be removed after v27.", 

182 version="27.0", 

183 category=FutureWarning, 

184 ) 

185 def makeDatasetType( 

186 self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None 

187 ) -> DatasetType: 

188 """Construct a true `~lsst.daf.butler.DatasetType` instance with 

189 normalized dimensions. 

190 

191 Parameters 

192 ---------- 

193 universe : `lsst.daf.butler.DimensionUniverse` 

194 Set of all known dimensions to be used to normalize the dimension 

195 names specified in config. 

196 parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional 

197 Parent storage class for component datasets; `None` otherwise. 

198 

199 Returns 

200 ------- 

201 datasetType : `~lsst.daf.butler.DatasetType` 

202 The `~lsst.daf.butler.DatasetType` defined by this connection. 

203 """ 

204 return DatasetType( 

205 self.name, 

206 universe.conform(self.dimensions), 

207 self.storageClass, 

208 isCalibration=self.isCalibration, 

209 parentStorageClass=parentStorageClass, 

210 ) 

211 

212 
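The string check in ``__post_init__`` above guards against a common pitfall: a single-element tuple written without its trailing comma is a `str`, and iterating over it would yield individual characters rather than dimension names. A minimal sketch using the `Output` subclass defined later in this module (the dataset type and storage class names are illustrative):

from lsst.pipe.base import connectionTypes

# Raises TypeError("Dimensions must be iterable of dimensions, got str, ..."):
# connectionTypes.Output(
#     name="exampleSummary",
#     storageClass="StructuredDataDict",
#     dimensions=("visit"),       # a str, not a tuple
# )

summary = connectionTypes.Output(
    name="exampleSummary",
    storageClass="StructuredDataDict",
    dimensions=("visit",),        # note the trailing comma
)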

213@dataclasses.dataclass(frozen=True) 

214class BaseInput(DimensionedConnection): 

215 """Base class used for declaring PipelineTask input connections.

216 

217 Attributes 

218 ---------- 

219 name : `str` 

220 The default name used to identify the dataset type. 

221 storageClass : `str` 

222 The storage class used when (un)/persisting the dataset type. 

223 multiple : `bool` 

224 Indicates if this connection should expect to contain multiple objects 

225 of the given dataset type. Tasks with more than one connection with 

226 ``multiple=True`` with the same dimensions may want to implement 

227 `PipelineTaskConnections.adjustQuantum` to ensure those datasets are 

228 consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify 

229 the execution system as early as possible of outputs that will not be 

230 produced because the corresponding input is missing. 

231 dimensions : iterable of `str` 

232 The `lsst.daf.butler.Registry` dimensions used to identify the

233 dataset type named by this connection.

234 deferLoad : `bool` 

235 Indicates that this dataset type will be loaded as a 

236 `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this 

237 object to load the object at a later time. 

238 minimum : `int`

239 Minimum number of datasets required for this connection, per quantum. 

240 This is checked in the base implementation of 

241 `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if 

242 the minimum is not met for `Input` connections (causing the quantum to 

243 be pruned, skipped, or never created, depending on the context), and 

244 `FileNotFoundError` for `PrerequisiteInput` connections (causing 

245 QuantumGraph generation to fail). `PipelineTask` implementations may 

246 provide custom `~PipelineTaskConnections.adjustQuantum` implementations 

247 for more fine-grained or configuration-driven constraints, as long as 

248 they are compatible with this minimum.

249 

250 Raises 

251 ------ 

252 TypeError 

253 Raised if ``minimum`` is greater than one but ``multiple=False``. 

254 NotImplementedError 

255 Raised if ``minimum`` is zero for a regular `Input` connection; this 

256 is not currently supported by our QuantumGraph generation algorithm. 

257 """ 

258 

259 deferLoad: bool = False 

260 minimum: int = 1 

261 

262 def __post_init__(self) -> None: 

263 super().__post_init__() 

264 if self.minimum > 1 and not self.multiple:  (264 ↛ 265: line 264 didn't jump to line 265 because the condition on line 264 was never true)

265 raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.") 

266 

267 
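The check above ties ``minimum`` to ``multiple``: requiring more than one dataset per quantum only makes sense for a connection that can hold multiple datasets. A minimal sketch (dataset type names, storage classes, and dimensions are illustrative):

from lsst.pipe.base import connectionTypes

# Valid: multiple datasets expected, at least three required per quantum.
coadds = connectionTypes.Input(
    doc="Per-band coadds to combine.",
    name="deepCoadd",
    storageClass="ExposureF",
    dimensions=("tract", "patch", "band"),
    multiple=True,
    minimum=3,
)

# Raises TypeError("Cannot set minimum=2 if multiple=False."):
# connectionTypes.Input(
#     doc="...", name="calexp", storageClass="ExposureF",
#     dimensions=("visit", "detector"), minimum=2,
# )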

268@dataclasses.dataclass(frozen=True) 

269class Input(BaseInput): 

270 """Class used for declaring PipelineTask input connections. 

271 

272 Attributes 

273 ---------- 

274 name : `str` 

275 The default name used to identify the dataset type. 

276 storageClass : `str` 

277 The storage class used when (un)/persisting the dataset type. 

278 multiple : `bool` 

279 Indicates if this connection should expect to contain multiple objects 

280 of the given dataset type. Tasks with more than one connection with 

281 ``multiple=True`` with the same dimensions may want to implement 

282 `PipelineTaskConnections.adjustQuantum` to ensure those datasets are 

283 consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify 

284 the execution system as early as possible of outputs that will not be 

285 produced because the corresponding input is missing. 

286 dimensions : iterable of `str` 

287 The `lsst.daf.butler.Registry` dimensions used to identify the

288 dataset type named by this connection.

289 deferLoad : `bool` 

290 Indicates that this dataset type will be loaded as a 

291 `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this 

292 object to load the object at a later time. 

293 minimum : `int`

294 Minimum number of datasets required for this connection, per quantum. 

295 This is checked in the base implementation of 

296 `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if 

297 the minimum is not met for `Input` connections (causing the quantum to 

298 be pruned, skipped, or never created, depending on the context), and 

299 `FileNotFoundError` for `PrerequisiteInput` connections (causing 

300 QuantumGraph generation to fail). `PipelineTask` implementations may 

301 provide custom `~PipelineTaskConnections.adjustQuantum` implementations 

302 for more fine-grained or configuration-driven constraints, as long as 

303 they are compatible with this minimum.

304 deferGraphConstraint : `bool`, optional 

305 If `True`, do not include this dataset type's existence in the initial 

306 query that starts the QuantumGraph generation process. This can be 

307 used to make QuantumGraph generation faster by avoiding redundant 

308 datasets, and in certain cases it can (along with careful attention to 

309 which tasks are included in the same QuantumGraph) be used to work 

310 around the QuantumGraph generation algorithm's inflexible handling of 

311 spatial overlaps. This option has no effect when the connection is not 

312 an overall input of the pipeline (or subset thereof) for which a graph 

313 is being created, and it never affects the ordering of quanta. 

314 deferBinding : `bool`, optional 

315 If `True`, the dataset will not be automatically included in

316 the pipeline graph; ``deferGraphConstraint`` is implied.

317 A custom QuantumGraphBuilder is then required to bind it and add a

318 corresponding edge to the pipeline graph.

319 This option allows the same dataset type to be used as both an

320 input and an output of a quantum.

321 

322 Raises 

323 ------ 

324 TypeError 

325 Raised if ``minimum`` is greater than one but ``multiple=False``. 

326 NotImplementedError 

327 Raised if ``minimum`` is zero for a regular `Input` connection; this 

328 is not currently supported by our QuantumGraph generation algorithm. 

329 """ 

330 

331 deferGraphConstraint: bool = False 

332 

333 deferBinding: bool = False 

334 

335 _connection_type_set: ClassVar[str] = "inputs" 

336 

337 def __post_init__(self) -> None: 

338 super().__post_init__() 

339 if self.minimum == 0:  (339 ↛ 340: line 339 didn't jump to line 340 because the condition on line 339 was never true)

340 raise TypeError(f"Cannot set minimum={self.minimum} for regular input.") 

341 

342 
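A minimal sketch of ``deferLoad`` (the dataset type name, storage class, and task-side code are illustrative): the connection delivers a `lsst.daf.butler.DeferredDatasetHandle` instead of the object itself, so the task can decide later whether and when to read it. The check above also shows that ``minimum=0`` is rejected for a regular `Input`; only `PrerequisiteInput` connections may be truly optional.

from lsst.pipe.base import connectionTypes

background = connectionTypes.Input(
    doc="Background model, loaded only if needed.",
    name="calexpBackground",
    storageClass="Background",
    dimensions=("instrument", "visit", "detector"),
    deferLoad=True,
)

# Inside PipelineTask.runQuantum the connection then yields a handle (sketch):
#   inputs = butlerQC.get(inputRefs)
#   bkg_handle = inputs["background"]   # DeferredDatasetHandle
#   bkg = bkg_handle.get()              # actual read happens here

# Raises TypeError("Cannot set minimum=0 for regular input."):
# connectionTypes.Input(name="calexp", storageClass="ExposureF",
#                       dimensions=("visit", "detector"),
#                       multiple=True, minimum=0)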

343@dataclasses.dataclass(frozen=True) 

344class PrerequisiteInput(BaseInput): 

345 """Class used for declaring PipelineTask prerequisite connections. 

346 

347 Attributes 

348 ---------- 

349 name : `str` 

350 The default name used to identify the dataset type. 

351 storageClass : `str` 

352 The storage class used when (un)/persisting the dataset type. 

353 multiple : `bool` 

354 Indicates if this connection should expect to contain multiple objects 

355 of the given dataset type. Tasks with more than one connection with 

356 ``multiple=True`` with the same dimensions may want to implement 

357 `PipelineTaskConnections.adjustQuantum` to ensure those datasets are 

358 consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and notify 

359 the execution system as early as possible of outputs that will not be 

360 produced because the corresponding input is missing. 

361 dimensions : iterable of `str` 

362 The `lsst.daf.butler.Registry` dimensions used to identify the

363 dataset type named by this connection.

364 minimum : `int`

365 Minimum number of datasets required for this connection, per quantum. 

366 This is checked in the base implementation of 

367 `PipelineTaskConnections.adjustQuantum`, which raises 

368 `FileNotFoundError` (causing QuantumGraph generation to fail). 

369 `PipelineTask` implementations may 

370 provide custom `~PipelineTaskConnections.adjustQuantum` implementations 

371 for more fine-grained or configuration-driven constraints, as long as 

372 they are compatible with this minimum.

373 lookupFunction : `typing.Callable`, optional 

374 An optional callable function that will look up PrerequisiteInputs 

375 using the DatasetType, registry, quantum dataId, and input collections 

376 passed to it. If no function is specified, the default temporal and spatial

377 lookup will be used. 

378 

379 Raises 

380 ------ 

381 TypeError 

382 Raised if ``minimum`` is greater than one but ``multiple=False``. 

383 

384 Notes 

385 ----- 

386 Prerequisite inputs are used for datasets that must exist in the data 

387 repository before a pipeline that includes this task is run; they cannot

388 be produced by another task in the same pipeline.

389 

390 In exchange for this limitation, they have a number of advantages relative 

391 to regular `Input` connections: 

392 

393 - The query used to find them during `QuantumGraph` generation can be

394 fully customized by providing a ``lookupFunction``. 

395 - Failed searches for prerequisites during `QuantumGraph` generation will 

396 usually generate more helpful diagnostics than those for regular `Input` 

397 connections. 

398 - The default query for prerequisite inputs relates the quantum dimensions 

399 directly to the dimensions of its dataset type, without being constrained 

400 by any of the other dimensions in the pipeline. This allows them to be 

401 used for temporal calibration lookups (which regular `Input` connections 

402 cannot do at present) and to work around `QuantumGraph` generation 

403 limitations involving cases where naive spatial overlap relationships 

404 between dimensions are not desired (e.g. a task that wants all detectors 

405 in each visit for which the visit overlaps a tract, not just those where 

406 that detector+visit combination overlaps the tract). 

407 - Prerequisite inputs may be optional (regular inputs are never optional). 

408 """ 

409 

410 lookupFunction: ( 

411 Callable[[DatasetType, Registry, DataCoordinate, Sequence[str]], Iterable[DatasetRef]] | None 

412 ) = None 

413 

414 _connection_type_set: ClassVar[str] = "prerequisiteInputs" 

415 

416 
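A minimal sketch of a ``lookupFunction`` matching the signature above (the dataset type name, dimensions, and query strategy are illustrative; a real implementation would encode whatever search the default temporal and spatial lookup cannot express):

from collections.abc import Iterable, Sequence

from lsst.daf.butler import DataCoordinate, DatasetRef, DatasetType, Registry
from lsst.pipe.base import connectionTypes


def lookup_example(
    datasetType: DatasetType,
    registry: Registry,
    dataId: DataCoordinate,
    collections: Sequence[str],
) -> Iterable[DatasetRef]:
    # Delegate to the registry, constrained only by the quantum data ID and
    # the input collections; findFirst resolves duplicates in collection
    # search order.
    return registry.queryDatasets(
        datasetType, collections=collections, dataId=dataId, findFirst=True
    )


refcat = connectionTypes.PrerequisiteInput(
    doc="Reference catalog shards overlapping the quantum region.",
    name="gaia_dr3",                 # illustrative
    storageClass="SimpleCatalog",
    dimensions=("htm7",),
    multiple=True,
    lookupFunction=lookup_example,
)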

417@dataclasses.dataclass(frozen=True) 

418class Output(DimensionedConnection): 

419 """Connection for output dataset.""" 

420 

421 _connection_type_set: ClassVar[str] = "outputs" 

422 

423 

424@dataclasses.dataclass(frozen=True) 

425class InitInput(BaseConnection): 

426 """Connection for initInput dataset.""" 

427 

428 _connection_type_set: ClassVar[str] = "initInputs" 

429 

430 

431@dataclasses.dataclass(frozen=True) 

432class InitOutput(BaseConnection): 

433 """Connection for initOutput dataset.""" 

434 

435 _connection_type_set: ClassVar[str] = "initOutputs"
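Putting the connection types together, a minimal sketch of a complete `PipelineTaskConnections` subclass (class, dataset type, and storage class names are illustrative; a real task pairs this with a `PipelineTaskConfig` through its ``pipelineConnections`` class keyword):

from lsst.pipe.base import PipelineTaskConnections, connectionTypes


class ExampleMeasureConnections(
    PipelineTaskConnections, dimensions=("instrument", "visit", "detector")
):
    inputExposure = connectionTypes.Input(
        doc="Exposure to measure.",
        name="calexp",
        storageClass="ExposureF",
        dimensions=("instrument", "visit", "detector"),
    )
    outputSchema = connectionTypes.InitOutput(
        doc="Schema of the output catalog, written once at task init.",
        name="exampleCatalog_schema",
        storageClass="SourceCatalog",
    )
    outputCatalog = connectionTypes.Output(
        doc="Catalog of measurements.",
        name="exampleCatalog",
        storageClass="SourceCatalog",
        dimensions=("instrument", "visit", "detector"),
    )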