Coverage for python/lsst/pipe/base/connectionTypes.py: 80%

63 statements  


# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

22"""Module defining connection types to be used within a 

23`PipelineTaskConnections` class. 

24""" 

25 

26__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"] 

27 

28import dataclasses 

29from collections.abc import Callable, Iterable, Sequence 

30from typing import ClassVar 

31 

32from lsst.daf.butler import DataCoordinate, DatasetRef, DatasetType, DimensionUniverse, Registry, StorageClass 

33 

34 

@dataclasses.dataclass(frozen=True)
class BaseConnection:
    """Base class used for declaring `PipelineTask` connections.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    doc : `str`
        A description of the dataset type and how it is used by the task.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type.  Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and to
        notify the execution system as early as possible of outputs that will
        not be produced because the corresponding input is missing.
    deprecated : `str`, optional
        A description of why this connection is deprecated, including the
        version after which it may be removed.

        If not `None`, the string is appended to the docstring for this
        connection and the corresponding config Field.
    """

    name: str
    storageClass: str
    doc: str = ""
    multiple: bool = False
    deprecated: str | None = dataclasses.field(default=None, kw_only=True)

    _connection_type_set: ClassVar[str]

    def __get__(self, inst, klass):
        """Descriptor access method.

        This method turns a connection into a descriptor.  When a connection
        is added to a connection class, it is a class-level variable.  This
        method makes accessing the connection on an instance of the
        connection class return a result specialized for that instance: names
        specified in a config instance are visible instead of the default
        names for the connection, and removed connections are not accessible
        on the instance.
        """
        # If inst is None, this is being accessed by the class and not an
        # instance; return this connection itself.
        if inst is None:
            return self
        # Attempt to return the configured connection object from the
        # connections instance's allConnections mapping.
        try:
            return inst.allConnections[self.varName]
        except KeyError:
            raise AttributeError(
                f"Connection {self.varName!r} of {klass.__name__} has been removed."
            ) from None

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None
    ) -> DatasetType:
        """Construct a true `~lsst.daf.butler.DatasetType` instance with
        normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `~lsst.daf.butler.DatasetType`
            The `~lsst.daf.butler.DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass
        )
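To illustrate the descriptor behavior described in ``__get__``, here is a
minimal sketch; the task, config, and dataset type names below are
hypothetical illustrations, not part of this module::

    import lsst.pipe.base as pipeBase
    from lsst.pipe.base import connectionTypes as cT

    class ExampleConnections(
        pipeBase.PipelineTaskConnections, dimensions=("instrument", "visit", "detector")
    ):
        exposure = cT.Input(
            name="postISRCCD",  # hypothetical dataset type name
            storageClass="Exposure",
            doc="Hypothetical input exposure.",
            dimensions=("instrument", "visit", "detector"),
        )

    class ExampleConfig(pipeBase.PipelineTaskConfig, pipelineConnections=ExampleConnections):
        pass

    config = ExampleConfig()
    config.connections.exposure = "renamed_exposure"  # override the default name
    connections = ExampleConnections(config=config)
    # Class-level access returns the connection with its default name;
    # instance-level access returns the config-specialized connection.
    assert ExampleConnections.exposure.name == "postISRCCD"
    assert connections.exposure.name == "renamed_exposure"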

@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring `PipelineTask` connections that include
    dimensions.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type.  Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and to
        notify the execution system as early as possible of outputs that will
        not be produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type given by ``name``.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False` (default)
        otherwise.
    """

    dimensions: Iterable[str] = ()
    isCalibration: bool = False

    def __post_init__(self):
        if isinstance(self.dimensions, str):
            raise TypeError(
                "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma"
            )
        if not isinstance(self.dimensions, Iterable):
            raise TypeError("Dimensions must be iterable of dimensions")
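The `str` check above guards against a common pitfall: a one-element tuple
written without its trailing comma is just a string, which would otherwise be
iterated character by character.  A brief sketch (the dataset type name is
hypothetical)::

    from lsst.pipe.base import connectionTypes as cT

    # Wrong: ("visit") is the plain string "visit"; __post_init__ raises
    # TypeError rather than silently treating it as dimensions v-i-s-i-t.
    #   cT.Output(name="stats", storageClass="StructuredDataDict",
    #             doc="...", dimensions=("visit"))

    # Right: the trailing comma makes it a one-element tuple.
    stats = cT.Output(
        name="stats",
        storageClass="StructuredDataDict",
        doc="Hypothetical per-visit statistics.",
        dimensions=("visit",),
    )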

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None
    ) -> DatasetType:
        """Construct a true `~lsst.daf.butler.DatasetType` instance with
        normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `~lsst.daf.butler.DatasetType`
            The `~lsst.daf.butler.DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name,
            universe.extract(self.dimensions),
            self.storageClass,
            isCalibration=self.isCalibration,
            parentStorageClass=parentStorageClass,
        )
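As a usage sketch, a connection can be resolved into a concrete dataset type
against a data repository's dimension universe; this reuses the hypothetical
``stats`` connection from the previous sketch, and the repository path is a
placeholder::

    from lsst.daf.butler import Butler

    butler = Butler("REPO_PATH")  # placeholder repository path
    datasetType = stats.makeDatasetType(butler.registry.dimensions)
    # datasetType is a normalized lsst.daf.butler.DatasetType whose
    # dimensions include ``visit`` plus its required dimensions
    # (e.g. ``instrument``).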

@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Class used for declaring `PipelineTask` input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type.  Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and to
        notify the execution system as early as possible of outputs that will
        not be produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type given by ``name``.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`.  PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail).  `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero for a regular `Input` connection (a zero
        minimum for regular inputs is not currently supported by our
        QuantumGraph generation algorithm).
    """

    deferLoad: bool = False
    minimum: int = 1

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum > 1 and not self.multiple:
            raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")
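For instance, the check above means a ``minimum`` greater than one is only
valid together with ``multiple=True``; the connection below is a hypothetical
sketch::

    from lsst.pipe.base import connectionTypes as cT

    catalogs = cT.Input(
        name="srcCatalogs",  # hypothetical dataset type name
        storageClass="SourceCatalog",
        doc="Requires at least two catalogs per quantum.",
        dimensions=("instrument", "visit"),
        multiple=True,
        minimum=2,
    )
    # The same declaration with multiple=False raises:
    #   TypeError: Cannot set minimum=2 if multiple=False.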

@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
    """Class used for declaring `PipelineTask` input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type.  Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and to
        notify the execution system as early as possible of outputs that will
        not be produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type given by ``name``.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`.  PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail).  `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.
    deferGraphConstraint : `bool`, optional
        If `True`, do not include this dataset type's existence in the initial
        query that starts the QuantumGraph generation process.  This can be
        used to make QuantumGraph generation faster by avoiding redundant
        datasets, and in certain cases it can (along with careful attention to
        which tasks are included in the same QuantumGraph) be used to work
        around the QuantumGraph generation algorithm's inflexible handling of
        spatial overlaps.  This option has no effect when the connection is
        not an overall input of the pipeline (or subset thereof) for which a
        graph is being created, and it never affects the ordering of quanta.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero (a zero minimum for regular inputs is not
        currently supported by our QuantumGraph generation algorithm).
    """

    deferGraphConstraint: bool = False

    _connection_type_set: ClassVar[str] = "inputs"

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum == 0:
            raise TypeError(f"Cannot set minimum={self.minimum} for regular input.")
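A sketch of consuming an input declared with ``deferLoad=True``: the default
`PipelineTask.runQuantum` then passes a
`lsst.daf.butler.DeferredDatasetHandle` to ``run`` instead of the loaded
object.  The task below follows the earlier hypothetical sketch, assuming its
``exposure`` connection were declared with ``deferLoad=True``::

    import lsst.pipe.base as pipeBase

    class ExampleTask(pipeBase.PipelineTask):
        ConfigClass = ExampleConfig  # from the sketch above
        _DefaultName = "example"

        def run(self, exposure):
            # ``exposure`` is a DeferredDatasetHandle; load the data only
            # when (and if) it is actually needed.
            data = exposure.get()
            return pipeBase.Struct()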

@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
    """Class used for declaring `PipelineTask` prerequisite input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when (un)persisting the dataset type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type.  Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and to
        notify the execution system as early as possible of outputs that will
        not be produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the dataset
        type given by ``name``.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises
        `FileNotFoundError` if the minimum is not met (causing QuantumGraph
        generation to fail).  `PipelineTask` implementations may provide
        custom `~PipelineTaskConnections.adjustQuantum` implementations for
        more fine-grained or configuration-driven constraints, as long as they
        are compatible with this minimum.
    lookupFunction : `typing.Callable`, optional
        An optional callable that looks up prerequisite inputs from the
        dataset type, registry, quantum data ID, and input collections passed
        to it.  If no function is specified, the default temporal/spatial
        lookup is used.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.

    Notes
    -----
    Prerequisite inputs are used for datasets that must exist in the data
    repository before a pipeline including this task is run; they cannot be
    produced by another task in the same pipeline.

    In exchange for this limitation, they have a number of advantages relative
    to regular `Input` connections:

    - The query used to find them during `QuantumGraph` generation can be
      fully customized by providing a ``lookupFunction``.
    - Failed searches for prerequisites during `QuantumGraph` generation will
      usually generate more helpful diagnostics than those for regular `Input`
      connections.
    - The default query for prerequisite inputs relates the quantum dimensions
      directly to the dimensions of its dataset type, without being
      constrained by any of the other dimensions in the pipeline.  This allows
      them to be used for temporal calibration lookups (which regular `Input`
      connections cannot do at present) and to work around `QuantumGraph`
      generation limitations involving cases where naive spatial overlap
      relationships between dimensions are not desired (e.g. a task that wants
      all detectors in each visit for which the visit overlaps a tract, not
      just those where that detector+visit combination overlaps the tract).
    - Prerequisite inputs may be optional (regular inputs are never optional).
    """

    lookupFunction: Callable[
        [DatasetType, Registry, DataCoordinate, Sequence[str]], Iterable[DatasetRef]
    ] | None = None

    _connection_type_set: ClassVar[str] = "prerequisiteInputs"
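A minimal sketch of a custom ``lookupFunction`` matching the signature
declared above; the dataset type name and the exposure-wide matching strategy
are illustrative assumptions, not part of this module::

    from collections.abc import Iterable, Sequence

    from lsst.daf.butler import DataCoordinate, DatasetRef, DatasetType, Registry
    from lsst.pipe.base import connectionTypes as cT

    def lookupExposureWide(
        datasetType: DatasetType,
        registry: Registry,
        quantumDataId: DataCoordinate,
        collections: Sequence[str],
    ) -> Iterable[DatasetRef]:
        # Match on (instrument, exposure) only, ignoring the quantum's
        # detector, so datasets for every detector of the same exposure
        # are returned.
        return registry.queryDatasets(
            datasetType,
            collections=collections,
            dataId=quantumDataId.subset(registry.dimensions.extract(["instrument", "exposure"])),
            findFirst=True,
        )

    neighbors = cT.PrerequisiteInput(
        name="rawNeighbors",  # hypothetical dataset type name
        storageClass="Exposure",
        doc="All detectors from the quantum's exposure.",
        dimensions=("instrument", "exposure", "detector"),
        multiple=True,
        lookupFunction=lookupExposureWide,
    )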

@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    """Connection for an output dataset."""

    _connection_type_set: ClassVar[str] = "outputs"


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    """Connection for an initInput dataset."""

    _connection_type_set: ClassVar[str] = "initInputs"


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    """Connection for an initOutput dataset."""

    _connection_type_set: ClassVar[str] = "initOutputs"
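Finally, a short sketch of an init-output connection: unlike `Input` and
`Output`, the `InitInput` and `InitOutput` types derive directly from
`BaseConnection` and take no dimensions, because they are tied to task
construction rather than to any quantum (names hypothetical)::

    from lsst.pipe.base import connectionTypes as cT

    outputSchema = cT.InitOutput(
        name="exampleTask_schema",  # hypothetical dataset type name
        storageClass="SourceCatalog",
        doc="Schema of the task's output catalog, written at construction.",
    )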