Coverage for python/lsst/pipe/base/connectionTypes.py: 80%

62 statements  

coverage.py v7.2.7, created at 2023-06-16 09:02 +0000

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Module defining connection types to be used within a
`PipelineTaskConnections` class.
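
Examples
--------
A minimal sketch of how these connection types are typically used; the
class and dataset type names here are illustrative, not part of this
module::

    from lsst.pipe.base import PipelineTaskConnections
    from lsst.pipe.base import connectionTypes as cT

    class MyConnections(
        PipelineTaskConnections, dimensions=("instrument", "visit", "detector")
    ):
        exposure = cT.Input(
            doc="Calibrated exposure to measure.",
            name="calexp",
            storageClass="ExposureF",
            dimensions=("instrument", "visit", "detector"),
        )
        catalog = cT.Output(
            doc="Output source catalog.",
            name="src",
            storageClass="SourceCatalog",
            dimensions=("instrument", "visit", "detector"),
        )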

24""" 

25 

26__all__ = ["InitInput", "InitOutput", "Input", "PrerequisiteInput", "Output", "BaseConnection"] 

27 

28import dataclasses 

29from collections.abc import Callable, Iterable, Sequence 

30from typing import ClassVar 

31 

32from lsst.daf.butler import DataCoordinate, DatasetRef, DatasetType, DimensionUniverse, Registry, StorageClass 

33 

34 

@dataclasses.dataclass(frozen=True)
class BaseConnection:
    """Base class used for declaring `PipelineTask` connections.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when persisting or unpersisting the dataset
        type.
    doc : `str`
        A description of the dataset type for users.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and to
        notify the execution system as early as possible of outputs that will
        not be produced because the corresponding input is missing.
    """

    name: str
    storageClass: str
    doc: str = ""
    multiple: bool = False

    _connection_type_set: ClassVar[str]

    def __get__(self, inst, klass):
        """Descriptor access method.

        This method turns a connection into a descriptor. When a connection
        is added to a connection class, it is a class-level variable. This
        method makes accessing the connection on an instance of the
        connection class return a result specialized for that instance: names
        specified in a config instance will be visible instead of the default
        names for the connection, and removed connections will not be
        accessible on the instance.
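
        Examples
        --------
        A sketch of the class-access vs. instance-access distinction, reusing
        the hypothetical ``MyConnections`` class from the module docstring
        example::

            from lsst.pipe.base import PipelineTaskConfig

            class MyConfig(PipelineTaskConfig, pipelineConnections=MyConnections):
                pass

            config = MyConfig()
            config.connections.exposure = "calexpRenamed"
            connections = MyConnections(config=config)

            MyConnections.exposure.name  # "calexp": class access, default name
            connections.exposure.name    # "calexpRenamed": configured name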

73 """ 

74 # If inst is None, this is being accessed by the class and not an 

75 # instance, return this connection itself 

76 if inst is None: 

77 return self 

78 # Attempt to return the configured connection object from the 

79 # connections instance allConnections mapping. 

80 try: 

81 return inst.allConnections[self.varName] 

82 except KeyError: 

83 raise AttributeError( 

84 f"Connection {self.varName!r} of {klass.__name__} has been removed." 

85 ) from None 

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None
    ) -> DatasetType:
        """Construct a true `~lsst.daf.butler.DatasetType` instance with
        normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `~lsst.daf.butler.DatasetType`
            The `~lsst.daf.butler.DatasetType` defined by this connection.
        """
        return DatasetType(
            self.name, universe.empty, self.storageClass, parentStorageClass=parentStorageClass
        )


@dataclasses.dataclass(frozen=True)
class DimensionedConnection(BaseConnection):
    """Class used for declaring `PipelineTask` connections that include
    dimensions.

    Parameters
    ----------
    name : `str`
        The name used to identify the dataset type.
    storageClass : `str`
        The storage class used when persisting or unpersisting the dataset
        type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and to
        notify the execution system as early as possible of outputs that will
        not be produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type with the specified name.
    isCalibration : `bool`, optional
        `True` if this dataset type may be included in CALIBRATION-type
        collections to associate it with a validity range, `False` (default)
        otherwise.
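
    Examples
    --------
    A sketch of declaring dimensions; the trailing comma matters for a
    one-element tuple, because a bare parenthesized `str` would trigger the
    `TypeError` raised in ``__post_init__`` (dataset type names here are
    illustrative)::

        coadd = Output(
            name="deepCoadd",
            storageClass="ExposureF",
            dimensions=("tract", "patch", "band", "skymap"),
        )

        # dimensions=("tract")   -> just the str "tract"; raises TypeError
        # dimensions=("tract",)  -> correct one-element tuple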

137 """ 

138 

139 dimensions: Iterable[str] = () 

140 isCalibration: bool = False 

141 

142 def __post_init__(self): 

143 if isinstance(self.dimensions, str): 143 ↛ 144line 143 didn't jump to line 144, because the condition on line 143 was never true

144 raise TypeError( 

145 "Dimensions must be iterable of dimensions, got str, possibly omitted trailing comma" 

146 ) 

147 if not isinstance(self.dimensions, Iterable): 147 ↛ 148line 147 didn't jump to line 148, because the condition on line 147 was never true

148 raise TypeError("Dimensions must be iterable of dimensions") 

149 

    def makeDatasetType(
        self, universe: DimensionUniverse, parentStorageClass: StorageClass | str | None = None
    ) -> DatasetType:
        """Construct a true `~lsst.daf.butler.DatasetType` instance with
        normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.
        parentStorageClass : `lsst.daf.butler.StorageClass` or `str`, optional
            Parent storage class for component datasets; `None` otherwise.

        Returns
        -------
        datasetType : `~lsst.daf.butler.DatasetType`
            The `~lsst.daf.butler.DatasetType` defined by this connection.
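
        Examples
        --------
        A sketch, assuming an existing `~lsst.daf.butler.Butler` instance
        named ``butler`` (not defined in this module)::

            connection = Input(
                name="calexp",
                storageClass="ExposureF",
                dimensions=("instrument", "visit", "detector"),
            )
            datasetType = connection.makeDatasetType(butler.registry.dimensions)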

168 """ 

169 return DatasetType( 

170 self.name, 

171 universe.extract(self.dimensions), 

172 self.storageClass, 

173 isCalibration=self.isCalibration, 

174 parentStorageClass=parentStorageClass, 

175 ) 

176 

177 

@dataclasses.dataclass(frozen=True)
class BaseInput(DimensionedConnection):
    """Class used for declaring `PipelineTask` input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when persisting or unpersisting the dataset
        type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and to
        notify the execution system as early as possible of outputs that will
        not be produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type with the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero for a regular `Input` connection; the latter
        is not currently supported by our QuantumGraph generation algorithm.
    """

    deferLoad: bool = False
    minimum: int = 1

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.minimum > 1 and not self.multiple:  # coverage: branch never taken
            raise TypeError(f"Cannot set minimum={self.minimum} if multiple=False.")


@dataclasses.dataclass(frozen=True)
class Input(BaseInput):
    """Class used for declaring `PipelineTask` input connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when persisting or unpersisting the dataset
        type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and to
        notify the execution system as early as possible of outputs that will
        not be produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type with the specified name.
    deferLoad : `bool`
        Indicates that this dataset type will be loaded as a
        `lsst.daf.butler.DeferredDatasetHandle`. PipelineTasks can use this
        object to load the object at a later time.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises `NoWorkFound` if
        the minimum is not met for `Input` connections (causing the quantum to
        be pruned, skipped, or never created, depending on the context), and
        `FileNotFoundError` for `PrerequisiteInput` connections (causing
        QuantumGraph generation to fail). `PipelineTask` implementations may
        provide custom `~PipelineTaskConnections.adjustQuantum` implementations
        for more fine-grained or configuration-driven constraints, as long as
        they are compatible with this minimum.
    deferGraphConstraint : `bool`, optional
        If `True`, do not include this dataset type's existence in the initial
        query that starts the QuantumGraph generation process. This can be
        used to make QuantumGraph generation faster by avoiding redundant
        datasets, and in certain cases it can (along with careful attention to
        which tasks are included in the same QuantumGraph) be used to work
        around the QuantumGraph generation algorithm's inflexible handling of
        spatial overlaps. This option has no effect when the connection is
        not an overall input of the pipeline (or subset thereof) for which a
        graph is being created, and it never affects the ordering of quanta.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``, or
        if ``minimum`` is zero for a regular `Input` connection; the latter
        is not currently supported by our QuantumGraph generation algorithm.
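
    Examples
    --------
    A sketch of a deferred-load input spanning multiple data IDs; the dataset
    type name is illustrative::

        warps = Input(
            doc="Warped exposures to be coadded.",
            name="deepCoadd_directWarp",
            storageClass="ExposureF",
            dimensions=("tract", "patch", "skymap", "visit"),
            multiple=True,
            deferLoad=True,
        )

    With ``deferLoad=True``, each element arrives in ``runQuantum`` as a
    `lsst.daf.butler.DeferredDatasetHandle`, so the task can call ``get()``
    only on the handles it actually needs.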

287 """ 

288 

289 deferGraphConstraint: bool = False 

290 

291 _connection_type_set: ClassVar[str] = "inputs" 

292 

293 def __post_init__(self) -> None: 

294 super().__post_init__() 

295 if self.minimum == 0: 295 ↛ 296line 295 didn't jump to line 296, because the condition on line 295 was never true

296 raise TypeError(f"Cannot set minimum={self.minimum} for regular input.") 

297 

298 

@dataclasses.dataclass(frozen=True)
class PrerequisiteInput(BaseInput):
    """Class used for declaring `PipelineTask` prerequisite connections.

    Parameters
    ----------
    name : `str`
        The default name used to identify the dataset type.
    storageClass : `str`
        The storage class used when persisting or unpersisting the dataset
        type.
    multiple : `bool`
        Indicates if this connection should expect to contain multiple objects
        of the given dataset type. Tasks with more than one connection with
        ``multiple=True`` with the same dimensions may want to implement
        `PipelineTaskConnections.adjustQuantum` to ensure those datasets are
        consistent (i.e. zip-iterable) in `PipelineTask.runQuantum` and to
        notify the execution system as early as possible of outputs that will
        not be produced because the corresponding input is missing.
    dimensions : iterable of `str`
        The `lsst.daf.butler.Registry` dimensions used to identify the
        dataset type with the specified name.
    minimum : `int`
        Minimum number of datasets required for this connection, per quantum.
        This is checked in the base implementation of
        `PipelineTaskConnections.adjustQuantum`, which raises
        `FileNotFoundError` (causing QuantumGraph generation to fail).
        `PipelineTask` implementations may provide custom
        `~PipelineTaskConnections.adjustQuantum` implementations for more
        fine-grained or configuration-driven constraints, as long as they are
        compatible with this minimum.
    lookupFunction : `typing.Callable`, optional
        An optional callable that will look up prerequisite inputs given the
        `~lsst.daf.butler.DatasetType`, registry, quantum data ID, and input
        collections passed to it. If no function is specified, the default
        spatial and temporal lookup will be used.

    Raises
    ------
    TypeError
        Raised if ``minimum`` is greater than one but ``multiple=False``.

    Notes
    -----
    Prerequisite inputs are used for datasets that must exist in the data
    repository before a pipeline including this task is run; they cannot be
    produced by another task in the same pipeline.

    In exchange for this limitation, they have a number of advantages relative
    to regular `Input` connections:

    - The query used to find them during `QuantumGraph` generation can be
      fully customized by providing a ``lookupFunction``.
    - Failed searches for prerequisites during `QuantumGraph` generation will
      usually generate more helpful diagnostics than those for regular `Input`
      connections.
    - The default query for prerequisite inputs relates the quantum dimensions
      directly to the dimensions of its dataset type, without being
      constrained by any of the other dimensions in the pipeline. This allows
      them to be used for temporal calibration lookups (which regular `Input`
      connections cannot do at present) and to work around `QuantumGraph`
      generation limitations involving cases where naive spatial overlap
      relationships between dimensions are not desired (e.g. a task that wants
      all detectors in each visit for which the visit overlaps a tract, not
      just those where that detector+visit combination overlaps the tract).
    - Prerequisite inputs may be optional (regular inputs are never optional).
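
    Examples
    --------
    A sketch of a custom ``lookupFunction``; the function body and dataset
    type name are illustrative, not a recommended implementation::

        def lookupRaws(
            datasetType: DatasetType,
            registry: Registry,
            dataId: DataCoordinate,
            collections: Sequence[str],
        ) -> Iterable[DatasetRef]:
            # Delegate to the registry, but with full control over the query.
            return registry.queryDatasets(
                datasetType, collections=collections, dataId=dataId, findFirst=True
            )

        raws = PrerequisiteInput(
            doc="Raw exposures to process.",
            name="raw",
            storageClass="Exposure",
            dimensions=("instrument", "exposure", "detector"),
            multiple=True,
            lookupFunction=lookupRaws,
        )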

364 """ 

365 

366 lookupFunction: Callable[ 

367 [DatasetType, Registry, DataCoordinate, Sequence[str]], Iterable[DatasetRef] 

368 ] | None = None 

369 

370 _connection_type_set: ClassVar[str] = "prerequisiteInputs" 

371 

372 

@dataclasses.dataclass(frozen=True)
class Output(DimensionedConnection):
    """Connection for an output dataset, produced by a task."""

    _connection_type_set: ClassVar[str] = "outputs"


@dataclasses.dataclass(frozen=True)
class InitInput(BaseConnection):
    """Connection for an initInput dataset, consumed when the task is
    constructed, independent of any data ID.
    """

    _connection_type_set: ClassVar[str] = "initInputs"


@dataclasses.dataclass(frozen=True)
class InitOutput(BaseConnection):
    """Connection for an initOutput dataset, written when the task is
    constructed, independent of any data ID (e.g. a catalog schema).

    _connection_type_set: ClassVar[str] = "initOutputs"