Coverage for python/lsst/ctrl/mpexec/preExecInit.py: 19%

145 statements  

coverage.py v7.5.1, created at 2024-05-07 02:50 -0700

# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["PreExecInit"]

# -------------------------------
# Imports of standard modules --
# -------------------------------
import abc
import logging
from collections.abc import Iterable, Iterator
from contextlib import contextmanager
from typing import TYPE_CHECKING, Any

# -----------------------------
# Imports for other modules --
# -----------------------------
from lsst.daf.butler import DatasetRef, DatasetType
from lsst.daf.butler.registry import ConflictingDefinitionError, MissingDatasetTypeError
from lsst.pipe.base.automatic_connection_constants import (
    PACKAGES_INIT_OUTPUT_NAME,
    PACKAGES_INIT_OUTPUT_STORAGE_CLASS,
)
from lsst.utils.packages import Packages

if TYPE_CHECKING:
    from lsst.daf.butler import Butler, LimitedButler
    from lsst.pipe.base import QuantumGraph, TaskDef, TaskFactory

_LOG = logging.getLogger(__name__)


class MissingReferenceError(Exception):
    """Exception raised when resolved reference is missing from graph."""

    pass


def _compare_packages(old_packages: Packages, new_packages: Packages) -> None:
    """Compare two versions of Packages.

    Parameters
    ----------
    old_packages : `Packages`
        Previously recorded package versions.
    new_packages : `Packages`
        New set of package versions.

    Raises
    ------
    TypeError
        Raised if any package appears in both sets with differing versions.
    """
    diff = new_packages.difference(old_packages)
    if diff:
        versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
        raise TypeError(f"Package versions mismatch: ({versions_str})")
    else:
        _LOG.debug("new packages are consistent with old")
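

# Illustrative sketch only, not used by the pipeline code: it shows the
# intended behaviour of _compare_packages. It assumes Packages can be built
# from a plain mapping of package name to version string (it is the dict-like
# container from lsst.utils.packages imported above); the version strings are
# invented for the example.
def _example_compare_packages() -> None:
    old = Packages({"numpy": "1.24.0", "astropy": "5.3"})
    new = Packages({"numpy": "1.26.4", "scipy": "1.11"})
    # numpy appears in both sets with different versions, so a mismatch is
    # expected; packages present on only one side are not compared here.
    try:
        _compare_packages(old, new)
    except TypeError as exc:
        _LOG.debug("example mismatch: %s", exc)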


class PreExecInitBase(abc.ABC):
    """Common part of the implementation of PreExecInit classes that does not
    depend on the Butler type.

    Parameters
    ----------
    butler : `~lsst.daf.butler.LimitedButler`
        Butler to use.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`
        Whether the ``extendRun`` option is in use.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory, extendRun: bool):
        self.butler = butler
        self.taskFactory = taskFactory
        self.extendRun = extendRun

    def initialize(
        self,
        graph: QuantumGraph,
        saveInitOutputs: bool = True,
        registerDatasetTypes: bool = False,
        saveVersions: bool = True,
    ) -> None:
        """Perform all initialization steps.

        Convenience method to execute all initialization steps. Instead of
        calling this method and providing all options, the individual methods
        can also be called separately.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must already be registered.
        saveVersions : `bool`, optional
            If ``False`` then do not save package versions even if
            ``saveInitOutputs`` is set to ``True``.
        """
        # Register dataset types or check consistency.
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save.
        if saveInitOutputs:
            self.saveInitOutputs(graph)
            self.saveConfigs(graph)
            if saveVersions:
                self.savePackageVersions(graph)

    @abc.abstractmethod
    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must already be registered.

        Raises
        ------
        ValueError
            Raised if an existing DatasetType differs from the DatasetType
            in the graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and a DatasetType
            does not exist in registry.
        """
        raise NotImplementedError()

    def saveInitOutputs(self, graph: QuantumGraph) -> None:
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if the type of an existing object in butler is different
            from the type of the new data.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        for taskDef in self._task_iter(graph):
            init_input_refs = graph.initInputRefs(taskDef) or []
            task = self.taskFactory.makeTask(
                graph.pipeline_graph.tasks[taskDef.label], self.butler, init_input_refs
            )
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                init_output_refs = graph.initOutputRefs(taskDef) or []
                init_output_ref, obj_from_store = self._find_dataset(init_output_refs, attribute.name)
                if init_output_ref is None:
                    raise ValueError(f"Cannot find dataset reference for init output {name} in a graph")
                init_output_var = getattr(task, name)

                if obj_from_store is not None:
                    _LOG.debug(
                        "Retrieving InitOutputs for task=%s key=%s dsTypeName=%s", task, name, attribute.name
                    )
                    obj_from_store = self.butler.get(init_output_ref)
                    # Types are supposed to be identical.
                    # TODO: Check that object contents are identical too.
                    if type(obj_from_store) is not type(init_output_var):
                        raise TypeError(
                            f"Stored initOutput object type {type(obj_from_store)} "
                            "is different from task-generated type "
                            f"{type(init_output_var)} for task {taskDef}"
                        )
                else:
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", taskDef.label, name)
                    # This can still raise if there is a concurrent write.
                    self.butler.put(init_output_var, init_output_ref)

    def saveConfigs(self, graph: QuantumGraph) -> None:
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if an existing object in butler is different from the new
            data.
        Exception
            Raised if ``extendRun`` is `False` and a dataset already exists.
            The content of a butler collection should not be changed if an
            exception is raised.
        """

        def logConfigMismatch(msg: str) -> None:
            """Log messages about configuration mismatch.

            Parameters
            ----------
            msg : `str`
                Log message to use.
            """
            _LOG.fatal("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # Start a transaction to roll back any changes on exceptions.
        with self.transaction():
            for taskDef in self._task_iter(graph):
                # Config dataset ref is stored in task init outputs, but it
                # may also be missing.
                task_output_refs = graph.initOutputRefs(taskDef)
                if task_output_refs is None:
                    continue

                config_ref, old_config = self._find_dataset(task_output_refs, taskDef.configDatasetName)
                if config_ref is None:
                    continue

                if old_config is not None:
                    if not taskDef.config.compare(old_config, shortcut=False, output=logConfigMismatch):
                        raise TypeError(
                            f"Config does not match existing task config {taskDef.configDatasetName!r} in "
                            "butler; task configurations must be consistent within the same run collection"
                        )
                else:
                    _LOG.debug(
                        "Saving Config for task=%s dataset type=%s", taskDef.label, taskDef.configDatasetName
                    )
                    self.butler.put(taskDef.config, config_ref)

    def savePackageVersions(self, graph: QuantumGraph) -> None:
        """Write versions of software packages to butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if an existing object in butler is incompatible with the
            new data.
        """
        packages = Packages.fromSystem()
        _LOG.debug("want to save packages: %s", packages)

        # Start a transaction to roll back any changes on exceptions.
        with self.transaction():
            # Packages dataset ref is stored in the graph's global init
            # outputs, but it may also be missing.
            packages_ref, old_packages = self._find_dataset(
                graph.globalInitOutputRefs(), PACKAGES_INIT_OUTPUT_NAME
            )
            if packages_ref is None:
                return

            if old_packages is not None:
                # Note that because we can only detect python modules that
                # have been imported, the stored list of products may be more
                # or less complete than what we have now. What's important is
                # that the products that are in common have the same version.
                _compare_packages(old_packages, packages)
                # Update the old set of packages in case we have more packages
                # that haven't been persisted.
                extra = packages.extra(old_packages)
                if extra:
                    _LOG.debug("extra packages: %s", extra)
                    old_packages.update(packages)
                    # Have to remove the existing dataset first; butler has no
                    # replace option.
                    self.butler.pruneDatasets([packages_ref], unstore=True, purge=True)
                    self.butler.put(old_packages, packages_ref)
            else:
                self.butler.put(packages, packages_ref)

    def _find_dataset(
        self, refs: Iterable[DatasetRef], dataset_type: str
    ) -> tuple[DatasetRef | None, Any | None]:
        """Find a ref with a given dataset type name in a list of references
        and try to retrieve its data from butler.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `~lsst.daf.butler.DatasetRef` ]
            References to check for matching dataset type.
        dataset_type : `str`
            Name of a dataset type to look for.

        Returns
        -------
        ref : `~lsst.daf.butler.DatasetRef` or `None`
            Dataset reference or `None` if there is no matching dataset type.
        data : `Any`
            An existing object extracted from butler, `None` if ``ref`` is
            `None` or if there is no existing object for that reference.
        """
        ref: DatasetRef | None = None
        for ref in refs:
            if ref.datasetType.name == dataset_type:
                break
        else:
            return None, None

        try:
            data = self.butler.get(ref)
            if data is not None and not self.extendRun:
                # It must not exist unless we are extending run.
                raise ConflictingDefinitionError(f"Dataset {ref} already exists in butler")
        except (LookupError, FileNotFoundError):
            data = None
        return ref, data

    def _task_iter(self, graph: QuantumGraph) -> Iterator[TaskDef]:
        """Iterate over TaskDefs in a graph, yielding only tasks that have
        one or more associated quanta.
        """
        for taskDef in graph.iterTaskGraph():
            if graph.getNumberOfQuantaForTask(taskDef) > 0:
                yield taskDef

    @contextmanager
    def transaction(self) -> Iterator[None]:
        """Context manager for transaction.

        Default implementation has no transaction support.

        Yields
        ------
        `None`
            No transaction support.
        """
        yield


class PreExecInit(PreExecInitBase):
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible. If
        `False`, then any existing conflicting dataset will cause a butler
        exception to be raised.
    """

    def __init__(self, butler: Butler, taskFactory: TaskFactory, extendRun: bool = False):
        super().__init__(butler, taskFactory, extendRun)
        self.full_butler = butler
        if self.extendRun and self.full_butler.run is None:
            raise RuntimeError(
                "Cannot perform extendRun logic unless butler is initialized "
                "with a default output RUN collection."
            )

    @contextmanager
    def transaction(self) -> Iterator[None]:
        # docstring inherited
        with self.full_butler.transaction():
            yield

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        missing_dataset_types: set[str] = set()
        dataset_types = [node.dataset_type for node in graph.pipeline_graph.dataset_types.values()]
        dataset_types.append(
            DatasetType(
                PACKAGES_INIT_OUTPUT_NAME, self.butler.dimensions.empty, PACKAGES_INIT_OUTPUT_STORAGE_CLASS
            )
        )
        for dataset_type in dataset_types:
            # Resolving the PipelineGraph when building the QuantumGraph
            # should have already guaranteed that this is the registry dataset
            # type and that all references to it use compatible storage
            # classes, so we don't need another compatibility check here; if
            # the dataset type doesn't match the registry that's already a
            # problem.
            if registerDatasetTypes:
                _LOG.debug("Registering DatasetType %s with registry", dataset_type.name)
                try:
                    self.full_butler.registry.registerDatasetType(dataset_type)
                except ConflictingDefinitionError:
                    expected = self.full_butler.registry.getDatasetType(dataset_type.name)
                    raise ConflictingDefinitionError(
                        f"DatasetType definition in registry has changed since the QuantumGraph was built: "
                        f"{dataset_type} (graph) != {expected} (registry)."
                    )
            else:
                _LOG.debug("Checking DatasetType %s against registry", dataset_type.name)
                try:
                    expected = self.full_butler.registry.getDatasetType(dataset_type.name)
                except MissingDatasetTypeError:
                    # Likely means that --register-dataset-types was
                    # forgotten, but we could also get here if there is a
                    # prerequisite input that is optional and none were found
                    # in this repo; that is not an error. We don't bother to
                    # check whether they are optional here: the fact that we
                    # were able to build the QG says that they were, since
                    # there couldn't have been any datasets if the dataset
                    # types weren't registered.
                    if not graph.pipeline_graph.dataset_types[dataset_type.name].is_prerequisite:
                        missing_dataset_types.add(dataset_type.name)
                    continue
                if expected != dataset_type:
                    raise ConflictingDefinitionError(
                        f"DatasetType definition in registry has changed since the QuantumGraph was built: "
                        f"{dataset_type} (graph) != {expected} (registry)."
                    )
        if missing_dataset_types:
            plural = "s" if len(missing_dataset_types) != 1 else ""
            raise MissingDatasetTypeError(
                f"Missing dataset type definition{plural}: {', '.join(missing_dataset_types)}. "
                "Dataset types have to be registered either with `butler register-dataset-type` or by "
                "passing the `--register-dataset-types` option to `pipetask run`."
            )
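

# Illustrative sketch only, not used by the pipeline code: roughly how
# PreExecInit is driven before executing a QuantumGraph. The repository path,
# run collection, and graph file are invented for the example, and
# ``task_factory`` stands for whatever concrete TaskFactory implementation the
# caller provides; exact Butler/QuantumGraph construction may differ between
# releases.
def _example_pre_exec_init(task_factory: TaskFactory) -> None:
    from lsst.daf.butler import Butler
    from lsst.pipe.base import QuantumGraph

    # Writeable butler pointing at the output RUN collection of the graph.
    butler = Butler("/path/to/repo", writeable=True, run="u/example/run")
    graph = QuantumGraph.loadUri("example.qgraph")
    pre_exec_init = PreExecInit(butler, task_factory, extendRun=False)
    # Register output dataset types and write init-outputs, configs, and
    # package versions before any quanta run.
    pre_exec_init.initialize(graph, registerDatasetTypes=True)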


class PreExecInitLimited(PreExecInitBase):
    """Initialization for QuantumGraph execution with a limited butler.

    This class works with LimitedButler and expects that all references in
    the QuantumGraph are resolved.

    Parameters
    ----------
    butler : `~lsst.daf.butler.LimitedButler`
        Limited data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory):
        super().__init__(butler, taskFactory, False)

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        # With LimitedButler we never create or check dataset types.
        pass
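

# Illustrative sketch only, not used by the pipeline code: PreExecInitLimited
# runs the same init-output/config/version bookkeeping against a LimitedButler
# (for example a quantum-backed butler created by the execution harness). The
# butler, task factory, and graph are assumed to be constructed elsewhere.
def _example_pre_exec_init_limited(
    butler: LimitedButler, task_factory: TaskFactory, graph: QuantumGraph
) -> None:
    pre_exec_init = PreExecInitLimited(butler, task_factory)
    # Dataset types are neither registered nor checked in the limited case;
    # only init-outputs, configs, and package versions are written.
    pre_exec_init.initialize(graph, saveInitOutputs=True, saveVersions=True)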