Coverage for python/lsst/ctrl/mpexec/preExecInit.py: 18%

159 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-03-20 04:15 -0700

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30__all__ = ["PreExecInit"] 

31 

32# ------------------------------- 

33# Imports of standard modules -- 

34# ------------------------------- 

35import abc 

36import logging 

37from collections.abc import Iterable, Iterator 

38from contextlib import contextmanager 

39from typing import TYPE_CHECKING, Any 

40 

41# ----------------------------- 

42# Imports for other modules -- 

43# ----------------------------- 

44from lsst.daf.butler import DatasetRef, DatasetType 

45from lsst.daf.butler.registry import ConflictingDefinitionError 

46from lsst.pipe.base import PipelineDatasetTypes 

47from lsst.pipe.base import automatic_connection_constants as acc 

48from lsst.utils.packages import Packages 

49 

50if TYPE_CHECKING: 

51 from lsst.daf.butler import Butler, LimitedButler 

52 from lsst.pipe.base import QuantumGraph, TaskDef, TaskFactory 

53 

54_LOG = logging.getLogger(__name__) 

55 

56 

class MissingReferenceError(Exception):
    """Exception raised when a resolved reference is missing from a graph."""

61 

62 

63def _compare_packages(old_packages: Packages, new_packages: Packages) -> None: 

64 """Compare two versions of Packages. 

65 

66 Parameters 

67 ---------- 

68 old_packages : `Packages` 

69 Previously recorded package versions. 

70 new_packages : `Packages` 

71 New set of package versions. 

72 

73 Raises 

74 ------ 

75 TypeError 

76 Raised if parameters are inconsistent. 

77 """ 

78 diff = new_packages.difference(old_packages) 

79 if diff: 

80 versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff) 

81 raise TypeError(f"Package versions mismatch: ({versions_str})") 

82 else: 

83 _LOG.debug("new packages are consistent with old") 

84 

85 

class PreExecInitBase(abc.ABC):
    """Common part of the implementation of PreExecInit classes that does not
    depend on Butler type.

    Parameters
    ----------
    butler : `~lsst.daf.butler.LimitedButler`
        Butler to use.
    taskFactory : `lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`
        Whether extend run parameter is in use.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory, extendRun: bool):
        # Butler used for all get/put operations.
        self.butler = butler
        # Factory used to instantiate tasks so their init-outputs can be made.
        self.taskFactory = taskFactory
        # When True, existing datasets are compared instead of rejected.
        self.extendRun = extendRun

    def initialize(
        self,
        graph: QuantumGraph,
        saveInitOutputs: bool = True,
        registerDatasetTypes: bool = False,
        saveVersions: bool = True,
    ) -> None:
        """Perform all initialization steps.

        Convenience method to execute all initialization steps. Instead of
        calling this method and providing all options it is also possible to
        call methods individually.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.
        saveVersions : `bool`, optional
            If ``False`` then do not save package versions even if
            ``saveInitOutputs`` is set to ``True``.
        """
        # register dataset types or check consistency
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save
        if saveInitOutputs:
            self.saveInitOutputs(graph)
            self.saveConfigs(graph)
            if saveVersions:
                self.savePackageVersions(graph)

    @abc.abstractmethod
    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.

        Raises
        ------
        ValueError
            Raised if existing DatasetType is different from DatasetType
            in a graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and DatasetType
            does not exist in registry.
        """
        raise NotImplementedError()

    def saveInitOutputs(self, graph: QuantumGraph) -> None:
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if the type of existing object in butler is different from
            new data.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        for taskDef in self._task_iter(graph):
            init_input_refs = graph.initInputRefs(taskDef) or []
            # Instantiating the task populates its init-output attributes.
            task = self.taskFactory.makeTask(taskDef, self.butler, init_input_refs)
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                init_output_refs = graph.initOutputRefs(taskDef) or []
                init_output_ref, obj_from_store = self._find_dataset(init_output_refs, attribute.name)
                if init_output_ref is None:
                    raise ValueError(f"Cannot find dataset reference for init output {name} in a graph")
                init_output_var = getattr(task, name)

                if obj_from_store is not None:
                    _LOG.debug(
                        "Retrieving InitOutputs for task=%s key=%s dsTypeName=%s", task, name, attribute.name
                    )
                    # NOTE(review): _find_dataset already fetched this object;
                    # this re-reads it from the butler — presumably for a
                    # fresh copy. Confirm whether the extra get is needed.
                    obj_from_store = self.butler.get(init_output_ref)
                    # Types are supposed to be identical.
                    # TODO: Check that object contents is identical too.
                    if type(obj_from_store) is not type(init_output_var):
                        raise TypeError(
                            f"Stored initOutput object type {type(obj_from_store)} "
                            "is different from task-generated type "
                            f"{type(init_output_var)} for task {taskDef}"
                        )
                else:
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", taskDef.label, name)
                    # This can still raise if there is a concurrent write.
                    self.butler.put(init_output_var, init_output_ref)

    def saveConfigs(self, graph: QuantumGraph) -> None:
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if existing object in butler is different from new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exists.
            Content of a butler collection should not be changed if exception
            is raised.
        """

        def logConfigMismatch(msg: str) -> None:
            """Log messages about configuration mismatch.

            Parameters
            ----------
            msg : `str`
                Log message to use.
            """
            _LOG.fatal("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # start transaction to rollback any changes on exceptions
        with self.transaction():
            for taskDef in self._task_iter(graph):
                # Config dataset ref is stored in task init outputs, but it
                # may also be missing.
                task_output_refs = graph.initOutputRefs(taskDef)
                if task_output_refs is None:
                    continue

                config_ref, old_config = self._find_dataset(task_output_refs, taskDef.configDatasetName)
                if config_ref is None:
                    continue

                if old_config is not None:
                    # A stored config exists: it must compare equal; a full
                    # (non-shortcut) comparison logs every mismatched field.
                    if not taskDef.config.compare(old_config, shortcut=False, output=logConfigMismatch):
                        raise TypeError(
                            f"Config does not match existing task config {taskDef.configDatasetName!r} in "
                            "butler; tasks configurations must be consistent within the same run collection"
                        )
                else:
                    _LOG.debug(
                        "Saving Config for task=%s dataset type=%s", taskDef.label, taskDef.configDatasetName
                    )
                    self.butler.put(taskDef.config, config_ref)

    def savePackageVersions(self, graph: QuantumGraph) -> None:
        """Write versions of software packages to butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if existing object in butler is incompatible with new data.
        """
        packages = Packages.fromSystem()
        _LOG.debug("want to save packages: %s", packages)

        # start transaction to rollback any changes on exceptions
        with self.transaction():
            # Packages dataset ref is stored in graph's global init outputs,
            # but it may be also be missing.

            packages_ref, old_packages = self._find_dataset(
                graph.globalInitOutputRefs(), PipelineDatasetTypes.packagesDatasetName
            )
            if packages_ref is None:
                return

            if old_packages is not None:
                # Note that because we can only detect python modules that have
                # been imported, the stored list of products may be more or
                # less complete than what we have now. What's important is
                # that the products that are in common have the same version.
                _compare_packages(old_packages, packages)
                # Update the old set of packages in case we have more packages
                # that haven't been persisted.
                extra = packages.extra(old_packages)
                if extra:
                    _LOG.debug("extra packages: %s", extra)
                    old_packages.update(packages)
                    # have to remove existing dataset first, butler has no
                    # replace option.
                    self.butler.pruneDatasets([packages_ref], unstore=True, purge=True)
                    self.butler.put(old_packages, packages_ref)
            else:
                self.butler.put(packages, packages_ref)

    def _find_dataset(
        self, refs: Iterable[DatasetRef], dataset_type: str
    ) -> tuple[DatasetRef | None, Any | None]:
        """Find a ref with a given dataset type name in a list of references
        and try to retrieve its data from butler.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `~lsst.daf.butler.DatasetRef` ]
            References to check for matching dataset type.
        dataset_type : `str`
            Name of a dataset type to look for.

        Returns
        -------
        ref : `~lsst.daf.butler.DatasetRef` or `None`
            Dataset reference or `None` if there is no matching dataset type.
        data : `Any`
            An existing object extracted from butler, `None` if ``ref`` is
            `None` or if there is no existing object for that reference.
        """
        ref: DatasetRef | None = None
        # for/else: the else branch runs only when no ref matched.
        for ref in refs:
            if ref.datasetType.name == dataset_type:
                break
        else:
            return None, None

        try:
            data = self.butler.get(ref)
            if data is not None and not self.extendRun:
                # It must not exist unless we are extending run.
                raise ConflictingDefinitionError(f"Dataset {ref} already exists in butler")
        except (LookupError, FileNotFoundError):
            # Absent dataset is a normal outcome, not an error.
            data = None
        return ref, data

    def _task_iter(self, graph: QuantumGraph) -> Iterator[TaskDef]:
        """Iterate over TaskDefs in a graph, return only tasks that have one or
        more associated quanta.
        """
        for taskDef in graph.iterTaskGraph():
            if graph.getNumberOfQuantaForTask(taskDef) > 0:
                yield taskDef

    @contextmanager
    def transaction(self) -> Iterator[None]:
        """Context manager for transaction.

        Default implementation has no transaction support.

        Yields
        ------
        `None`
            No transaction support.
        """
        yield

371 

372 

class PreExecInit(PreExecInitBase):
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible. If
        `False`, then any existing conflicting dataset will cause a butler
        exception to be raised.
    """

    def __init__(self, butler: Butler, taskFactory: TaskFactory, extendRun: bool = False):
        super().__init__(butler, taskFactory, extendRun)
        # Keep a reference typed as a full Butler; the base class only sees
        # the LimitedButler interface.
        self.full_butler = butler
        if self.extendRun and self.full_butler.run is None:
            raise RuntimeError(
                "Cannot perform extendRun logic unless butler is initialized "
                "with a default output RUN collection."
            )

    @contextmanager
    def transaction(self) -> Iterator[None]:
        # docstring inherited
        with self.full_butler.transaction():
            yield

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        pipeline = graph.taskGraph
        pipelineDatasetTypes = PipelineDatasetTypes.fromPipeline(
            pipeline, registry=self.full_butler.registry, include_configs=True, include_packages=True
        )
        # The "registry dataset types" saved with the QG have had their storage
        # classes carefully resolved by PipelineGraph, whereas the dataset
        # types from PipelineDatasetTypes are a mess because it uses
        # NamedValueSet and that ignores storage classes. It will be fully
        # removed here (and deprecated everywhere) on DM-40441.
        # Note that these "registry dataset types" include dataset types that
        # are not actually registered yet; they're the PipelineGraph's
        # determination of what _should_ be registered.
        registry_storage_classes = {
            dataset_type.name: dataset_type.storageClass_name for dataset_type in graph.registryDatasetTypes()
        }
        registry_storage_classes[acc.PACKAGES_INIT_OUTPUT_NAME] = acc.PACKAGES_INIT_OUTPUT_STORAGE_CLASS
        dataset_types: Iterable[DatasetType]
        # Intermediates are both read and written, so they must be compatible
        # in both directions; pure outputs only need put-compatibility.
        for dataset_types, is_input in (
            (pipelineDatasetTypes.initIntermediates, True),
            (pipelineDatasetTypes.initOutputs, False),
            (pipelineDatasetTypes.intermediates, True),
            (pipelineDatasetTypes.outputs, False),
        ):
            dataset_types = [
                (
                    # The registry dataset types do not include components, but
                    # we don't support storage class overrides for those in
                    # other contexts anyway, and custom-built QGs may not have
                    # the registry dataset types field populated at all.
                    dataset_type.overrideStorageClass(registry_storage_classes[dataset_type.name])
                    if dataset_type.name in registry_storage_classes
                    else dataset_type
                )
                for dataset_type in dataset_types
            ]
            self._register_output_dataset_types(registerDatasetTypes, dataset_types, is_input)

    def _register_output_dataset_types(
        self, registerDatasetTypes: bool, datasetTypes: Iterable[DatasetType], is_input: bool
    ) -> None:
        """Register output dataset types, or verify that existing registry
        definitions are compatible with the ones in the graph.

        Parameters
        ----------
        registerDatasetTypes : `bool`
            If `True`, try to register each dataset type; otherwise only
            compare against the registry.
        datasetTypes : `~collections.abc.Iterable` [ \
                `~lsst.daf.butler.DatasetType` ]
            Dataset types to register or check.
        is_input : `bool`
            Whether these dataset types are also read by some task, requiring
            storage-class compatibility in both directions.
        """

        def _check_compatibility(datasetType: DatasetType, expected: DatasetType, is_input: bool) -> bool:
            # These are output dataset types so check for compatibility on put.
            is_compatible = expected.is_compatible_with(datasetType)

            if is_input:
                # This dataset type is also used for input so must be
                # compatible on get as well.
                is_compatible = is_compatible and datasetType.is_compatible_with(expected)

            if is_compatible:
                _LOG.debug(
                    "The dataset type configurations differ (%s from task != %s from registry) "
                    "but the storage classes are compatible. Can continue.",
                    datasetType,
                    expected,
                )
            return is_compatible

        # Collect all missing names so the error can report them at once.
        missing_datasetTypes = set()
        for datasetType in datasetTypes:
            # Only composites are registered, no components, and by this point
            # the composite should already exist.
            if registerDatasetTypes and not datasetType.isComponent():
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # this is a no-op if it already exists and is consistent,
                # and it raises if it is inconsistent.
                try:
                    self.full_butler.registry.registerDatasetType(datasetType)
                except ConflictingDefinitionError:
                    if not _check_compatibility(
                        datasetType, self.full_butler.get_dataset_type(datasetType.name), is_input
                    ):
                        raise
            else:
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                try:
                    expected = self.full_butler.get_dataset_type(datasetType.name)
                except KeyError:
                    # Likely means that --register-dataset-types is forgotten.
                    missing_datasetTypes.add(datasetType.name)
                    continue
                if expected != datasetType:
                    if not _check_compatibility(datasetType, expected, is_input):
                        raise ValueError(
                            f"DatasetType configuration does not match Registry: {datasetType} != {expected}"
                        )

        if missing_datasetTypes:
            plural = "s" if len(missing_datasetTypes) != 1 else ""
            raise KeyError(
                f"Missing dataset type definition{plural}: {', '.join(missing_datasetTypes)}. "
                "Dataset types have to be registered with either `butler register-dataset-type` or "
                "passing `--register-dataset-types` option to `pipetask run`."
            )

503 

504 

class PreExecInitLimited(PreExecInitBase):
    """Initialization of registry for QuantumGraph execution.

    This class works with LimitedButler and expects that all references in
    QuantumGraph are resolved.

    Parameters
    ----------
    butler : `~lsst.daf.butler.LimitedButler`
        Limited data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory):
        # extendRun never applies when working through a limited butler.
        super().__init__(butler, taskFactory, extendRun=False)

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        # With LimitedButler we never create or check dataset types, so this
        # override is deliberately a no-op.
        pass