Coverage for python/lsst/ctrl/mpexec/preExecInit.py: 18%

159 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-03 10:43 +0000

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30__all__ = ["PreExecInit"] 

31 

32# ------------------------------- 

33# Imports of standard modules -- 

34# ------------------------------- 

35import abc 

36import logging 

37from collections.abc import Iterable, Iterator 

38from contextlib import contextmanager 

39from typing import TYPE_CHECKING, Any 

40 

41# ----------------------------- 

42# Imports for other modules -- 

43# ----------------------------- 

44from lsst.daf.butler import DatasetRef, DatasetType 

45from lsst.daf.butler.registry import ConflictingDefinitionError 

46from lsst.pipe.base import PipelineDatasetTypes 

47from lsst.pipe.base import automatic_connection_constants as acc 

48from lsst.utils.packages import Packages 

49 

50if TYPE_CHECKING: 

51 from lsst.daf.butler import Butler, LimitedButler 

52 from lsst.pipe.base import QuantumGraph, TaskDef, TaskFactory 

53 

54_LOG = logging.getLogger(__name__) 

55 

56 

class MissingReferenceError(Exception):
    """Raised when a resolved dataset reference is missing from a graph."""

61 

62 

def _compare_packages(old_packages: Packages, new_packages: Packages) -> None:
    """Compare two versions of Packages.

    Parameters
    ----------
    old_packages : `Packages`
        Previously recorded package versions.
    new_packages : `Packages`
        New set of package versions.

    Raises
    ------
    TypeError
        Raised if any package appears in both sets with different versions.
    """
    diff = new_packages.difference(old_packages)
    if not diff:
        # Nothing changed between the two snapshots.
        _LOG.debug("new packages are consistent with old")
        return
    # Each diff entry maps a package name to (old_version, new_version).
    mismatches = [f"{name}: {diff[name][1]} vs {diff[name][0]}" for name in diff]
    versions_str = "; ".join(mismatches)
    raise TypeError(f"Package versions mismatch: ({versions_str})")

84 

85 

class PreExecInitBase(abc.ABC):
    """Common part of the implementation of PreExecInit classes that does not
    depend on Butler type.

    Parameters
    ----------
    butler : `~lsst.daf.butler.LimitedButler`
        Butler instance used for all dataset reads and writes.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Factory used to construct task instances when saving init outputs.
    extendRun : `bool`
        If `True`, datasets that already exist in the output run are compared
        with new ones instead of being treated as conflicts.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory, extendRun: bool):
        self.butler = butler
        self.taskFactory = taskFactory
        self.extendRun = extendRun

    def initialize(
        self,
        graph: QuantumGraph,
        saveInitOutputs: bool = True,
        registerDatasetTypes: bool = False,
        saveVersions: bool = True,
    ) -> None:
        """Perform all initialization steps.

        Convenience method to execute all initialization steps. Instead of
        calling this method and providing all options it is also possible to
        call methods individually.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.
        saveVersions : `bool`, optional
            If ``False`` then do not save package versions even if
            ``saveInitOutputs`` is set to ``True``.
        """
        # register dataset types or check consistency
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save
        if saveInitOutputs:
            self.saveInitOutputs(graph)
            self.saveConfigs(graph)
            if saveVersions:
                self.savePackageVersions(graph)

    @abc.abstractmethod
    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.

        Raises
        ------
        ValueError
            Raised if existing DatasetType is different from DatasetType
            in a graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and DatasetType
            does not exist in registry.
        """
        raise NotImplementedError()

    def saveInitOutputs(self, graph: QuantumGraph) -> None:
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if the type of existing object in butler is different from
            new data.
        ValueError
            Raised if the graph is missing a dataset reference for an init
            output declared by a task's connections.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        for taskDef in self._task_iter(graph):
            init_input_refs = graph.initInputRefs(taskDef) or []
            task = self.taskFactory.makeTask(taskDef, self.butler, init_input_refs)
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                init_output_refs = graph.initOutputRefs(taskDef) or []
                # _find_dataset also retrieves the stored object (if any), so
                # a second butler.get() for the same ref is not needed below.
                init_output_ref, obj_from_store = self._find_dataset(init_output_refs, attribute.name)
                if init_output_ref is None:
                    raise ValueError(f"Cannot find dataset reference for init output {name} in a graph")
                init_output_var = getattr(task, name)

                if obj_from_store is not None:
                    _LOG.debug(
                        "Retrieving InitOutputs for task=%s key=%s dsTypeName=%s", task, name, attribute.name
                    )
                    # Types are supposed to be identical.
                    # TODO: Check that object contents is identical too.
                    if type(obj_from_store) is not type(init_output_var):
                        raise TypeError(
                            f"Stored initOutput object type {type(obj_from_store)} "
                            "is different from task-generated type "
                            f"{type(init_output_var)} for task {taskDef}"
                        )
                else:
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", taskDef.label, name)
                    # This can still raise if there is a concurrent write.
                    self.butler.put(init_output_var, init_output_ref)

    def saveConfigs(self, graph: QuantumGraph) -> None:
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if existing object in butler is different from new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exists.
            Content of a butler collection should not be changed if exception
            is raised.
        """

        def logConfigMismatch(msg: str) -> None:
            """Log messages about configuration mismatch."""
            _LOG.fatal("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # start transaction to rollback any changes on exceptions
        with self.transaction():
            for taskDef in self._task_iter(graph):
                # Config dataset ref is stored in task init outputs, but it
                # may be also be missing.
                task_output_refs = graph.initOutputRefs(taskDef)
                if task_output_refs is None:
                    continue

                config_ref, old_config = self._find_dataset(task_output_refs, taskDef.configDatasetName)
                if config_ref is None:
                    continue

                if old_config is not None:
                    # Compare the stored config with the new one; any mismatch
                    # is an error because configs must match within a run.
                    if not taskDef.config.compare(old_config, shortcut=False, output=logConfigMismatch):
                        raise TypeError(
                            f"Config does not match existing task config {taskDef.configDatasetName!r} in "
                            "butler; tasks configurations must be consistent within the same run collection"
                        )
                else:
                    _LOG.debug(
                        "Saving Config for task=%s dataset type=%s", taskDef.label, taskDef.configDatasetName
                    )
                    self.butler.put(taskDef.config, config_ref)

    def savePackageVersions(self, graph: QuantumGraph) -> None:
        """Write versions of software packages to butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if existing object in butler is incompatible with new data.
        """
        packages = Packages.fromSystem()
        _LOG.debug("want to save packages: %s", packages)

        # start transaction to rollback any changes on exceptions
        with self.transaction():
            # Packages dataset ref is stored in graph's global init outputs,
            # but it may be also be missing.

            packages_ref, old_packages = self._find_dataset(
                graph.globalInitOutputRefs(), PipelineDatasetTypes.packagesDatasetName
            )
            if packages_ref is None:
                return

            if old_packages is not None:
                # Note that because we can only detect python modules that have
                # been imported, the stored list of products may be more or
                # less complete than what we have now. What's important is
                # that the products that are in common have the same version.
                _compare_packages(old_packages, packages)
                # Update the old set of packages in case we have more packages
                # that haven't been persisted.
                extra = packages.extra(old_packages)
                if extra:
                    _LOG.debug("extra packages: %s", extra)
                    old_packages.update(packages)
                    # have to remove existing dataset first, butler has no
                    # replace option.
                    self.butler.pruneDatasets([packages_ref], unstore=True, purge=True)
                    self.butler.put(old_packages, packages_ref)
            else:
                self.butler.put(packages, packages_ref)

    def _find_dataset(
        self, refs: Iterable[DatasetRef], dataset_type: str
    ) -> tuple[DatasetRef | None, Any | None]:
        """Find a ref with a given dataset type name in a list of references
        and try to retrieve its data from butler.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `~lsst.daf.butler.DatasetRef` ]
            References to check for matching dataset type.
        dataset_type : `str`
            Name of a dataset type to look for.

        Returns
        -------
        ref : `~lsst.daf.butler.DatasetRef` or `None`
            Dataset reference or `None` if there is no matching dataset type.
        data : `Any`
            An existing object extracted from butler, `None` if ``ref`` is
            `None` or if there is no existing object for that reference.

        Raises
        ------
        ConflictingDefinitionError
            Raised if the dataset exists in butler but ``extendRun`` is
            `False`.
        """
        ref: DatasetRef | None = None
        for ref in refs:
            if ref.datasetType.name == dataset_type:
                break
        else:
            # No reference with a matching dataset type name.
            return None, None

        try:
            data = self.butler.get(ref)
            if data is not None and not self.extendRun:
                # It must not exist unless we are extending run.
                raise ConflictingDefinitionError(f"Dataset {ref} already exists in butler")
        except (LookupError, FileNotFoundError):
            data = None
        return ref, data

    def _task_iter(self, graph: QuantumGraph) -> Iterator[TaskDef]:
        """Iterate over TaskDefs in a graph, return only tasks that have one or
        more associated quanta.
        """
        for taskDef in graph.iterTaskGraph():
            if graph.getNumberOfQuantaForTask(taskDef) > 0:
                yield taskDef

    @contextmanager
    def transaction(self) -> Iterator[None]:
        """Context manager for transaction.

        Default implementation has no transaction support.
        """
        yield

351 

352 

class PreExecInit(PreExecInitBase):
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible. If
        `False`, then any existing conflicting dataset will cause a butler
        exception to be raised.

    Raises
    ------
    RuntimeError
        Raised if ``extendRun`` is `True` but the butler has no default output
        RUN collection.
    """

    def __init__(self, butler: Butler, taskFactory: TaskFactory, extendRun: bool = False):
        super().__init__(butler, taskFactory, extendRun)
        # Keep a full-Butler reference; the base class only stores the
        # LimitedButler interface, but registry access is needed here.
        self.full_butler = butler
        if self.extendRun and self.full_butler.run is None:
            raise RuntimeError(
                "Cannot perform extendRun logic unless butler is initialized "
                "with a default output RUN collection."
            )

    @contextmanager
    def transaction(self) -> Iterator[None]:
        # docstring inherited
        with self.full_butler.transaction():
            yield

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        pipeline = graph.taskGraph
        pipelineDatasetTypes = PipelineDatasetTypes.fromPipeline(
            pipeline, registry=self.full_butler.registry, include_configs=True, include_packages=True
        )
        # The "registry dataset types" saved with the QG have had their storage
        # classes carefully resolved by PipelineGraph, whereas the dataset
        # types from PipelineDatasetTypes are a mess because it uses
        # NamedValueSet and that ignores storage classes. It will be fully
        # removed here (and deprecated everywhere) on DM-40441.
        # Note that these "registry dataset types" include dataset types that
        # are not actually registered yet; they're the PipelineGraph's
        # determination of what _should_ be registered.
        registry_storage_classes = {
            dataset_type.name: dataset_type.storageClass_name for dataset_type in graph.registryDatasetTypes()
        }
        registry_storage_classes[acc.PACKAGES_INIT_OUTPUT_NAME] = acc.PACKAGES_INIT_OUTPUT_STORAGE_CLASS
        dataset_types: Iterable[DatasetType]
        for dataset_types, is_input in (
            (pipelineDatasetTypes.initIntermediates, True),
            (pipelineDatasetTypes.initOutputs, False),
            (pipelineDatasetTypes.intermediates, True),
            (pipelineDatasetTypes.outputs, False),
        ):
            dataset_types = [
                (
                    # The registry dataset types do not include components, but
                    # we don't support storage class overrides for those in
                    # other contexts anyway, and custom-built QGs may not have
                    # the registry dataset types field populated at all.
                    dataset_type.overrideStorageClass(registry_storage_classes[dataset_type.name])
                    if dataset_type.name in registry_storage_classes
                    else dataset_type
                )
                for dataset_type in dataset_types
            ]
            self._register_output_dataset_types(registerDatasetTypes, dataset_types, is_input)

    def _register_output_dataset_types(
        self, registerDatasetTypes: bool, datasetTypes: Iterable[DatasetType], is_input: bool
    ) -> None:
        """Register dataset types in registry, or check that existing
        definitions are compatible with those in ``datasetTypes``.

        Parameters
        ----------
        registerDatasetTypes : `bool`
            If `True`, try to register each dataset type; otherwise only
            compare against the existing registry definitions.
        datasetTypes : `~collections.abc.Iterable` [ `DatasetType` ]
            Dataset types produced by tasks in the graph.
        is_input : `bool`
            `True` if these dataset types are also consumed as inputs, in
            which case compatibility must hold in both directions.

        Raises
        ------
        ValueError
            Raised if an existing, incompatible dataset type definition is
            found in registry.
        KeyError
            Raised if ``registerDatasetTypes`` is `False` and one or more
            dataset types are not registered.
        """

        def _check_compatibility(datasetType: DatasetType, expected: DatasetType, is_input: bool) -> bool:
            # These are output dataset types so check for compatibility on put.
            is_compatible = expected.is_compatible_with(datasetType)

            if is_input:
                # This dataset type is also used for input so must be
                # compatible on get as well.
                is_compatible = is_compatible and datasetType.is_compatible_with(expected)

            if is_compatible:
                _LOG.debug(
                    "The dataset type configurations differ (%s from task != %s from registry) "
                    "but the storage classes are compatible. Can continue.",
                    datasetType,
                    expected,
                )
            return is_compatible

        missing_datasetTypes = set()
        for datasetType in datasetTypes:
            # Only composites are registered, no components, and by this point
            # the composite should already exist.
            if registerDatasetTypes and not datasetType.isComponent():
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # this is a no-op if it already exists and is consistent,
                # and it raises if it is inconsistent.
                try:
                    self.full_butler.registry.registerDatasetType(datasetType)
                except ConflictingDefinitionError:
                    if not _check_compatibility(
                        datasetType, self.full_butler.get_dataset_type(datasetType.name), is_input
                    ):
                        raise
            else:
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                try:
                    expected = self.full_butler.get_dataset_type(datasetType.name)
                except KeyError:
                    # Likely means that --register-dataset-types is forgotten.
                    missing_datasetTypes.add(datasetType.name)
                    continue
                if expected != datasetType:
                    if not _check_compatibility(datasetType, expected, is_input):
                        raise ValueError(
                            f"DatasetType configuration does not match Registry: {datasetType} != {expected}"
                        )

        if missing_datasetTypes:
            plural = "s" if len(missing_datasetTypes) != 1 else ""
            raise KeyError(
                f"Missing dataset type definition{plural}: {', '.join(missing_datasetTypes)}. "
                "Dataset types have to be registered with either `butler register-dataset-type` or "
                "passing `--register-dataset-types` option to `pipetask run`."
            )

483 

484 

class PreExecInitLimited(PreExecInitBase):
    """Initialization of registry for QuantumGraph execution.

    This class works with LimitedButler and expects that all references in
    QuantumGraph are resolved.

    Parameters
    ----------
    butler : `~lsst.daf.butler.LimitedButler`
        Limited data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory):
        # A limited butler has no registry, so extendRun is never supported.
        super().__init__(butler, taskFactory, False)

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        """Do nothing; with LimitedButler dataset types can be neither
        created nor checked.
        """