Coverage for python/lsst/ctrl/mpexec/preExecInit.py: 16%

155 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-14 09:14 +0000

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ["PreExecInit"] 

25 

26# ------------------------------- 

27# Imports of standard modules -- 

28# ------------------------------- 

29import abc 

30import logging 

31from collections.abc import Iterable, Iterator 

32from contextlib import contextmanager 

33from typing import TYPE_CHECKING, Any 

34 

35# ----------------------------- 

36# Imports for other modules -- 

37# ----------------------------- 

38from lsst.daf.butler import DatasetRef, DatasetType 

39from lsst.daf.butler.registry import ConflictingDefinitionError 

40from lsst.pipe.base import PipelineDatasetTypes 

41from lsst.utils.packages import Packages 

42 

43if TYPE_CHECKING: 

44 from lsst.daf.butler import Butler, LimitedButler 

45 from lsst.pipe.base import QuantumGraph, TaskDef, TaskFactory 

46 

47_LOG = logging.getLogger(__name__) 

48 

49 

class MissingReferenceError(Exception):
    """Raised when a resolved dataset reference expected to be present in a
    quantum graph cannot be found there.
    """

54 

55 

56def _compare_packages(old_packages: Packages, new_packages: Packages) -> None: 

57 """Compare two versions of Packages. 

58 

59 Parameters 

60 ---------- 

61 old_packages : `Packages` 

62 Previously recorded package versions. 

63 new_packages : `Packages` 

64 New set of package versions. 

65 

66 Raises 

67 ------ 

68 TypeError 

69 Raised if parameters are inconsistent. 

70 """ 

71 diff = new_packages.difference(old_packages) 

72 if diff: 

73 versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff) 

74 raise TypeError(f"Package versions mismatch: ({versions_str})") 

75 else: 

76 _LOG.debug("new packages are consistent with old") 

77 

78 

class PreExecInitBase(abc.ABC):
    """Common part of the implementation of PreExecInit classes that does not
    depend on Butler type.

    Subclasses specialize dataset type handling for full vs. limited butlers;
    everything here works through the `~lsst.daf.butler.LimitedButler` API.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory, extendRun: bool):
        # Butler (possibly limited) used for all get/put operations.
        self.butler = butler
        # Factory used to construct task instances for init-output generation.
        self.taskFactory = taskFactory
        # If True, existing datasets in the run are compared instead of
        # triggering a conflict.
        self.extendRun = extendRun

    def initialize(
        self,
        graph: QuantumGraph,
        saveInitOutputs: bool = True,
        registerDatasetTypes: bool = False,
        saveVersions: bool = True,
    ) -> None:
        """Perform all initialization steps.

        Convenience method to execute all initialization steps. Instead of
        calling this method and providing all options it is also possible to
        call methods individually.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.
        saveVersions : `bool`, optional
            If ``False`` then do not save package versions even if
            ``saveInitOutputs`` is set to ``True``.
        """
        # register dataset types or check consistency
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save
        if saveInitOutputs:
            self.saveInitOutputs(graph)
            self.saveConfigs(graph)
            if saveVersions:
                self.savePackageVersions(graph)

    @abc.abstractmethod
    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.

        Raises
        ------
        ValueError
            Raised if existing DatasetType is different from DatasetType
            in a graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and DatasetType
            does not exist in registry.
        """
        raise NotImplementedError()

    def saveInitOutputs(self, graph: QuantumGraph) -> None:
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if the type of existing object in butler is different from
            new data.
        ValueError
            Raised if an init-output dataset reference is missing from the
            graph.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        # Only tasks that actually have quanta in this graph are considered.
        for taskDef in self._task_iter(graph):
            init_input_refs = graph.initInputRefs(taskDef) or []
            # Constructing the task produces its init-output objects as
            # attributes named after the connections.
            task = self.taskFactory.makeTask(taskDef, self.butler, init_input_refs)
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                init_output_refs = graph.initOutputRefs(taskDef) or []
                init_output_ref, obj_from_store = self._find_dataset(init_output_refs, attribute.name)
                if init_output_ref is None:
                    raise ValueError(f"Cannot find dataset reference for init output {name} in a graph")
                init_output_var = getattr(task, name)

                if obj_from_store is not None:
                    _LOG.debug(
                        "Retrieving InitOutputs for task=%s key=%s dsTypeName=%s", task, name, attribute.name
                    )
                    # NOTE(review): _find_dataset already retrieved the stored
                    # object; this second butler.get looks redundant —
                    # presumably kept for storage-class conversion on read.
                    # TODO confirm.
                    obj_from_store = self.butler.get(init_output_ref)
                    # Types are supposed to be identical.
                    # TODO: Check that object contents is identical too.
                    if type(obj_from_store) is not type(init_output_var):
                        raise TypeError(
                            f"Stored initOutput object type {type(obj_from_store)} "
                            "is different from task-generated type "
                            f"{type(init_output_var)} for task {taskDef}"
                        )
                else:
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", taskDef.label, name)
                    # This can still raise if there is a concurrent write.
                    self.butler.put(init_output_var, init_output_ref)

    def saveConfigs(self, graph: QuantumGraph) -> None:
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if existing object in butler is different from new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exists.
            Content of a butler collection should not be changed if exception
            is raised.
        """

        def logConfigMismatch(msg: str) -> None:
            """Log messages about configuration mismatch."""
            _LOG.fatal("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # start transaction to rollback any changes on exceptions
        with self.transaction():
            for taskDef in self._task_iter(graph):
                # Config dataset ref is stored in task init outputs, but it
                # may also be missing.
                task_output_refs = graph.initOutputRefs(taskDef)
                if task_output_refs is None:
                    continue

                config_ref, old_config = self._find_dataset(task_output_refs, taskDef.configDatasetName)
                if config_ref is None:
                    continue

                if old_config is not None:
                    # Compare full configs (no shortcut) so every field
                    # mismatch is reported via logConfigMismatch.
                    if not taskDef.config.compare(old_config, shortcut=False, output=logConfigMismatch):
                        raise TypeError(
                            f"Config does not match existing task config {taskDef.configDatasetName!r} in "
                            "butler; tasks configurations must be consistent within the same run collection"
                        )
                else:
                    _LOG.debug(
                        "Saving Config for task=%s dataset type=%s", taskDef.label, taskDef.configDatasetName
                    )
                    self.butler.put(taskDef.config, config_ref)

    def savePackageVersions(self, graph: QuantumGraph) -> None:
        """Write versions of software packages to butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if existing object in butler is incompatible with new data.
        """
        packages = Packages.fromSystem()
        _LOG.debug("want to save packages: %s", packages)

        # start transaction to rollback any changes on exceptions
        with self.transaction():
            # Packages dataset ref is stored in graph's global init outputs,
            # but it may also be missing.

            packages_ref, old_packages = self._find_dataset(
                graph.globalInitOutputRefs(), PipelineDatasetTypes.packagesDatasetName
            )
            if packages_ref is None:
                return

            if old_packages is not None:
                # Note that because we can only detect python modules that have
                # been imported, the stored list of products may be more or
                # less complete than what we have now. What's important is
                # that the products that are in common have the same version.
                _compare_packages(old_packages, packages)
                # Update the old set of packages in case we have more packages
                # that haven't been persisted.
                extra = packages.extra(old_packages)
                if extra:
                    _LOG.debug("extra packages: %s", extra)
                    old_packages.update(packages)
                    # have to remove existing dataset first, butler has no
                    # replace option.
                    self.butler.pruneDatasets([packages_ref], unstore=True, purge=True)
                    self.butler.put(old_packages, packages_ref)
            else:
                self.butler.put(packages, packages_ref)

    def _find_dataset(
        self, refs: Iterable[DatasetRef], dataset_type: str
    ) -> tuple[DatasetRef | None, Any | None]:
        """Find a ref with a given dataset type name in a list of references
        and try to retrieve its data from butler.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `~lsst.daf.butler.DatasetRef` ]
            References to check for matching dataset type.
        dataset_type : `str`
            Name of a dataset type to look for.

        Returns
        -------
        ref : `~lsst.daf.butler.DatasetRef` or `None`
            Dataset reference or `None` if there is no matching dataset type.
        data : `Any`
            An existing object extracted from butler, `None` if ``ref`` is
            `None` or if there is no existing object for that reference.

        Raises
        ------
        ConflictingDefinitionError
            Raised if the dataset exists in butler but ``extendRun`` is
            `False`.
        """
        ref: DatasetRef | None = None
        # for/else: the else branch runs only if no matching ref was found
        # (loop finished without break).
        for ref in refs:
            if ref.datasetType.name == dataset_type:
                break
        else:
            return None, None

        try:
            data = self.butler.get(ref)
            if data is not None and not self.extendRun:
                # It must not exist unless we are extending run.
                raise ConflictingDefinitionError(f"Dataset {ref} already exists in butler")
        except (LookupError, FileNotFoundError):
            # Dataset is registered but has no stored data (or is absent):
            # treat as "does not exist yet".
            data = None
        return ref, data

    def _task_iter(self, graph: QuantumGraph) -> Iterator[TaskDef]:
        """Iterate over TaskDefs in a graph, return only tasks that have one or
        more associated quanta.
        """
        for taskDef in graph.iterTaskGraph():
            if graph.getNumberOfQuantaForTask(taskDef) > 0:
                yield taskDef

    @contextmanager
    def transaction(self) -> Iterator[None]:
        """Context manager for transaction.

        Default implementation has no transaction support.
        """
        yield

344 

345 

class PreExecInit(PreExecInitBase):
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible. If
        `False`, then any existing conflicting dataset will cause a butler
        exception to be raised.

    Raises
    ------
    RuntimeError
        Raised if ``extendRun`` is `True` but the butler has no default output
        RUN collection.
    """

    def __init__(self, butler: Butler, taskFactory: TaskFactory, extendRun: bool = False):
        super().__init__(butler, taskFactory, extendRun)
        # Keep a separately-named reference to make clear this is a full
        # Butler (with registry), not just a LimitedButler.
        self.full_butler = butler
        if self.extendRun and self.full_butler.run is None:
            raise RuntimeError(
                "Cannot perform extendRun logic unless butler is initialized "
                "with a default output RUN collection."
            )

    @contextmanager
    def transaction(self) -> Iterator[None]:
        # docstring inherited
        with self.full_butler.transaction():
            yield

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        pipeline = graph.taskGraph
        pipelineDatasetTypes = PipelineDatasetTypes.fromPipeline(
            pipeline, registry=self.full_butler.registry, include_configs=True, include_packages=True
        )

        # Intermediates are used both as outputs and inputs, so they need
        # compatibility checks in both directions; pure outputs only on put.
        for datasetTypes, is_input in (
            (pipelineDatasetTypes.initIntermediates, True),
            (pipelineDatasetTypes.initOutputs, False),
            (pipelineDatasetTypes.intermediates, True),
            (pipelineDatasetTypes.outputs, False),
        ):
            self._register_output_dataset_types(registerDatasetTypes, datasetTypes, is_input)

    def _register_output_dataset_types(
        self, registerDatasetTypes: bool, datasetTypes: Iterable[DatasetType], is_input: bool
    ) -> None:
        """Register dataset types in registry or check that they match
        existing definitions.

        Parameters
        ----------
        registerDatasetTypes : `bool`
            If `True` then register dataset types, otherwise only check that
            they exist and are consistent.
        datasetTypes : `~collections.abc.Iterable` [ `~lsst.daf.butler.DatasetType` ]
            Dataset types to register or check.
        is_input : `bool`
            If `True` these dataset types are also used as inputs, so storage
            class compatibility is required in both get and put directions.

        Raises
        ------
        ValueError
            Raised if an existing dataset type definition is incompatible with
            the one in the pipeline.
        KeyError
            Raised if ``registerDatasetTypes`` is `False` and one or more
            dataset types are not registered.
        """

        def _check_compatibility(datasetType: DatasetType, expected: DatasetType, is_input: bool) -> bool:
            # These are output dataset types so check for compatibility on put.
            is_compatible = expected.is_compatible_with(datasetType)

            if is_input:
                # This dataset type is also used for input so must be
                # compatible on get as well.
                is_compatible = is_compatible and datasetType.is_compatible_with(expected)

            if is_compatible:
                _LOG.debug(
                    "The dataset type configurations differ (%s from task != %s from registry) "
                    "but the storage classes are compatible. Can continue.",
                    datasetType,
                    expected,
                )
            return is_compatible

        missing_datasetTypes = set()
        for datasetType in datasetTypes:
            # Only composites are registered, no components, and by this point
            # the composite should already exist.
            if registerDatasetTypes and not datasetType.isComponent():
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # this is a no-op if it already exists and is consistent,
                # and it raises if it is inconsistent.
                try:
                    self.full_butler.registry.registerDatasetType(datasetType)
                except ConflictingDefinitionError:
                    # Definitions differ; accept only if storage classes are
                    # convertible, otherwise re-raise the registry conflict.
                    if not _check_compatibility(
                        datasetType, self.full_butler.registry.getDatasetType(datasetType.name), is_input
                    ):
                        raise
            else:
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                try:
                    expected = self.full_butler.registry.getDatasetType(datasetType.name)
                except KeyError:
                    # Likely means that --register-dataset-types is forgotten.
                    # Collect all missing names so they can be reported in a
                    # single error at the end.
                    missing_datasetTypes.add(datasetType.name)
                    continue
                if expected != datasetType:
                    if not _check_compatibility(datasetType, expected, is_input):
                        raise ValueError(
                            f"DatasetType configuration does not match Registry: {datasetType} != {expected}"
                        )

        if missing_datasetTypes:
            plural = "s" if len(missing_datasetTypes) != 1 else ""
            raise KeyError(
                f"Missing dataset type definition{plural}: {', '.join(missing_datasetTypes)}. "
                "Dataset types have to be registered with either `butler register-dataset-type` or "
                "passing `--register-dataset-types` option to `pipetask run`."
            )

452 

453 

class PreExecInitLimited(PreExecInitBase):
    """Initialization of registry for QuantumGraph execution.

    This variant of pre-execution initialization works with a
    `~lsst.daf.butler.LimitedButler` and expects every reference in the
    QuantumGraph to be fully resolved already.

    Parameters
    ----------
    butler : `~lsst.daf.butler.LimitedButler`
        Limited data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory):
        # A limited butler cannot extend an existing run, so extendRun is
        # always False here.
        super().__init__(butler, taskFactory, extendRun=False)

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        # A LimitedButler has no registry, so dataset types are neither
        # created nor checked here.
        return