Coverage for python/lsst/ctrl/mpexec/preExecInit.py: 15%

171 statements  

« prev     ^ index     » next       coverage.py v7.2.6, created at 2023-05-26 02:14 -0700

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ["PreExecInit"] 

25 

26# ------------------------------- 

27# Imports of standard modules -- 

28# ------------------------------- 

29import abc 

30import logging 

31from collections.abc import Iterable, Iterator 

32from contextlib import contextmanager 

33from typing import TYPE_CHECKING, Any 

34 

35# ----------------------------- 

36# Imports for other modules -- 

37# ----------------------------- 

38from lsst.daf.butler import DatasetRef, DatasetType 

39from lsst.daf.butler.registry import ConflictingDefinitionError 

40from lsst.pipe.base import PipelineDatasetTypes 

41from lsst.utils.packages import Packages 

42 

43from .mock_task import MockButlerQuantumContext 

44 

45if TYPE_CHECKING: 45 ↛ 46line 45 didn't jump to line 46, because the condition on line 45 was never true

46 from lsst.daf.butler import Butler, LimitedButler 

47 from lsst.pipe.base import QuantumGraph, TaskDef, TaskFactory 

48 

49_LOG = logging.getLogger(__name__) 

50 

51 

class MissingReferenceError(Exception):
    """Exception raised when a resolved reference is missing from a graph."""

56 

57 

58def _compare_packages(old_packages: Packages, new_packages: Packages) -> None: 

59 """Compare two versions of Packages. 

60 

61 Parameters 

62 ---------- 

63 old_packages : `Packages` 

64 Previously recorded package versions. 

65 new_packages : `Packages` 

66 New set of package versions. 

67 

68 Raises 

69 ------ 

70 TypeError 

71 Raised if parameters are inconsistent. 

72 """ 

73 diff = new_packages.difference(old_packages) 

74 if diff: 

75 versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff) 

76 raise TypeError(f"Package versions mismatch: ({versions_str})") 

77 else: 

78 _LOG.debug("new packages are consistent with old") 

79 

80 

class PreExecInitBase(abc.ABC):
    """Common part of the implementation of PreExecInit classes that does not
    depend on Butler type.

    Parameters
    ----------
    butler : `~lsst.daf.butler.LimitedButler`
        Butler used to read and write init-level datasets.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Factory used to construct task instances when generating init
        outputs.
    extendRun : `bool`
        If `True`, datasets already present in the output run are compared
        against new data instead of being treated as conflicts.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory, extendRun: bool):
        self.butler = butler
        self.taskFactory = taskFactory
        self.extendRun = extendRun

    def initialize(
        self,
        graph: QuantumGraph,
        saveInitOutputs: bool = True,
        registerDatasetTypes: bool = False,
        saveVersions: bool = True,
    ) -> None:
        """Perform all initialization steps.

        Convenience method to execute all initialization steps. Instead of
        calling this method and providing all options it is also possible to
        call methods individually.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.
        saveVersions : `bool`, optional
            If ``False`` then do not save package versions even if
            ``saveInitOutputs`` is set to ``True``.
        """
        # register dataset types or check consistency
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save
        if saveInitOutputs:
            self.saveInitOutputs(graph)
            self.saveConfigs(graph)
            if saveVersions:
                self.savePackageVersions(graph)

    @abc.abstractmethod
    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.

        Raises
        ------
        ValueError
            Raised if existing DatasetType is different from DatasetType
            in a graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and DatasetType
            does not exist in registry.
        """
        raise NotImplementedError()

    def saveInitOutputs(self, graph: QuantumGraph) -> None:
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if the type of existing object in butler is different from
            new data.
        ValueError
            Raised if the graph has no dataset reference for a declared init
            output.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        # Only tasks with at least one quantum are instantiated.
        for taskDef in self._task_iter(graph):
            init_input_refs = graph.initInputRefs(taskDef) or []
            task = self.taskFactory.makeTask(taskDef, self.butler, init_input_refs)
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                # NOTE(review): loop-invariant — could be hoisted above the
                # inner loop since it does not depend on ``name``.
                init_output_refs = graph.initOutputRefs(taskDef) or []
                init_output_ref, obj_from_store = self._find_dataset(init_output_refs, attribute.name)
                if init_output_ref is None:
                    raise ValueError(f"Cannot find dataset reference for init output {name} in a graph")
                init_output_var = getattr(task, name)

                if obj_from_store is not None:
                    _LOG.debug(
                        "Retrieving InitOutputs for task=%s key=%s dsTypeName=%s", task, name, attribute.name
                    )
                    # NOTE(review): re-fetches the object even though
                    # _find_dataset already returned it; looks redundant —
                    # confirm before simplifying.
                    obj_from_store = self.butler.get(init_output_ref)
                    # Types are supposed to be identical.
                    # TODO: Check that object contents is identical too.
                    if type(obj_from_store) is not type(init_output_var):
                        raise TypeError(
                            f"Stored initOutput object type {type(obj_from_store)} "
                            "is different from task-generated type "
                            f"{type(init_output_var)} for task {taskDef}"
                        )
                else:
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", taskDef.label, name)
                    # This can still raise if there is a concurrent write.
                    self.butler.put(init_output_var, init_output_ref)

    def saveConfigs(self, graph: QuantumGraph) -> None:
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if existing object in butler is different from new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exist.
            Content of a butler collection should not be changed if exception
            is raised.
        """

        def logConfigMismatch(msg: str) -> None:
            """Log messages about configuration mismatch."""
            _LOG.fatal("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # start transaction to rollback any changes on exceptions
        with self.transaction():
            for taskDef in self._task_iter(graph):
                # Config dataset ref is stored in task init outputs, but it
                # may also be missing.
                task_output_refs = graph.initOutputRefs(taskDef)
                if task_output_refs is None:
                    continue

                config_ref, old_config = self._find_dataset(task_output_refs, taskDef.configDatasetName)
                if config_ref is None:
                    continue

                if old_config is not None:
                    # Existing config must compare equal to the new one;
                    # mismatches are logged via the callback before raising.
                    if not taskDef.config.compare(old_config, shortcut=False, output=logConfigMismatch):
                        raise TypeError(
                            f"Config does not match existing task config {taskDef.configDatasetName!r} in "
                            "butler; tasks configurations must be consistent within the same run collection"
                        )
                else:
                    _LOG.debug(
                        "Saving Config for task=%s dataset type=%s", taskDef.label, taskDef.configDatasetName
                    )
                    self.butler.put(taskDef.config, config_ref)

    def savePackageVersions(self, graph: QuantumGraph) -> None:
        """Write versions of software packages to butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if existing object in butler is incompatible with new data.
        """
        packages = Packages.fromSystem()
        _LOG.debug("want to save packages: %s", packages)

        # start transaction to rollback any changes on exceptions
        with self.transaction():
            # Packages dataset ref is stored in graph's global init outputs,
            # but it may also be missing.

            packages_ref, old_packages = self._find_dataset(
                graph.globalInitOutputRefs(), PipelineDatasetTypes.packagesDatasetName
            )
            if packages_ref is None:
                return

            if old_packages is not None:
                # Note that because we can only detect python modules that have
                # been imported, the stored list of products may be more or
                # less complete than what we have now. What's important is
                # that the products that are in common have the same version.
                _compare_packages(old_packages, packages)
                # Update the old set of packages in case we have more packages
                # that haven't been persisted.
                extra = packages.extra(old_packages)
                if extra:
                    _LOG.debug("extra packages: %s", extra)
                    old_packages.update(packages)
                    # have to remove existing dataset first, butler has no
                    # replace option.
                    self.butler.pruneDatasets([packages_ref], unstore=True, purge=True)
                    self.butler.put(old_packages, packages_ref)
            else:
                self.butler.put(packages, packages_ref)

    def _find_dataset(
        self, refs: Iterable[DatasetRef], dataset_type: str
    ) -> tuple[DatasetRef | None, Any | None]:
        """Find a ref with a given dataset type name in a list of references
        and try to retrieve its data from butler.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to check for matching dataset type.
        dataset_type : `str`
            Name of a dataset type to look for.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Dataset reference or `None` if there is no matching dataset type.
        data : `Any`
            An existing object extracted from butler, `None` if ``ref`` is
            `None` or if there is no existing object for that reference.

        Raises
        ------
        ConflictingDefinitionError
            Raised if the dataset already exists in butler and ``extendRun``
            is `False`.
        """
        ref: DatasetRef | None = None
        # for/else: ``else`` runs only when no matching ref was found.
        for ref in refs:
            if ref.datasetType.name == dataset_type:
                break
        else:
            return None, None

        try:
            data = self.butler.get(ref)
            if data is not None and not self.extendRun:
                # It must not exist unless we are extending run.
                raise ConflictingDefinitionError(f"Dataset {ref} already exists in butler")
        except (LookupError, FileNotFoundError):
            # Missing dataset is not an error here; report it as absent.
            data = None
        return ref, data

    def _task_iter(self, graph: QuantumGraph) -> Iterator[TaskDef]:
        """Iterate over TaskDefs in a graph, return only tasks that have one or
        more associated quanta.
        """
        for taskDef in graph.iterTaskGraph():
            if graph.getNumberOfQuantaForTask(taskDef) > 0:
                yield taskDef

    @contextmanager
    def transaction(self) -> Iterator[None]:
        """Context manager for transaction.

        Default implementation has no transaction support.
        """
        yield

346 

347 

class PreExecInit(PreExecInitBase):
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible. If
        `False`, then any existing conflicting dataset will cause a butler
        exception to be raised.
    mock : `bool`, optional
        If `True` then also do initialization needed for pipeline mocking.

    Raises
    ------
    RuntimeError
        Raised if ``extendRun`` is `True` but the butler has no default
        output RUN collection.
    """

    def __init__(self, butler: Butler, taskFactory: TaskFactory, extendRun: bool = False, mock: bool = False):
        super().__init__(butler, taskFactory, extendRun)
        # Keep a full-Butler alias; the base class only assumes LimitedButler.
        self.full_butler = butler
        self.mock = mock
        if self.extendRun and self.full_butler.run is None:
            raise RuntimeError(
                "Cannot perform extendRun logic unless butler is initialized "
                "with a default output RUN collection."
            )

    @contextmanager
    def transaction(self) -> Iterator[None]:
        # docstring inherited
        with self.full_butler.transaction():
            yield

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        pipeline = graph.taskGraph
        pipelineDatasetTypes = PipelineDatasetTypes.fromPipeline(
            pipeline, registry=self.full_butler.registry, include_configs=True, include_packages=True
        )

        # Intermediates are both produced and consumed, so they must be
        # storage-class compatible in both directions (is_input=True).
        for datasetTypes, is_input in (
            (pipelineDatasetTypes.initIntermediates, True),
            (pipelineDatasetTypes.initOutputs, False),
            (pipelineDatasetTypes.intermediates, True),
            (pipelineDatasetTypes.outputs, False),
        ):
            self._register_output_dataset_types(registerDatasetTypes, datasetTypes, is_input)

        if self.mock:
            # register special mock data types, skip logs and metadata
            skipDatasetTypes = {taskDef.metadataDatasetName for taskDef in pipeline}
            skipDatasetTypes |= {taskDef.logOutputDatasetName for taskDef in pipeline}
            for datasetTypes, is_input in (
                (pipelineDatasetTypes.intermediates, True),
                (pipelineDatasetTypes.outputs, False),
            ):
                mockDatasetTypes = []
                for datasetType in datasetTypes:
                    if not (datasetType.name in skipDatasetTypes or datasetType.isComponent()):
                        mockDatasetTypes.append(
                            DatasetType(
                                MockButlerQuantumContext.mockDatasetTypeName(datasetType.name),
                                datasetType.dimensions,
                                "StructuredDataDict",
                            )
                        )
                if mockDatasetTypes:
                    self._register_output_dataset_types(registerDatasetTypes, mockDatasetTypes, is_input)

    def _register_output_dataset_types(
        self, registerDatasetTypes: bool, datasetTypes: Iterable[DatasetType], is_input: bool
    ) -> None:
        """Register output dataset types or check them against the registry.

        Parameters
        ----------
        registerDatasetTypes : `bool`
            If `True` then try to register each non-component dataset type,
            otherwise compare it with the registry definition.
        datasetTypes : `~collections.abc.Iterable` [ `DatasetType` ]
            Dataset types produced as outputs.
        is_input : `bool`
            If `True` these dataset types are also consumed as inputs, so
            storage classes must be compatible in both directions.

        Raises
        ------
        ValueError
            Raised if an existing registry definition differs from a dataset
            type in an incompatible way.
        KeyError
            Raised if ``registerDatasetTypes`` is `False` and one or more
            dataset types are not defined in the registry.
        """

        def _check_compatibility(datasetType: DatasetType, expected: DatasetType, is_input: bool) -> bool:
            # These are output dataset types so check for compatibility on put.
            is_compatible = expected.is_compatible_with(datasetType)

            if is_input:
                # This dataset type is also used for input so must be
                # compatible on get as well.
                is_compatible = is_compatible and datasetType.is_compatible_with(expected)

            if is_compatible:
                _LOG.debug(
                    "The dataset type configurations differ (%s from task != %s from registry) "
                    "but the storage classes are compatible. Can continue.",
                    datasetType,
                    expected,
                )
            return is_compatible

        missing_datasetTypes = set()
        for datasetType in datasetTypes:
            # Only composites are registered, no components, and by this point
            # the composite should already exist.
            if registerDatasetTypes and not datasetType.isComponent():
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # this is a no-op if it already exists and is consistent,
                # and it raises if it is inconsistent.
                try:
                    self.full_butler.registry.registerDatasetType(datasetType)
                except ConflictingDefinitionError:
                    # Registration conflict is acceptable if the storage
                    # classes are still compatible; otherwise re-raise.
                    if not _check_compatibility(
                        datasetType, self.full_butler.registry.getDatasetType(datasetType.name), is_input
                    ):
                        raise
            else:
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                try:
                    expected = self.full_butler.registry.getDatasetType(datasetType.name)
                except KeyError:
                    # Likely means that --register-dataset-types is forgotten.
                    # Collect all missing names so they can be reported in a
                    # single error below.
                    missing_datasetTypes.add(datasetType.name)
                    continue
                if expected != datasetType:
                    if not _check_compatibility(datasetType, expected, is_input):
                        raise ValueError(
                            f"DatasetType configuration does not match Registry: {datasetType} != {expected}"
                        )

        if missing_datasetTypes:
            plural = "s" if len(missing_datasetTypes) != 1 else ""
            raise KeyError(
                f"Missing dataset type definition{plural}: {', '.join(missing_datasetTypes)}. "
                "Dataset types have to be registered with either `butler register-dataset-type` or "
                "passing `--register-dataset-types` option to `pipetask run`."
            )

478 

479 

class PreExecInitLimited(PreExecInitBase):
    """Initialization of registry for QuantumGraph execution.

    This class works with LimitedButler and expects that all references in
    QuantumGraph are resolved.

    Parameters
    ----------
    butler : `~lsst.daf.butler.LimitedButler`
        Limited data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory):
        # A LimitedButler can never extend an existing run.
        super().__init__(butler, taskFactory, False)

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        # With LimitedButler we never create or check dataset types.
        pass