Coverage for python/lsst/ctrl/mpexec/preExecInit.py: 18%

203 statements  

coverage.py v6.5.0, created at 2023-02-01 10:07 +0000

# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["PreExecInit"]

# -------------------------------
# Imports of standard modules --
# -------------------------------
import abc
import logging
from collections.abc import Iterable, Iterator
from contextlib import contextmanager
from typing import TYPE_CHECKING, Any

# -----------------------------
# Imports for other modules --
# -----------------------------
from lsst.daf.butler import DataCoordinate, DatasetIdFactory, DatasetRef, DatasetType
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.pipe.base import PipelineDatasetTypes
from lsst.utils.packages import Packages

from .mock_task import MockButlerQuantumContext


if TYPE_CHECKING:

    from lsst.daf.butler import Butler, LimitedButler
    from lsst.pipe.base import QuantumGraph, TaskDef, TaskFactory

_LOG = logging.getLogger(__name__)


class MissingReferenceError(Exception):
    """Exception raised when a resolved reference is missing from the graph."""

    pass


def _compare_packages(old_packages: Packages, new_packages: Packages) -> None:
    """Compare two versions of Packages.

    Parameters
    ----------
    old_packages : `Packages`
        Previously recorded package versions.
    new_packages : `Packages`
        New set of package versions.

    Raises
    ------
    TypeError
        Raised if any package appears in both sets with different versions.
    """
    diff = new_packages.difference(old_packages)
    if diff:
        versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
        raise TypeError(f"Package versions mismatch: ({versions_str})")
    else:
        _LOG.debug("new packages are consistent with old")



class PreExecInitBase(abc.ABC):
    """Common part of the implementation of PreExecInit classes that does not
    depend on the Butler type.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory):
        self.butler = butler
        self.taskFactory = taskFactory

    def initialize(
        self,
        graph: QuantumGraph,
        saveInitOutputs: bool = True,
        registerDatasetTypes: bool = False,
        saveVersions: bool = True,
    ) -> None:
        """Perform all initialization steps.

        Convenience method that executes all initialization steps. Instead of
        calling this method and providing all options, the individual methods
        can also be called directly.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must already be registered.
        saveVersions : `bool`, optional
            If ``False`` then do not save package versions even if
            ``saveInitOutputs`` is set to ``True``.
        """
        # register dataset types or check consistency
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save.
        if saveInitOutputs:
            self.saveInitOutputs(graph)
            self.saveConfigs(graph)
            if saveVersions:
                self.savePackageVersions(graph)


    @abc.abstractmethod
    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must already be registered.

        Raises
        ------
        ValueError
            Raised if an existing DatasetType differs from the DatasetType
            in the graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and a DatasetType
            does not exist in registry.
        """
        raise NotImplementedError()


    def saveInitOutputs(self, graph: QuantumGraph) -> None:
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if the type of an existing object in butler is different
            from the type of the new data.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        for taskDef in graph.iterTaskGraph():
            init_input_refs = self.find_init_input_refs(taskDef, graph)
            task = self.taskFactory.makeTask(taskDef, self.butler, init_input_refs)
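            # Constructing the task also creates its init-output objects,
            # which are accessed below as task attributes named after the
            # corresponding connections.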

            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                obj_from_store, init_output_ref = self.find_init_output(taskDef, attribute.name, graph)
                if init_output_ref is None:
                    raise ValueError(f"Cannot find or make dataset reference for init output {name}")
                init_output_var = getattr(task, name)

                if obj_from_store is not None:
                    _LOG.debug(
                        "Retrieving InitOutputs for task=%s key=%s dsTypeName=%s", task, name, attribute.name
                    )
                    obj_from_store = self.butler.getDirect(init_output_ref)
                    # Types are supposed to be identical.
                    # TODO: Check that object contents are identical too.
                    if type(obj_from_store) is not type(init_output_var):
                        raise TypeError(
                            f"Stored initOutput object type {type(obj_from_store)} "
                            f"is different from task-generated type "
                            f"{type(init_output_var)} for task {taskDef}"
                        )
                else:
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", taskDef.label, name)
                    # This can still raise if there is a concurrent write.
                    self.butler.putDirect(init_output_var, init_output_ref)


    def saveConfigs(self, graph: QuantumGraph) -> None:
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if an existing object in butler is different from the new
            data.
        Exception
            Raised if ``extendRun`` is `False` and a dataset already exists.
            The contents of the butler collection are not changed if an
            exception is raised.
        """

        def logConfigMismatch(msg: str) -> None:
            """Log messages about configuration mismatch."""
            _LOG.fatal("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # start transaction to rollback any changes on exceptions
        with self.transaction():
            for taskDef in graph.iterTaskGraph():
                config_name = taskDef.configDatasetName

                old_config, dataset_ref = self.find_init_output(taskDef, config_name, graph)

                if old_config is not None:
                    if not taskDef.config.compare(old_config, shortcut=False, output=logConfigMismatch):
                        raise TypeError(
                            f"Config does not match existing task config {config_name!r} in "
                            "butler; task configurations must be consistent within the same run collection"
                        )
                else:
                    # butler will raise an exception if the dataset is already there
                    _LOG.debug("Saving Config for task=%s dataset type=%s", taskDef.label, config_name)
                    self.butler.putDirect(taskDef.config, dataset_ref)


    def savePackageVersions(self, graph: QuantumGraph) -> None:
        """Write versions of software packages to butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if existing object in butler is incompatible with new data.
        """
        packages = Packages.fromSystem()
        _LOG.debug("want to save packages: %s", packages)

        # start transaction to rollback any changes on exceptions
        with self.transaction():

            old_packages, dataset_ref = self.find_packages(graph)

            if old_packages is not None:
                # Note that because we can only detect python modules that
                # have been imported, the stored list of products may be more
                # or less complete than what we have now. What's important is
                # that the products that are in common have the same version.
                _compare_packages(old_packages, packages)
                # Update the old set of packages in case we have more packages
                # that haven't been persisted.
                extra = packages.extra(old_packages)
                if extra:
                    _LOG.debug("extra packages: %s", extra)
                    old_packages.update(packages)
                    # have to remove the existing dataset first; butler has no
                    # replace option.
                    self.butler.pruneDatasets([dataset_ref], unstore=True, purge=True)
                    self.butler.putDirect(old_packages, dataset_ref)
            else:
                self.butler.putDirect(packages, dataset_ref)


    @abc.abstractmethod
    def find_init_input_refs(self, taskDef: TaskDef, graph: QuantumGraph) -> Iterable[DatasetRef]:
        """Return the list of resolved dataset references for task init
        inputs.

        Parameters
        ----------
        taskDef : `~lsst.pipe.base.TaskDef`
            Pipeline task definition.
        graph : `~lsst.pipe.base.QuantumGraph`
            Quantum graph.

        Returns
        -------
        refs : `~collections.abc.Iterable` [`~lsst.daf.butler.DatasetRef`]
            Resolved dataset references.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def find_init_output(
        self, taskDef: TaskDef, dataset_type: str, graph: QuantumGraph
    ) -> tuple[Any | None, DatasetRef]:
        """Find task init output for given dataset type.

        Parameters
        ----------
        taskDef : `~lsst.pipe.base.TaskDef`
            Pipeline task definition.
        dataset_type : `str`
            Dataset type name.
        graph : `~lsst.pipe.base.QuantumGraph`
            Quantum graph.

        Returns
        -------
        data
            Existing init output object retrieved from butler, `None` if
            butler has no existing object.
        ref : `~lsst.daf.butler.DatasetRef`
            Resolved reference for init output to be stored in butler.

        Raises
        ------
        MissingReferenceError
            Raised if the reference cannot be found or generated.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def find_packages(self, graph: QuantumGraph) -> tuple[Packages | None, DatasetRef]:
        """Find packages information.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Quantum graph.

        Returns
        -------
        packages : `lsst.utils.packages.Packages` or `None`
            Existing packages data retrieved from butler, or `None`.
        ref : `~lsst.daf.butler.DatasetRef`
            Resolved reference for packages to be stored in butler.

        Raises
        ------
        MissingReferenceError
            Raised if the reference cannot be found or generated.
        """
        raise NotImplementedError()

    @contextmanager
    def transaction(self) -> Iterator[None]:
        """Context manager for transaction.

        The default implementation has no transaction support.
        """
        yield



class PreExecInit(PreExecInitBase):
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible. If
        `False`, then any existing conflicting dataset will cause a butler
        exception to be raised.
    mock : `bool`, optional
        If `True` then also do initialization needed for pipeline mocking.
    """

    def __init__(self, butler: Butler, taskFactory: TaskFactory, extendRun: bool = False, mock: bool = False):
        super().__init__(butler, taskFactory)
        self.full_butler = butler
        self.extendRun = extendRun
        self.mock = mock
        if self.extendRun and self.full_butler.run is None:
            raise RuntimeError(
                "Cannot perform extendRun logic unless butler is initialized "
                "with a default output RUN collection."
            )

    @contextmanager
    def transaction(self) -> Iterator[None]:
        # docstring inherited
        with self.full_butler.transaction():
            yield


    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        pipeline = graph.taskGraph
        pipelineDatasetTypes = PipelineDatasetTypes.fromPipeline(
            pipeline, registry=self.full_butler.registry, include_configs=True, include_packages=True
        )
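        # Intermediate dataset types are both written and read during
        # execution, so they must be compatible in both directions; pure
        # outputs only need to be compatible for "put".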

        for datasetTypes, is_input in (
            (pipelineDatasetTypes.initIntermediates, True),
            (pipelineDatasetTypes.initOutputs, False),
            (pipelineDatasetTypes.intermediates, True),
            (pipelineDatasetTypes.outputs, False),
        ):
            self._register_output_dataset_types(registerDatasetTypes, datasetTypes, is_input)

        if self.mock:
            # register special mock dataset types, skipping logs and metadata
            skipDatasetTypes = {taskDef.metadataDatasetName for taskDef in pipeline}
            skipDatasetTypes |= {taskDef.logOutputDatasetName for taskDef in pipeline}
            for datasetTypes, is_input in (
                (pipelineDatasetTypes.intermediates, True),
                (pipelineDatasetTypes.outputs, False),
            ):
                mockDatasetTypes = []
                for datasetType in datasetTypes:
                    if not (datasetType.name in skipDatasetTypes or datasetType.isComponent()):
                        mockDatasetTypes.append(
                            DatasetType(
                                MockButlerQuantumContext.mockDatasetTypeName(datasetType.name),
                                datasetType.dimensions,
                                "StructuredDataDict",
                            )
                        )
                if mockDatasetTypes:
                    self._register_output_dataset_types(registerDatasetTypes, mockDatasetTypes, is_input)


    def _register_output_dataset_types(
        self, registerDatasetTypes: bool, datasetTypes: Iterable[DatasetType], is_input: bool
    ) -> None:
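        """Register the given dataset types with the registry, or check that
        existing definitions are compatible with what the tasks will produce.
        """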

        def _check_compatibility(datasetType: DatasetType, expected: DatasetType, is_input: bool) -> bool:
            # These are output dataset types so check for compatibility on put.
            is_compatible = expected.is_compatible_with(datasetType)

            if is_input:
                # This dataset type is also used for input so must be
                # compatible on get as well.
                is_compatible = is_compatible and datasetType.is_compatible_with(expected)

            if is_compatible:
                _LOG.debug(
                    "The dataset type configurations differ (%s from task != %s from registry) "
                    "but the storage classes are compatible. Can continue.",
                    datasetType,
                    expected,
                )
            return is_compatible

        missing_datasetTypes = set()
        for datasetType in datasetTypes:
            # Only composites are registered, no components, and by this point
            # the composite should already exist.
            if registerDatasetTypes and not datasetType.isComponent():
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # this is a no-op if it already exists and is consistent,
                # and it raises if it is inconsistent.
                try:
                    self.full_butler.registry.registerDatasetType(datasetType)
                except ConflictingDefinitionError:
                    if not _check_compatibility(
                        datasetType, self.full_butler.registry.getDatasetType(datasetType.name), is_input
                    ):
                        raise
            else:
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                try:
                    expected = self.full_butler.registry.getDatasetType(datasetType.name)
                except KeyError:
                    # Likely means that --register-dataset-types was forgotten.
                    missing_datasetTypes.add(datasetType.name)
                    continue
                if expected != datasetType:
                    if not _check_compatibility(datasetType, expected, is_input):
                        raise ValueError(
                            f"DatasetType configuration does not match Registry: {datasetType} != {expected}"
                        )

        if missing_datasetTypes:
            plural = "s" if len(missing_datasetTypes) != 1 else ""
            raise KeyError(
                f"Missing dataset type definition{plural}: {', '.join(missing_datasetTypes)}. "
                "Dataset types have to be registered with either `butler register-dataset-type` or "
                "passing `--register-dataset-types` option to `pipetask run`."
            )


    def find_init_input_refs(self, taskDef: TaskDef, graph: QuantumGraph) -> Iterable[DatasetRef]:
        # docstring inherited
        refs: list[DatasetRef] = []
        for name in taskDef.connections.initInputs:
            attribute = getattr(taskDef.connections, name)
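            # Init datasets are dimensionless, so an empty data ID is used
            # for the registry lookup below.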

            dataId = DataCoordinate.makeEmpty(self.full_butler.dimensions)
            dataset_type = DatasetType(attribute.name, graph.universe.empty, attribute.storageClass)
            ref = self.full_butler.registry.findDataset(dataset_type, dataId)
            if ref is None:
                raise ValueError(f"InitInput does not exist in butler for dataset type {dataset_type}")
            refs.append(ref)
        return refs

    def find_init_output(
        self, taskDef: TaskDef, dataset_type_name: str, graph: QuantumGraph
    ) -> tuple[Any | None, DatasetRef]:
        # docstring inherited
        dataset_type = self.full_butler.registry.getDatasetType(dataset_type_name)
        dataId = DataCoordinate.makeEmpty(self.full_butler.dimensions)
        return self._find_existing(dataset_type, dataId)

    def find_packages(self, graph: QuantumGraph) -> tuple[Packages | None, DatasetRef]:
        # docstring inherited
        dataset_type = self.full_butler.registry.getDatasetType(PipelineDatasetTypes.packagesDatasetName)
        dataId = DataCoordinate.makeEmpty(self.full_butler.dimensions)
        return self._find_existing(dataset_type, dataId)


    def _find_existing(
        self, dataset_type: DatasetType, dataId: DataCoordinate
    ) -> tuple[Any | None, DatasetRef]:
        """Make a reference for a dataset of the given type and try to
        retrieve the dataset from butler. If it is not found, generate a new
        resolved reference.
        """
        run = self.full_butler.run
        assert run is not None

        ref = self.full_butler.registry.findDataset(dataset_type, dataId, collections=[run])
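        # When extending the run, reuse an existing dataset (if any) so that
        # its contents can be compared with the new object by the caller.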

        if self.extendRun and ref is not None:
            try:
                data = self.butler.getDirect(ref)
                return data, ref
            except (LookupError, FileNotFoundError):
                return None, ref
        else:
            # make a new resolved dataset ref
            ref = DatasetRef(dataset_type, dataId)
            ref = DatasetIdFactory().resolveRef(ref, run)
            return None, ref



class PreExecInitLimited(PreExecInitBase):
    """Initialization of registry for QuantumGraph execution.

    This class works with LimitedButler and expects that all references in
    QuantumGraph are resolved.

    Parameters
    ----------
    butler : `~lsst.daf.butler.LimitedButler`
        Limited data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory):
        super().__init__(butler, taskFactory)

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        # With LimitedButler we never create or check dataset types.
        pass

    def find_init_input_refs(self, taskDef: TaskDef, graph: QuantumGraph) -> Iterable[DatasetRef]:
        # docstring inherited
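        # Init-input references are stored in the graph itself and are
        # expected to be resolved already.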

        return graph.initInputRefs(taskDef) or []

    def find_init_output(
        self, taskDef: TaskDef, dataset_type: str, graph: QuantumGraph
    ) -> tuple[Any | None, DatasetRef]:
        # docstring inherited
        return self._find_existing(graph.initOutputRefs(taskDef) or [], dataset_type)

    def find_packages(self, graph: QuantumGraph) -> tuple[Packages | None, DatasetRef]:
        # docstring inherited
        return self._find_existing(graph.globalInitOutputRefs(), PipelineDatasetTypes.packagesDatasetName)

    def _find_existing(self, refs: Iterable[DatasetRef], dataset_type: str) -> tuple[Any | None, DatasetRef]:
        """Find a reference with the given dataset type name in the list of
        references and try to retrieve its dataset from butler.
        """
        for ref in refs:
            if ref.datasetType.name == dataset_type:
                try:
                    data = self.butler.getDirect(ref)
                    return data, ref
                except (LookupError, FileNotFoundError):
                    return None, ref
        raise MissingReferenceError(f"Failed to find reference for dataset type {dataset_type}")