Coverage for python/lsst/ctrl/bps/drivers.py: 12% (192 statements)

coverage.py v7.3.3, created at 2023-12-20 17:34 +0000

# This file is part of ctrl_bps.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Driver functions for each subcommand.

Driver functions ensure that all setup work is done before running
the subcommand method.
"""


__all__ = [
    "acquire_qgraph_driver",
    "cluster_qgraph_driver",
    "transform_driver",
    "prepare_driver",
    "submit_driver",
    "report_driver",
    "restart_driver",
    "cancel_driver",
    "ping_driver",
]


import errno
import getpass
import logging
import os
import re
import shutil
from collections.abc import Iterable
from pathlib import Path

from lsst.pipe.base import Instrument
from lsst.utils import doImport
from lsst.utils.timer import time_this
from lsst.utils.usage import get_peak_mem_usage

from . import BPS_DEFAULTS, BPS_SEARCH_ORDER, DEFAULT_MEM_FMT, DEFAULT_MEM_UNIT, BpsConfig
from .bps_utils import _dump_env_info, _dump_pkg_info
from .cancel import cancel
from .ping import ping
from .pre_transform import acquire_quantum_graph, cluster_quanta
from .prepare import prepare
from .report import report
from .restart import restart
from .submit import submit
from .transform import transform

_LOG = logging.getLogger(__name__)


def _init_submission_driver(config_file, **kwargs):
    """Initialize the runtime environment.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Additional modifiers to the configuration.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Batch Processing Service configuration.
    """
    config = BpsConfig(config_file, BPS_SEARCH_ORDER)

    # Override config with command-line values.
    # Handle differences between pipetask argument names and bps YAML keys.
    translation = {
        "input": "inCollection",
        "output_run": "outputRun",
        "qgraph": "qgraphFile",
        "pipeline": "pipelineYaml",
        "wms_service": "wmsServiceClass",
        "compute_site": "computeSite",
    }
    for key, value in kwargs.items():
        # Don't want to override config with None or empty string values.
        if value:
            # The pipetask argument parser converts some values to lists,
            # but bps wants strings.
            if not isinstance(value, str) and isinstance(value, Iterable):
                value = ",".join(value)
            new_key = translation.get(key, re.sub(r"_(\S)", lambda match: match.group(1).upper(), key))
            config[f".bps_cmdline.{new_key}"] = value
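    # Illustration (keys/values here are examples, not from the source): a key
    # listed in `translation`, e.g. "compute_site", is stored as "computeSite",
    # while an unlisted snake_case key such as a hypothetical "extra_site_tag"
    # would become "extraSiteTag" via the regex fallback above.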

    # If the WMS service class was defined neither on the command line nor
    # explicitly in the config file, use the value provided by the environment
    # variable BPS_WMS_SERVICE_CLASS. If the variable is not set, stick to
    # the package default.
    wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", None)
    if wms_service is not None and "wmsServiceClass" not in config[".bps_cmdline"]:
        default_config = BpsConfig(BPS_DEFAULTS)
        if config["wmsServiceClass"] == default_config["wmsServiceClass"]:
            config["wmsServiceClass"] = wms_service
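    # Illustration (the class name is an example only): with
    # BPS_WMS_SERVICE_CLASS=lsst.ctrl.bps.htcondor.HTCondorService exported in
    # the shell, and wmsServiceClass neither given on the command line nor
    # changed from the package default in the submit YAML, the environment
    # variable wins.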

    # Set some initial values
    config[".bps_defined.timestamp"] = Instrument.makeCollectionTimestamp()
    if "operator" not in config:
        config[".bps_defined.operator"] = getpass.getuser()

    if "outCollection" in config:
        raise KeyError("outCollection is deprecated. Replace all outCollection references with outputRun.")

    if "outputRun" not in config:
        raise KeyError("Must specify the output run collection using outputRun")

    if "uniqProcName" not in config:
        config[".bps_defined.uniqProcName"] = config["outputRun"].replace("/", "_")
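    # Illustration (collection name is made up): an outputRun of
    # "u/jdoe/pipecheck/20231220T173400Z" yields the default uniqProcName
    # "u_jdoe_pipecheck_20231220T173400Z".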

    if "submitPath" not in config:
        raise KeyError("Must specify the submit-side run directory using submitPath")

    # If requested, run WMS plugin checks early in the submission process to
    # ensure the WMS has what it will need for prepare() or submit().
    if kwargs.get("runWmsSubmissionChecks", False):
        found, wms_class = config.search("wmsServiceClass")
        if not found:
            raise KeyError("Missing wmsServiceClass in bps config. Aborting.")

        # Check that the WMS service class can be imported.
        wms_service_class = doImport(wms_class)
        wms_service = wms_service_class(config)

        try:
            wms_service.run_submission_checks()
        except NotImplementedError:
            # Plugins only need to implement this method when they have extra
            # checks to run.
            _LOG.debug("run_submission_checks is not implemented in %s.", wms_class)
    else:
        _LOG.debug("Skipping submission checks.")

    # Make the submit directory to contain all outputs.
    submit_path = Path(config["submitPath"])
    try:
        submit_path.mkdir(parents=True, exist_ok=False)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            reason = "Directory already exists"
        else:
            reason = exc.strerror
        raise type(exc)(f"cannot create submit directory '{submit_path}': {reason}") from None
    config[".bps_defined.submitPath"] = str(submit_path)
    print(f"Submit dir: {submit_path}")

    # Save copies of the configuration (original and expanded).
    shutil.copy2(config_file, submit_path)
    with open(f"{submit_path}/{config['uniqProcName']}_config.yaml", "w") as fh:
        config.dump(fh)

    # Dump information about the runtime environment and software versions in use.
    _dump_env_info(f"{submit_path}/{config['uniqProcName']}.env.info.yaml")
    _dump_pkg_info(f"{submit_path}/{config['uniqProcName']}.pkg.info.yaml")
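    # At this point the submit directory holds a copy of the original submit
    # YAML plus <uniqProcName>_config.yaml (the expanded config),
    # <uniqProcName>.env.info.yaml, and <uniqProcName>.pkg.info.yaml, where
    # <uniqProcName> stands for the value computed above.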

    return config


def acquire_qgraph_driver(config_file, **kwargs):
    """Read a quantum graph from a file or create one from a pipeline
    definition.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Additional modifiers to the configuration.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        A graph representing quanta.
    """
    _LOG.info("Initializing execution environment")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Initializing execution environment completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        config = _init_submission_driver(config_file, **kwargs)
        submit_path = config[".bps_defined.submitPath"]
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )

    _LOG.info("Starting acquire stage (generating and/or reading quantum graph)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Acquire stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        qgraph_file, qgraph, execution_butler_dir = acquire_quantum_graph(config, out_prefix=submit_path)
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )

    # When using QBB (and neither 'executionButlerTemplate' nor
    # 'executionButlerDir' is set) acquire_quantum_graph() will set
    # 'execution_butler_dir' to the submit directory. This will trick
    # 'ctrl_bps_parsl' into using a non-existent execution butler and the run
    # will fail. See ParslJob.get_command_line() for details.
    #
    # This simple trick should keep 'ctrl_bps_parsl' working for the time
    # being without making more complex changes in the logic, which will be
    # removed soon anyway (see DM-40342).
    if os.path.normpath(execution_butler_dir) != os.path.normpath(submit_path):
        config[".bps_defined.executionButlerDir"] = execution_butler_dir
    config[".bps_defined.runQgraphFile"] = qgraph_file
    return config, qgraph

def cluster_qgraph_driver(config_file, **kwargs):
    """Group quanta into clusters.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Additional modifiers to the configuration.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    clustered_qgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        A graph representing clustered quanta.
    """
    config, qgraph = acquire_qgraph_driver(config_file, **kwargs)

    _LOG.info("Starting cluster stage (grouping quanta into jobs)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Cluster stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        clustered_qgraph = cluster_quanta(config, qgraph, config["uniqProcName"])
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )
    _LOG.info("ClusteredQuantumGraph contains %d cluster(s)", len(clustered_qgraph))

    submit_path = config[".bps_defined.submitPath"]
    _, save_clustered_qgraph = config.search("saveClusteredQgraph", opt={"default": False})
    if save_clustered_qgraph:
        clustered_qgraph.save(os.path.join(submit_path, "bps_clustered_qgraph.pickle"))
    _, save_dot = config.search("saveDot", opt={"default": False})
    if save_dot:
        clustered_qgraph.draw(os.path.join(submit_path, "bps_clustered_qgraph.dot"))
    return config, clustered_qgraph

def transform_driver(config_file, **kwargs):
    """Create a workflow for a specific workflow management system.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Additional modifiers to the configuration.

    Returns
    -------
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Representation of the abstract/scientific workflow, independent of any
        particular workflow management system.
    """
    config, clustered_qgraph = cluster_qgraph_driver(config_file, **kwargs)
    submit_path = config[".bps_defined.submitPath"]

    _LOG.info("Starting transform stage (creating generic workflow)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Transform stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        generic_workflow, generic_workflow_config = transform(config, clustered_qgraph, submit_path)
        _LOG.info("Generic workflow name '%s'", generic_workflow.name)
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )
    num_jobs = sum(generic_workflow.job_counts.values())
    _LOG.info("GenericWorkflow contains %d job(s) (including final)", num_jobs)

    _, save_workflow = config.search("saveGenericWorkflow", opt={"default": False})
    if save_workflow:
        with open(os.path.join(submit_path, "bps_generic_workflow.pickle"), "wb") as outfh:
            generic_workflow.save(outfh, "pickle")
    _, save_dot = config.search("saveDot", opt={"default": False})
    if save_dot:
        with open(os.path.join(submit_path, "bps_generic_workflow.dot"), "w") as outfh:
            generic_workflow.draw(outfh, "dot")
    return generic_workflow_config, generic_workflow

def prepare_driver(config_file, **kwargs):
    """Create a representation of the generic workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Additional modifiers to the configuration.

    Returns
    -------
    wms_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    kwargs.setdefault("runWmsSubmissionChecks", True)
    generic_workflow_config, generic_workflow = transform_driver(config_file, **kwargs)
    submit_path = generic_workflow_config[".bps_defined.submitPath"]

    _LOG.info("Starting prepare stage (creating specific implementation of workflow)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Prepare stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        wms_workflow = prepare(generic_workflow_config, generic_workflow, submit_path)
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )

    wms_workflow_config = generic_workflow_config
    return wms_workflow_config, wms_workflow

def submit_driver(config_file, **kwargs):
    """Submit workflow for execution.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Additional modifiers to the configuration.
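
    Examples
    --------
    A minimal sketch of calling the driver programmatically (the YAML file
    name and compute site are illustrative, not defaults)::

        from lsst.ctrl.bps.drivers import submit_driver

        submit_driver("bps_submit.yaml", compute_site="example_site")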

    """
    kwargs.setdefault("runWmsSubmissionChecks", True)

    _LOG.info(
        "DISCLAIMER: All values regarding memory consumption reported below are approximate and may "
        "not accurately reflect actual memory usage by the bps process."
    )

    _LOG.info("Starting submission process")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Completed entire submission process",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        wms_workflow_config, wms_workflow = prepare_driver(config_file, **kwargs)

        _LOG.info("Starting submit stage")
        with time_this(
            log=_LOG,
            level=logging.INFO,
            prefix=None,
            msg="Completed submit stage",
            mem_usage=True,
            mem_unit=DEFAULT_MEM_UNIT,
            mem_fmt=DEFAULT_MEM_FMT,
        ):
            submit(wms_workflow_config, wms_workflow)
            _LOG.info("Run '%s' submitted for execution with id '%s'", wms_workflow.name, wms_workflow.run_id)
        if _LOG.isEnabledFor(logging.INFO):
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
            )

    print(f"Run Id: {wms_workflow.run_id}")
    print(f"Run Name: {wms_workflow.name}")

def restart_driver(wms_service, run_id):
    """Restart a failed workflow.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.
    run_id : `str`
        ID or path of the workflow that needs to be restarted.
    """
    if wms_service is None:
        default_config = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", default_config["wmsServiceClass"])

    new_run_id, run_name, message = restart(wms_service, run_id)
    if new_run_id is not None:
        path = Path(run_id)
        if path.exists():
            _dump_env_info(f"{run_id}/{run_name}.env.info.yaml")
            _dump_pkg_info(f"{run_id}/{run_name}.pkg.info.yaml")
        print(f"Run Id: {new_run_id}")
        print(f"Run Name: {run_name}")
    else:
        if message:
            print(f"Restart failed: {message}")
        else:
            print("Restart failed: Unknown error")

def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False, return_exit_codes=False):
    """Print out a summary of the jobs submitted for execution.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.
    run_id : `str`
        A run id the report will be restricted to.
    user : `str`
        A user name the report will be restricted to.
    hist_days : `int`
        Number of days.
    pass_thru : `str`
        A string to pass directly to the WMS service class.
    is_global : `bool`, optional
        If set, all available job queues will be queried for job information.
        Defaults to False, which means that only a local job queue will be
        queried for information.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    return_exit_codes : `bool`, optional
        If set, return exit codes related to jobs with a
        non-success status. Defaults to False, which means that only
        the summary state is returned.

        Only applicable in the context of a WMS with associated
        handlers to return exit codes from jobs.
    """
    if wms_service is None:
        default_config = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", default_config["wmsServiceClass"])
    report(
        wms_service,
        run_id,
        user,
        hist_days,
        pass_thru,
        is_global=is_global,
        return_exit_codes=return_exit_codes,
    )

def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):
    """Cancel submitted workflows.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.
    run_id : `str`
        ID or path of job that should be canceled.
    user : `str`
        User whose submitted jobs should be canceled.
    require_bps : `bool`
        Whether to require given run_id/user to be a bps submitted job.
    pass_thru : `str`
        Information to pass through to WMS.
    is_global : `bool`, optional
        If set, all available job queues will be checked for jobs to cancel.
        Defaults to False, which means that only a local job queue will be
        checked.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        default_config = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", default_config["wmsServiceClass"])
    cancel(wms_service, run_id, user, require_bps, pass_thru, is_global=is_global)

def ping_driver(wms_service=None, pass_thru=None):
    """Check whether the WMS services are up and reachable, and whether any
    needed authentication succeeds.

    The services checked are those needed for submit, report, cancel, and
    restart, but ping cannot guarantee that jobs would actually run
    successfully.

    Parameters
    ----------
    wms_service : `str`, optional
        Name of the Workload Management System service class.
    pass_thru : `str`, optional
        Information to pass through to WMS.

    Returns
    -------
    success : `int`
        Whether services are up and usable (0) or not (non-zero).
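
    Examples
    --------
    A sketch of acting on the return value (the plugin class name is
    illustrative; pass any installed WMS service class)::

        if ping_driver("lsst.ctrl.bps.htcondor.HTCondorService") != 0:
            raise RuntimeError("WMS services are not usable")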

    """
    if wms_service is None:
        default_config = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", default_config["wmsServiceClass"])
    status, message = ping(wms_service, pass_thru)

    if message:
        if not status:
            _LOG.info(message)
        else:
            _LOG.error(message)

    # Log overall status message
    if not status:
        _LOG.info("Ping successful.")
    else:
        _LOG.error("Ping failed (%d).", status)

    return status