Coverage for python/lsst/ctrl/bps/drivers.py: 12% (192 statements)
coverage.py v7.3.2, created at 2023-12-07 17:21 +0000

# This file is part of ctrl_bps.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Driver functions for each subcommand.

Driver functions ensure that all setup work is done before running
the subcommand method.
"""

__all__ = [
    "acquire_qgraph_driver",
    "cluster_qgraph_driver",
    "transform_driver",
    "prepare_driver",
    "submit_driver",
    "report_driver",
    "restart_driver",
    "cancel_driver",
    "ping_driver",
]


import errno
import getpass
import logging
import os
import re
import shutil
from collections.abc import Iterable
from pathlib import Path

from lsst.pipe.base import Instrument
from lsst.utils import doImport
from lsst.utils.timer import time_this
from lsst.utils.usage import get_peak_mem_usage

from . import BPS_DEFAULTS, BPS_SEARCH_ORDER, DEFAULT_MEM_FMT, DEFAULT_MEM_UNIT, BpsConfig
from .bps_utils import _dump_env_info, _dump_pkg_info
from .cancel import cancel
from .ping import ping
from .pre_transform import acquire_quantum_graph, cluster_quanta
from .prepare import prepare
from .report import report
from .restart import restart
from .submit import submit
from .transform import transform

_LOG = logging.getLogger(__name__)
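# Note on structure: the submission drivers build on one another.
# submit_driver() calls prepare_driver(), which calls transform_driver(),
# which calls cluster_qgraph_driver(), which calls acquire_qgraph_driver().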


def _init_submission_driver(config_file, **kwargs):
    """Initialize runtime environment.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Batch Processing Service configuration.
    """
    config = BpsConfig(config_file, BPS_SEARCH_ORDER)

    # Override config with command-line values.
    # Handle differences between pipetask argument names and bps yaml keys.
    translation = {
        "input": "inCollection",
        "output_run": "outputRun",
        "qgraph": "qgraphFile",
        "pipeline": "pipelineYaml",
        "wms_service": "wmsServiceClass",
        "compute_site": "computeSite",
    }
    for key, value in kwargs.items():
        # Don't want to override config with None or empty string values.
        if value:
            # pipetask argument parser converts some values to a list,
            # but bps wants a string.
            if not isinstance(value, str) and isinstance(value, Iterable):
                value = ",".join(value)
            new_key = translation.get(key, re.sub(r"_(\S)", lambda match: match.group(1).upper(), key))
            config[f".bps_cmdline.{new_key}"] = value
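    # Note: command-line keys without an entry in the translation table above
    # are stored using their snake_case names converted to camelCase (for
    # example, a name like "extra_args" would be stored as "extraArgs").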

    # If the WMS service class was defined neither on the command line nor
    # explicitly in the config file, use the value provided by the environment
    # variable BPS_WMS_SERVICE_CLASS. If the variable is not set, stick to
    # the package default.
    wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", None)
    if wms_service is not None and "wmsServiceClass" not in config[".bps_cmdline"]:
        default_config = BpsConfig(BPS_DEFAULTS)
        if config["wmsServiceClass"] == default_config["wmsServiceClass"]:
            config["wmsServiceClass"] = wms_service

    # Set some initial values.
    config[".bps_defined.timestamp"] = Instrument.makeCollectionTimestamp()
    if "operator" not in config:
        config[".bps_defined.operator"] = getpass.getuser()

    if "outCollection" in config:
        raise KeyError("outCollection is deprecated. Replace all outCollection references with outputRun.")

    if "outputRun" not in config:
        raise KeyError("Must specify the output run collection using outputRun")
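    # uniqProcName is used below to name the submit-side files; by default it
    # is the output run collection with "/" replaced by "_".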

    if "uniqProcName" not in config:
        config[".bps_defined.uniqProcName"] = config["outputRun"].replace("/", "_")

    if "submitPath" not in config:
        raise KeyError("Must specify the submit-side run directory using submitPath")

    # If requested, run WMS plugin checks early in the submission process to
    # ensure the WMS has what it will need for prepare() or submit().
    if kwargs.get("runWmsSubmissionChecks", False):
        found, wms_class = config.search("wmsServiceClass")
        if not found:
            raise KeyError("Missing wmsServiceClass in bps config. Aborting.")

        # Check that the wms service class can be imported.
        wms_service_class = doImport(wms_class)
        wms_service = wms_service_class(config)

        try:
            wms_service.run_submission_checks()
        except NotImplementedError:
            # Allow various plugins to implement extra checks only when
            # needed.
            _LOG.debug("run_submission_checks is not implemented in %s.", wms_class)
    else:
        _LOG.debug("Skipping submission checks.")

    # Make submit directory to contain all outputs.
    submit_path = Path(config["submitPath"])
    try:
        submit_path.mkdir(parents=True, exist_ok=False)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            reason = "Directory already exists"
        else:
            reason = exc.strerror
        raise type(exc)(f"cannot create submit directory '{submit_path}': {reason}") from None
    config[".bps_defined.submitPath"] = str(submit_path)
    print(f"Submit dir: {submit_path}")

    # Save copies of the configs (original and expanded).
    shutil.copy2(config_file, submit_path)
    with open(f"{submit_path}/{config['uniqProcName']}_config.yaml", "w") as fh:
        config.dump(fh)

    # Dump information about runtime environment and software versions in use.
    _dump_env_info(f"{submit_path}/{config['uniqProcName']}.env.info.yaml")
    _dump_pkg_info(f"{submit_path}/{config['uniqProcName']}.pkg.info.yaml")

    return config


def acquire_qgraph_driver(config_file, **kwargs):
    """Read a quantum graph from a file or create one from a pipeline definition.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        A graph representing quanta.
    """
    _LOG.info("Initializing execution environment")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Initializing execution environment completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        config = _init_submission_driver(config_file, **kwargs)
        submit_path = config[".bps_defined.submitPath"]
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )

    _LOG.info("Starting acquire stage (generating and/or reading quantum graph)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Acquire stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        qgraph_file, qgraph, execution_butler_dir = acquire_quantum_graph(config, out_prefix=submit_path)
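        # acquire_quantum_graph() either reads an existing quantum graph or
        # generates a new one, writing its outputs under the submit directory
        # (out_prefix).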

    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )

    # When using QBB (and neither 'executionButlerTemplate' nor
    # 'executionButlerDir' is set), acquire_quantum_graph() will set
    # 'execution_butler_dir' to the submit directory. This would trick
    # 'ctrl_bps_parsl' into using a non-existent execution butler and the run
    # would fail. See ParslJob.get_command_line() for details.
    #
    # This simple check should keep 'ctrl_bps_parsl' working for the time
    # being without making more complex changes to logic which will be removed
    # soon anyway (see DM-40342).
    if os.path.normpath(execution_butler_dir) != os.path.normpath(submit_path):
        config[".bps_defined.executionButlerDir"] = execution_butler_dir
    config[".bps_defined.runQgraphFile"] = qgraph_file
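    # Like the other ".bps_defined" entries, these values are computed at
    # runtime and are read back from the config by the later driver stages.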

    return config, qgraph


def cluster_qgraph_driver(config_file, **kwargs):
    """Group quanta into clusters.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    clustered_qgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        A graph representing clustered quanta.
    """
    config, qgraph = acquire_qgraph_driver(config_file, **kwargs)

    _LOG.info("Starting cluster stage (grouping quanta into jobs)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Cluster stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        clustered_qgraph = cluster_quanta(config, qgraph, config["uniqProcName"])
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )
    _LOG.info("ClusteredQuantumGraph contains %d cluster(s)", len(clustered_qgraph))

    submit_path = config[".bps_defined.submitPath"]
    _, save_clustered_qgraph = config.search("saveClusteredQgraph", opt={"default": False})
    if save_clustered_qgraph:
        clustered_qgraph.save(os.path.join(submit_path, "bps_clustered_qgraph.pickle"))
    _, save_dot = config.search("saveDot", opt={"default": False})
    if save_dot:
        clustered_qgraph.draw(os.path.join(submit_path, "bps_clustered_qgraph.dot"))
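    # The pickle and dot files above are optional debugging outputs written to
    # the submit directory, controlled by the saveClusteredQgraph and saveDot
    # settings.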

    return config, clustered_qgraph


def transform_driver(config_file, **kwargs):
    """Create a workflow for a specific workflow management system.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic representation of the abstract/scientific workflow.
    """
    config, clustered_qgraph = cluster_qgraph_driver(config_file, **kwargs)
    submit_path = config[".bps_defined.submitPath"]

    _LOG.info("Starting transform stage (creating generic workflow)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Transform stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        generic_workflow, generic_workflow_config = transform(config, clustered_qgraph, submit_path)
        _LOG.info("Generic workflow name '%s'", generic_workflow.name)
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )
    num_jobs = sum(generic_workflow.job_counts.values())
    _LOG.info("GenericWorkflow contains %d job(s) (including final)", num_jobs)

    _, save_workflow = config.search("saveGenericWorkflow", opt={"default": False})
    if save_workflow:
        with open(os.path.join(submit_path, "bps_generic_workflow.pickle"), "wb") as outfh:
            generic_workflow.save(outfh, "pickle")
    _, save_dot = config.search("saveDot", opt={"default": False})
    if save_dot:
        with open(os.path.join(submit_path, "bps_generic_workflow.dot"), "w") as outfh:
            generic_workflow.draw(outfh, "dot")
    return generic_workflow_config, generic_workflow


def prepare_driver(config_file, **kwargs):
    """Create a WMS-specific representation of the generic workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    wms_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    kwargs.setdefault("runWmsSubmissionChecks", True)
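    # Run the WMS plugin's submission checks by default so that problems are
    # caught before the prepare work starts (callers may override via kwargs).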

    generic_workflow_config, generic_workflow = transform_driver(config_file, **kwargs)
    submit_path = generic_workflow_config[".bps_defined.submitPath"]

    _LOG.info("Starting prepare stage (creating specific implementation of workflow)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Prepare stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        wms_workflow = prepare(generic_workflow_config, generic_workflow, submit_path)
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )

    wms_workflow_config = generic_workflow_config
    return wms_workflow_config, wms_workflow


def submit_driver(config_file, **kwargs):
    """Submit workflow for execution.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    """
    kwargs.setdefault("runWmsSubmissionChecks", True)

    _LOG.info(
        "DISCLAIMER: All values regarding memory consumption reported below are approximate and may "
        "not accurately reflect actual memory usage by the bps process."
    )

    _LOG.info("Starting submission process")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Completed entire submission process",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        wms_workflow_config, wms_workflow = prepare_driver(config_file, **kwargs)

        _LOG.info("Starting submit stage")
        with time_this(
            log=_LOG,
            level=logging.INFO,
            prefix=None,
            msg="Completed submit stage",
            mem_usage=True,
            mem_unit=DEFAULT_MEM_UNIT,
            mem_fmt=DEFAULT_MEM_FMT,
        ):
            submit(wms_workflow_config, wms_workflow)
            _LOG.info("Run '%s' submitted for execution with id '%s'", wms_workflow.name, wms_workflow.run_id)
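            # wms_workflow.run_id is set by this point; it is echoed again for
            # the user at the end of this function.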

        if _LOG.isEnabledFor(logging.INFO):
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
            )

    print(f"Run Id: {wms_workflow.run_id}")
    print(f"Run Name: {wms_workflow.name}")


def restart_driver(wms_service, run_id):
    """Restart a failed workflow.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.
    run_id : `str`
        Id or path of the workflow that needs to be restarted.
    """
    if wms_service is None:
        default_config = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", default_config["wmsServiceClass"])
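        # Precedence for the WMS service class: the value passed in by the
        # caller, then the BPS_WMS_SERVICE_CLASS environment variable, then
        # the packaged default.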

    new_run_id, run_name, message = restart(wms_service, run_id)
    if new_run_id is not None:
        path = Path(run_id)
        if path.exists():
            _dump_env_info(f"{run_id}/{run_name}.env.info.yaml")
            _dump_pkg_info(f"{run_id}/{run_name}.pkg.info.yaml")
        print(f"Run Id: {new_run_id}")
        print(f"Run Name: {run_name}")
    else:
        if message:
            print(f"Restart failed: {message}")
        else:
            print("Restart failed: Unknown error")


def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
    """Print out summary of jobs submitted for execution.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.
    run_id : `str`
        A run id the report will be restricted to.
    user : `str`
        A user name the report will be restricted to.
    hist_days : `int`
        Number of days of run history to search.
    pass_thru : `str`
        A string to pass directly to the WMS service class.
    is_global : `bool`, optional
        If set, all available job queues will be queried for job information.
        Defaults to False, which means that only a local job queue will be
        queried for information.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        default_config = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", default_config["wmsServiceClass"])
    report(wms_service, run_id, user, hist_days, pass_thru, is_global=is_global)


def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):
    """Cancel submitted workflows.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.
    run_id : `str`
        ID or path of job that should be canceled.
    user : `str`
        User whose submitted jobs should be canceled.
    require_bps : `bool`
        Whether the given run_id/user must correspond to a bps-submitted job.
    pass_thru : `str`
        Information to pass through to the WMS.
    is_global : `bool`, optional
        If set, all available job queues will be checked for jobs to cancel.
        Defaults to False, which means that only a local job queue will be
        checked.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        default_config = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", default_config["wmsServiceClass"])
    cancel(wms_service, run_id, user, require_bps, pass_thru, is_global=is_global)


def ping_driver(wms_service=None, pass_thru=None):
    """Check that the WMS services are up and reachable, and that any
    needed authentication succeeds.

    The services checked are those needed for submit, report, cancel, and
    restart, but a successful ping does not guarantee that jobs will actually
    run successfully.

    Parameters
    ----------
    wms_service : `str`, optional
        Name of the Workload Management System service class.
    pass_thru : `str`, optional
        Information to pass through to the WMS.

    Returns
    -------
    success : `int`
        Whether services are up and usable (0) or not (non-zero).
    """
    if wms_service is None:
        default_config = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", default_config["wmsServiceClass"])
    status, message = ping(wms_service, pass_thru)
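    # ping() reports success with a status of 0; any non-zero status is a
    # failure.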

    if message:
        if not status:
            _LOG.info(message)
        else:
            _LOG.error(message)

    # Log overall status message
    if not status:
        _LOG.info("Ping successful.")
    else:
        _LOG.error("Ping failed (%d).", status)

    return status