Coverage for python/lsst/ctrl/bps/drivers.py: 12%

192 statements  

« prev     ^ index     » next       coverage.py v7.3.0, created at 2023-08-23 10:45 +0000

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Driver functions for each subcommand. 

23 

24Driver functions ensure that all setup work is done before running 

25the subcommand method. 

26""" 

27 

28 

29__all__ = [ 

30 "acquire_qgraph_driver", 

31 "cluster_qgraph_driver", 

32 "transform_driver", 

33 "prepare_driver", 

34 "submit_driver", 

35 "report_driver", 

36 "restart_driver", 

37 "cancel_driver", 

38 "ping_driver", 

39] 

40 

41 

42import errno 

43import getpass 

44import logging 

45import os 

46import re 

47import shutil 

48from collections.abc import Iterable 

49from pathlib import Path 

50 

51from lsst.pipe.base import Instrument 

52from lsst.utils import doImport 

53from lsst.utils.timer import time_this 

54from lsst.utils.usage import get_peak_mem_usage 

55 

56from . import BPS_DEFAULTS, BPS_SEARCH_ORDER, DEFAULT_MEM_FMT, DEFAULT_MEM_UNIT, BpsConfig 

57from .bps_utils import _dump_env_info, _dump_pkg_info 

58from .cancel import cancel 

59from .ping import ping 

60from .pre_transform import acquire_quantum_graph, cluster_quanta 

61from .prepare import prepare 

62from .report import report 

63from .restart import restart 

64from .submit import submit 

65from .transform import transform 

66 

67_LOG = logging.getLogger(__name__) 

68 

69 

def _init_submission_driver(config_file, **kwargs):
    """Initialize runtime environment.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line overrides; only truthy values are applied, after
        translating pipetask-style argument names to bps config keys.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Batch Processing Service configuration.

    Raises
    ------
    KeyError
        Raised if a required setting (``outputRun``, ``submitPath``) is
        missing, the deprecated ``outCollection`` is present, or submission
        checks were requested without ``wmsServiceClass`` being set.
    OSError
        Raised if the submit directory cannot be created (including when it
        already exists).
    """
    config = BpsConfig(config_file, BPS_SEARCH_ORDER)

    # Override config with command-line values.
    # Handle diffs between pipetask argument names vs bps yaml
    translation = {
        "input": "inCollection",
        "output_run": "outputRun",
        "qgraph": "qgraphFile",
        "pipeline": "pipelineYaml",
        "wms_service": "wmsServiceClass",
        "compute_site": "computeSite",
    }
    for key, value in kwargs.items():
        # Don't want to override config with None or empty string values.
        if value:
            # pipetask argument parser converts some values to list,
            # but bps will want string.
            if not isinstance(value, str) and isinstance(value, Iterable):
                value = ",".join(value)
            # Keys without an explicit translation fall back to a generic
            # snake_case -> camelCase conversion.
            new_key = translation.get(key, re.sub(r"_(\S)", lambda match: match.group(1).upper(), key))
            config[f".bps_cmdline.{new_key}"] = value

    # If the WMS service class was defined neither at the command line nor
    # explicitly in the config file, use the value provided by the
    # environment variable BPS_WMS_SERVICE_CLASS. If the variable is not
    # set, stick to the package default.
    wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", None)
    if wms_service is not None and "wmsServiceClass" not in config[".bps_cmdline"]:
        default_config = BpsConfig(BPS_DEFAULTS)
        # Only apply the env var when the config still has the package
        # default (i.e., nothing more specific was configured).
        if config["wmsServiceClass"] == default_config["wmsServiceClass"]:
            config["wmsServiceClass"] = wms_service

    # Set some initial values
    config[".bps_defined.timestamp"] = Instrument.makeCollectionTimestamp()
    if "operator" not in config:
        # Default the operator to the user running the submission.
        config[".bps_defined.operator"] = getpass.getuser()

    if "outCollection" in config:
        raise KeyError("outCollection is deprecated. Replace all outCollection references with outputRun.")

    if "outputRun" not in config:
        raise KeyError("Must specify the output run collection using outputRun")

    if "uniqProcName" not in config:
        # Derive a filesystem-safe unique process name from the output run.
        config[".bps_defined.uniqProcName"] = config["outputRun"].replace("/", "_")

    if "submitPath" not in config:
        raise KeyError("Must specify the submit-side run directory using submitPath")

    # If requested, run WMS plugin checks early in submission process to
    # ensure WMS has what it will need for prepare() or submit().
    if kwargs.get("runWmsSubmissionChecks", False):
        found, wms_class = config.search("wmsServiceClass")
        if not found:
            raise KeyError("Missing wmsServiceClass in bps config. Aborting.")

        # Check that can import wms service class.
        wms_service_class = doImport(wms_class)
        wms_service = wms_service_class(config)

        try:
            wms_service.run_submission_checks()
        except NotImplementedError:
            # Allow various plugins to implement only when needed to do extra
            # checks.
            _LOG.debug("run_submission_checks is not implemented in %s.", wms_class)
    else:
        _LOG.debug("Skipping submission checks.")

    # Make submit directory to contain all outputs.
    submit_path = Path(config["submitPath"])
    try:
        # exist_ok=False: a pre-existing directory is an error, to avoid
        # clobbering outputs of a previous submission.
        submit_path.mkdir(parents=True, exist_ok=False)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            reason = "Directory already exists"
        else:
            reason = exc.strerror
        # Re-raise the same exception type with a clearer message;
        # 'from None' suppresses the original traceback context.
        raise type(exc)(f"cannot create submit directory '{submit_path}': {reason}") from None
    config[".bps_defined.submitPath"] = str(submit_path)
    print(f"Submit dir: {submit_path}")

    # save copy of configs (orig and expanded config)
    shutil.copy2(config_file, submit_path)
    with open(f"{submit_path}/{config['uniqProcName']}_config.yaml", "w") as fh:
        config.dump(fh)

    # Dump information about runtime environment and software versions in use.
    _dump_env_info(f"{submit_path}/{config['uniqProcName']}.env.info.yaml")
    _dump_pkg_info(f"{submit_path}/{config['uniqProcName']}.pkg.info.yaml")

    return config

175 

176 

def acquire_qgraph_driver(config_file, **kwargs):
    """Read a quantum graph from a file or create one from pipeline definition.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        A graph representing quanta.
    """
    # Options shared by the stage timers below.
    timer_args = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }

    def report_peak_memory():
        # Log peak memory of the bps process and of its largest child.
        if _LOG.isEnabledFor(logging.INFO):
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *(f"{usage.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for usage in get_peak_mem_usage()),
            )

    _LOG.info("Initializing execution environment")
    with time_this(msg="Initializing execution environment completed", **timer_args):
        config = _init_submission_driver(config_file, **kwargs)
        submit_path = config[".bps_defined.submitPath"]
    report_peak_memory()

    _LOG.info("Starting acquire stage (generating and/or reading quantum graph)")
    with time_this(msg="Acquire stage completed", **timer_args):
        qgraph_file, qgraph, execution_butler_dir = acquire_quantum_graph(config, out_prefix=submit_path)
    report_peak_memory()

    # When using QBB (and neither 'executionButlerTemplate' nor
    # 'executionButlerDir' is set) acquire_quantum_graph() will set
    # 'execution_butler_dir' to the submit directory. Recording that value
    # would trick 'ctrl_bps_parsl' into using a non-existent execution butler
    # and the run would fail. See ParslJob.get_command_line() for details.
    #
    # Skipping it here keeps 'ctrl_bps_parsl' working for the time being
    # without making more complex changes in the logic which will be removed
    # soon anyway (see DM-40342).
    if os.path.normpath(execution_butler_dir) != os.path.normpath(submit_path):
        config[".bps_defined.executionButlerDir"] = execution_butler_dir
    config[".bps_defined.runQgraphFile"] = qgraph_file
    return config, qgraph

240 

241 

def cluster_qgraph_driver(config_file, **kwargs):
    """Group quanta into clusters.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    clustered_qgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        A graph representing clustered quanta.
    """
    config, qgraph = acquire_qgraph_driver(config_file, **kwargs)

    _LOG.info("Starting cluster stage (grouping quanta into jobs)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Cluster stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        clustered_qgraph = cluster_quanta(config, qgraph, config["uniqProcName"])
    if _LOG.isEnabledFor(logging.INFO):
        peaks = (f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage())
        _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)
    _LOG.info("ClusteredQuantumGraph contains %d cluster(s)", len(clustered_qgraph))

    # Optionally persist the clustered graph for debugging/inspection.
    submit_path = config[".bps_defined.submitPath"]
    no_default = {"default": False}
    if config.search("saveClusteredQgraph", opt=no_default)[1]:
        clustered_qgraph.save(os.path.join(submit_path, "bps_clustered_qgraph.pickle"))
    if config.search("saveDot", opt=no_default)[1]:
        clustered_qgraph.draw(os.path.join(submit_path, "bps_clustered_qgraph.dot"))
    return config, clustered_qgraph

285 

286 

def transform_driver(config_file, **kwargs):
    """Create a workflow for a specific workflow management system.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        The WMS-agnostic (generic) representation of the workflow.
    """
    config, clustered_qgraph = cluster_qgraph_driver(config_file, **kwargs)
    submit_path = config[".bps_defined.submitPath"]

    _LOG.info("Starting transform stage (creating generic workflow)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Transform stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        generic_workflow, generic_workflow_config = transform(config, clustered_qgraph, submit_path)
        _LOG.info("Generic workflow name '%s'", generic_workflow.name)
    if _LOG.isEnabledFor(logging.INFO):
        peaks = (f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage())
        _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)
    num_jobs = sum(generic_workflow.job_counts.values())
    _LOG.info("GenericWorkflow contains %d job(s) (including final)", num_jobs)

    # Optionally persist the generic workflow for debugging/inspection.
    if config.search("saveGenericWorkflow", opt={"default": False})[1]:
        with open(os.path.join(submit_path, "bps_generic_workflow.pickle"), "wb") as stream:
            generic_workflow.save(stream, "pickle")
    if config.search("saveDot", opt={"default": False})[1]:
        with open(os.path.join(submit_path, "bps_generic_workflow.dot"), "w") as stream:
            generic_workflow.draw(stream, "dot")
    return generic_workflow_config, generic_workflow

335 

336 

def prepare_driver(config_file, **kwargs):
    """Create a representation of the generic workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    wms_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a
        given workflow management system.
    """
    # Early WMS submission checks default to on for prepare/submit.
    kwargs.setdefault("runWmsSubmissionChecks", True)
    generic_workflow_config, generic_workflow = transform_driver(config_file, **kwargs)
    submit_path = generic_workflow_config[".bps_defined.submitPath"]

    _LOG.info("Starting prepare stage (creating specific implementation of workflow)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Prepare stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        wms_workflow = prepare(generic_workflow_config, generic_workflow, submit_path)
    if _LOG.isEnabledFor(logging.INFO):
        peaks = (f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage())
        _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    # The WMS-specific workflow shares the generic workflow's configuration.
    return generic_workflow_config, wms_workflow

376 

377 

def submit_driver(config_file, **kwargs):
    """Submit workflow for execution.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    """
    # Early WMS submission checks default to on when submitting.
    kwargs.setdefault("runWmsSubmissionChecks", True)

    _LOG.info(
        "DISCLAIMER: All values regarding memory consumption reported below are approximate and may "
        "not accurately reflect actual memory usage by the bps process."
    )

    # Options shared by the two timers below.
    timer_args = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }

    _LOG.info("Starting submission process")
    with time_this(msg="Completed entire submission process", **timer_args):
        # Run all the earlier stages (acquire, cluster, transform, prepare).
        wms_workflow_config, wms_workflow = prepare_driver(config_file, **kwargs)

        _LOG.info("Starting submit stage")
        with time_this(msg="Completed submit stage", **timer_args):
            submit(wms_workflow_config, wms_workflow)
            _LOG.info("Run '%s' submitted for execution with id '%s'", wms_workflow.name, wms_workflow.run_id)
        if _LOG.isEnabledFor(logging.INFO):
            peaks = (f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage())
            _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    print(f"Run Id: {wms_workflow.run_id}")
    print(f"Run Name: {wms_workflow.name}")

425 

426 

def restart_driver(wms_service, run_id):
    """Restart a failed workflow.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.
    run_id : `str`
        Id or path of workflow that needs to be restarted.
    """
    if wms_service is None:
        # Fall back to the environment, then to the packaged default.
        defaults = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", defaults["wmsServiceClass"])

    new_run_id, run_name, message = restart(wms_service, run_id)
    if new_run_id is None:
        # Restart did not happen; report why if the WMS said anything.
        if message:
            print(f"Restart failed: {message}")
        else:
            print("Restart failed: Unknown error")
        return

    # If run_id was a submit directory, refresh the environment/package
    # info files there for the restarted run.
    if Path(run_id).exists():
        _dump_env_info(f"{run_id}/{run_name}.env.info.yaml")
        _dump_pkg_info(f"{run_id}/{run_name}.pkg.info.yaml")
    print(f"Run Id: {new_run_id}")
    print(f"Run Name: {run_name}")

454 

455 

def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
    """Print out summary of jobs submitted for execution.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.
    run_id : `str`
        A run id the report will be restricted to.
    user : `str`
        A user name the report will be restricted to.
    hist_days : `int`
        Number of days of history to consider (passed through to the WMS
        service class).
    pass_thru : `str`
        A string to pass directly to the WMS service class.
    is_global : `bool`, optional
        If set, all available job queues will be queried for job
        information.  Defaults to False which means that only a local job
        queue will be queried for information.

        Only applicable in the context of a WMS using distributed job
        queues (e.g., HTCondor).
    """
    if wms_service is None:
        # Fall back to the environment, then to the packaged default.
        defaults = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", defaults["wmsServiceClass"])
    report(wms_service, run_id, user, hist_days, pass_thru, is_global=is_global)

483 

484 

def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):
    """Cancel submitted workflows.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.
    run_id : `str`
        ID or path of job that should be canceled.
    user : `str`
        User whose submitted jobs should be canceled.
    require_bps : `bool`
        Whether to require given run_id/user to be a bps submitted job.
    pass_thru : `str`
        Information to pass through to WMS.
    is_global : `bool`, optional
        If set, all available job queues will be checked for jobs to
        cancel.  Defaults to False which means that only a local job queue
        will be checked.

        Only applicable in the context of a WMS using distributed job
        queues (e.g., HTCondor).
    """
    if wms_service is None:
        # Fall back to the environment, then to the packaged default.
        defaults = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", defaults["wmsServiceClass"])
    cancel(wms_service, run_id, user, require_bps, pass_thru, is_global=is_global)

512 

513 

def ping_driver(wms_service=None, pass_thru=None):
    """Check whether WMS services are up, reachable, and any authentication,
    if needed, succeeds.

    The services to be checked are those needed for submit, report, cancel,
    restart, but ping cannot guarantee whether jobs would actually run
    successfully.

    Parameters
    ----------
    wms_service : `str`, optional
        Name of the Workload Management System service class.
    pass_thru : `str`, optional
        Information to pass through to WMS.

    Returns
    -------
    success : `int`
        Whether services are up and usable (0) or not (non-zero).
    """
    if wms_service is None:
        # Fall back to the environment, then to the packaged default.
        defaults = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", defaults["wmsServiceClass"])
    status, message = ping(wms_service, pass_thru)

    # A status of 0 means success; anything non-zero is a failure.
    succeeded = not status
    if message:
        # Route any plugin-provided detail to the matching log level.
        (_LOG.info if succeeded else _LOG.error)(message)

    # Log overall status message
    if succeeded:
        _LOG.info("Ping successful.")
    else:
        _LOG.error("Ping failed (%d).", status)

    return status