Coverage for python/lsst/ctrl/bps/drivers.py: 12%

191 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-01-11 02:59 -0800

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Driver functions for each subcommand. 

23 

24Driver functions ensure that all setup work is done before running 

25the subcommand method. 

26""" 

27 

28 

# Public API exported via ``from lsst.ctrl.bps.drivers import *``:
# one driver function per bps subcommand.
__all__ = [
    "acquire_qgraph_driver",
    "cluster_qgraph_driver",
    "transform_driver",
    "prepare_driver",
    "submit_driver",
    "report_driver",
    "restart_driver",
    "cancel_driver",
    "ping_driver",
]

40 

41 

42import errno 

43import getpass 

44import logging 

45import os 

46import re 

47import shutil 

48from collections.abc import Iterable 

49from pathlib import Path 

50 

51from lsst.pipe.base import Instrument 

52from lsst.utils import doImport 

53from lsst.utils.timer import time_this 

54from lsst.utils.usage import get_peak_mem_usage 

55 

56from . import BPS_DEFAULTS, BPS_SEARCH_ORDER, DEFAULT_MEM_FMT, DEFAULT_MEM_UNIT, BpsConfig 

57from .bps_utils import _dump_env_info, _dump_pkg_info 

58from .cancel import cancel 

59from .ping import ping 

60from .pre_transform import acquire_quantum_graph, cluster_quanta 

61from .prepare import prepare 

62from .report import report 

63from .restart import restart 

64from .submit import submit 

65from .transform import transform 

66 

67_LOG = logging.getLogger(__name__) 

68 

69 

def _init_submission_driver(config_file, **kwargs):
    """Initialize runtime environment.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line overrides for configuration values (e.g. ``input``,
        ``output_run``) and driver options such as
        ``runWmsSubmissionChecks``.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Batch Processing Service configuration.

    Raises
    ------
    KeyError
        Raised if a required configuration value is missing, a deprecated
        one is present, or the WMS service class cannot be determined when
        submission checks were requested.
    OSError
        Raised if the submit directory cannot be created.
    """
    config = BpsConfig(config_file, BPS_SEARCH_ORDER)

    # Override config with command-line values.
    # Handle diffs between pipetask argument names vs bps yaml.
    translation = {
        "input": "inCollection",
        "output_run": "outputRun",
        "qgraph": "qgraphFile",
        "pipeline": "pipelineYaml",
        "wms_service": "wmsServiceClass",
        "compute_site": "computeSite",
    }
    for key, value in kwargs.items():
        # Don't want to override config with None or empty string values.
        if value:
            # pipetask argument parser converts some values to list,
            # but bps will want string.
            if not isinstance(value, str) and isinstance(value, Iterable):
                value = ",".join(value)
            # Keys without an explicit translation are converted from
            # snake_case to camelCase.
            new_key = translation.get(key, re.sub(r"_(\S)", lambda match: match.group(1).upper(), key))
            config[f".bps_cmdline.{new_key}"] = value

    # If the WMS service class was defined neither at the command line nor
    # explicitly in the config file, use the value provided by the
    # environment variable BPS_WMS_SERVICE_CLASS.  If the variable is not
    # set, stick to the package default.
    # NOTE: a dedicated name is used here so the class *name* read from the
    # environment is not confused with the service *instance* created later
    # for the submission checks.
    env_wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", None)
    if env_wms_service is not None and "wmsServiceClass" not in config[".bps_cmdline"]:
        default_config = BpsConfig(BPS_DEFAULTS)
        if config["wmsServiceClass"] == default_config["wmsServiceClass"]:
            config["wmsServiceClass"] = env_wms_service

    # Set some initial values.
    config[".bps_defined.timestamp"] = Instrument.makeCollectionTimestamp()
    if "operator" not in config:
        config[".bps_defined.operator"] = getpass.getuser()

    if "outCollection" in config:
        raise KeyError("outCollection is deprecated. Replace all outCollection references with outputRun.")

    if "outputRun" not in config:
        raise KeyError("Must specify the output run collection using outputRun")

    if "uniqProcName" not in config:
        config[".bps_defined.uniqProcName"] = config["outputRun"].replace("/", "_")

    if "submitPath" not in config:
        raise KeyError("Must specify the submit-side run directory using submitPath")

    # If requested, run WMS plugin checks early in submission process to
    # ensure WMS has what it will need for prepare() or submit().
    if kwargs.get("runWmsSubmissionChecks", False):
        found, wms_class = config.search("wmsServiceClass")
        if not found:
            raise KeyError("Missing wmsServiceClass in bps config. Aborting.")

        # Check that can import wms service class.
        wms_service_class = doImport(wms_class)
        wms_service = wms_service_class(config)

        try:
            wms_service.run_submission_checks()
        except NotImplementedError:
            # Allow various plugins to implement only when needed to do
            # extra checks.
            _LOG.debug("run_submission_checks is not implemented in %s.", wms_class)
    else:
        _LOG.debug("Skipping submission checks.")

    # Make submit directory to contain all outputs.
    submit_path = Path(config["submitPath"])
    try:
        submit_path.mkdir(parents=True, exist_ok=False)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            reason = "Directory already exists"
        else:
            reason = exc.strerror
        # Re-raise same exception type with a more helpful message.
        raise type(exc)(f"cannot create submit directory '{submit_path}': {reason}") from None
    config[".bps_defined.submitPath"] = str(submit_path)
    print(f"Submit dir: {submit_path}")

    # Save copy of configs (orig and expanded config).
    shutil.copy2(config_file, submit_path)
    with open(f"{submit_path}/{config['uniqProcName']}_config.yaml", "w") as fh:
        config.dump(fh)

    # Dump information about runtime environment and software versions in use.
    _dump_env_info(f"{submit_path}/{config['uniqProcName']}.env.info.yaml")
    _dump_pkg_info(f"{submit_path}/{config['uniqProcName']}.pkg.info.yaml")

    return config

175 

176 

def acquire_qgraph_driver(config_file, **kwargs):
    """Read a quantum graph from a file or create one from pipeline definition.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        A graph representing quanta.
    """
    # Keyword arguments shared by every stage timer below.
    timer_args = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }

    _LOG.info("Initializing execution environment")
    with time_this(msg="Initializing execution environment completed", **timer_args):
        config = _init_submission_driver(config_file, **kwargs)
        submit_path = config[".bps_defined.submitPath"]
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )

    _LOG.info("Starting acquire stage (generating and/or reading quantum graph)")
    with time_this(msg="Acquire stage completed", **timer_args):
        qgraph_file, qgraph, execution_butler_dir = acquire_quantum_graph(config, out_prefix=submit_path)
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )

    # Record acquire-stage products for the downstream stages.
    config[".bps_defined.executionButlerDir"] = execution_butler_dir
    config[".bps_defined.runQgraphFile"] = qgraph_file
    return config, qgraph

230 

231 

def cluster_qgraph_driver(config_file, **kwargs):
    """Group quanta into clusters.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    clustered_qgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        A graph representing clustered quanta.
    """
    config, qgraph = acquire_qgraph_driver(config_file, **kwargs)

    _LOG.info("Starting cluster stage (grouping quanta into jobs)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Cluster stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        clustered_qgraph = cluster_quanta(config, qgraph, config["uniqProcName"])
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )
    _LOG.info("ClusteredQuantumGraph contains %d cluster(s)", len(clustered_qgraph))

    # Optionally persist the clustered graph and/or its dot rendering
    # into the submit directory for later inspection.
    submit_path = config[".bps_defined.submitPath"]
    _, save_clustered_qgraph = config.search("saveClusteredQgraph", opt={"default": False})
    if save_clustered_qgraph:
        clustered_qgraph.save(os.path.join(submit_path, "bps_clustered_qgraph.pickle"))
    _, save_dot = config.search("saveDot", opt={"default": False})
    if save_dot:
        clustered_qgraph.draw(os.path.join(submit_path, "bps_clustered_qgraph.dot"))
    return config, clustered_qgraph

275 

276 

def transform_driver(config_file, **kwargs):
    """Create a workflow for a specific workflow management system.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    generic_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    bps_config, clustered = cluster_qgraph_driver(config_file, **kwargs)
    out_prefix = bps_config[".bps_defined.submitPath"]

    _LOG.info("Starting transform stage (creating generic workflow)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Transform stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        generic_workflow, generic_workflow_config = transform(bps_config, clustered, out_prefix)
        _LOG.info("Generic workflow name '%s'", generic_workflow.name)
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )
    num_jobs = sum(generic_workflow.job_counts.values())
    _LOG.info("GenericWorkflow contains %d job(s) (including final)", num_jobs)

    # Optionally persist the generic workflow and/or its dot rendering.
    _, save_workflow = bps_config.search("saveGenericWorkflow", opt={"default": False})
    if save_workflow:
        with open(os.path.join(out_prefix, "bps_generic_workflow.pickle"), "wb") as outfh:
            generic_workflow.save(outfh, "pickle")
    _, save_dot = bps_config.search("saveDot", opt={"default": False})
    if save_dot:
        with open(os.path.join(out_prefix, "bps_generic_workflow.dot"), "w") as outfh:
            generic_workflow.draw(outfh, "dot")
    return generic_workflow_config, generic_workflow

325 

326 

def prepare_driver(config_file, **kwargs):
    """Create a representation of the generic workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    wms_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    # Unless the caller said otherwise, run the WMS plugin submission checks.
    kwargs.setdefault("runWmsSubmissionChecks", True)
    generic_workflow_config, generic_workflow = transform_driver(config_file, **kwargs)
    out_prefix = generic_workflow_config[".bps_defined.submitPath"]

    _LOG.info("Starting prepare stage (creating specific implementation of workflow)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Prepare stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        wms_workflow = prepare(generic_workflow_config, generic_workflow, out_prefix)
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )

    # The WMS-specific workflow shares the generic workflow's configuration.
    wms_workflow_config = generic_workflow_config
    return wms_workflow_config, wms_workflow

366 

367 

def submit_driver(config_file, **kwargs):
    """Submit workflow for execution.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    """
    kwargs.setdefault("runWmsSubmissionChecks", True)

    _LOG.info(
        "DISCLAIMER: All values regarding memory consumption reported below are approximate and may "
        "not accurately reflect actual memory usage by the bps process."
    )

    _LOG.info("Starting submission process")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Completed entire submission process",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        wms_workflow_config, wms_workflow = prepare_driver(config_file, **kwargs)

        _LOG.info("Starting submit stage")
        with time_this(
            log=_LOG,
            level=logging.INFO,
            prefix=None,
            msg="Completed submit stage",
            mem_usage=True,
            mem_unit=DEFAULT_MEM_UNIT,
            mem_fmt=DEFAULT_MEM_FMT,
        ):
            submit(wms_workflow_config, wms_workflow)
            _LOG.info("Run '%s' submitted for execution with id '%s'", wms_workflow.name, wms_workflow.run_id)
        if _LOG.isEnabledFor(logging.INFO):
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
            )

    # Echo the identifiers the user needs for report/cancel/restart.
    print(f"Run Id: {wms_workflow.run_id}")
    print(f"Run Name: {wms_workflow.name}")

415 

416 

def restart_driver(wms_service, run_id):
    """Restart a failed workflow.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.
    run_id : `str`
        Id or path of workflow that need to be restarted.
    """
    if wms_service is None:
        # Fall back to the environment variable, then the package default.
        default_config = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", default_config["wmsServiceClass"])

    new_run_id, run_name, message = restart(wms_service, run_id)
    if new_run_id is None:
        print(f"Restart failed: {message or 'Unknown error'}")
        return

    # Refresh the environment/package snapshots when run_id is a local path.
    if Path(run_id).exists():
        _dump_env_info(f"{run_id}/{run_name}.env.info.yaml")
        _dump_pkg_info(f"{run_id}/{run_name}.pkg.info.yaml")
    print(f"Run Id: {new_run_id}")
    print(f"Run Name: {run_name}")

444 

445 

def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
    """Print out summary of jobs submitted for execution.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.
    run_id : `str`
        A run id the report will be restricted to.
    user : `str`
        A user name the report will be restricted to.
    hist_days : `int`
        Number of days of run history to include in the report.
    pass_thru : `str`
        A string to pass directly to the WMS service class.
    is_global : `bool`, optional
        If set, all available job queues will be queried for job information.
        Defaults to False which means that only a local job queue will be
        queried for information.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        # Fall back to the environment variable, then the package default.
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", BpsConfig(BPS_DEFAULTS)["wmsServiceClass"])
    report(wms_service, run_id, user, hist_days, pass_thru, is_global=is_global)

473 

474 

def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):
    """Cancel submitted workflows.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.
    run_id : `str`
        ID or path of job that should be canceled.
    user : `str`
        User whose submitted jobs should be canceled.
    require_bps : `bool`
        Whether to require given run_id/user to be a bps submitted job.
    pass_thru : `str`
        Information to pass through to WMS.
    is_global : `bool`, optional
        If set, all available job queues will be checked for jobs to cancel.
        Defaults to False which means that only a local job queue will be
        checked.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        # Fall back to the environment variable, then the package default.
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", BpsConfig(BPS_DEFAULTS)["wmsServiceClass"])
    cancel(wms_service, run_id, user, require_bps, pass_thru, is_global=is_global)

502 

503 

def ping_driver(wms_service=None, pass_thru=None):
    """Checks whether WMS services are up, reachable, and any authentication,
    if needed, succeeds.

    The services to be checked are those needed for submit, report, cancel,
    restart, but ping cannot guarantee whether jobs would actually run
    successfully.

    Parameters
    ----------
    wms_service : `str`, optional
        Name of the Workload Management System service class.
    pass_thru : `str`, optional
        Information to pass through to WMS.

    Returns
    -------
    success : `int`
        Whether services are up and usable (0) or not (non-zero).
    """
    if wms_service is None:
        # Fall back to the environment variable, then the package default.
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", BpsConfig(BPS_DEFAULTS)["wmsServiceClass"])
    status, message = ping(wms_service, pass_thru)

    # Relay any plugin-provided message at a level matching the outcome
    # (status of 0 means success).
    if message:
        if status:
            _LOG.error(message)
        else:
            _LOG.info(message)

    # Log overall status message.
    if status:
        _LOG.error("Ping failed (%d).", status)
    else:
        _LOG.info("Ping successful.")

    return status