Coverage for python/lsst/ctrl/bps/drivers.py: 12%

191 statements  

coverage.py v6.5.0, created at 2022-10-26 09:15 +0000

# This file is part of ctrl_bps.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Driver functions for each subcommand.

Driver functions ensure that all setup work is done before running
the subcommand method.
"""


__all__ = [
    "acquire_qgraph_driver",
    "cluster_qgraph_driver",
    "transform_driver",
    "prepare_driver",
    "submit_driver",
    "report_driver",
    "restart_driver",
    "cancel_driver",
    "ping_driver",
]


import errno
import getpass
import logging
import os
import re
import shutil
from collections.abc import Iterable
from pathlib import Path

from lsst.pipe.base import Instrument
from lsst.utils import doImport
from lsst.utils.timer import time_this
from lsst.utils.usage import get_peak_mem_usage

from . import BPS_DEFAULTS, BPS_SEARCH_ORDER, DEFAULT_MEM_FMT, DEFAULT_MEM_UNIT, BpsConfig
from .bps_utils import _dump_env_info, _dump_pkg_info
from .cancel import cancel
from .ping import ping
from .pre_transform import acquire_quantum_graph, cluster_quanta
from .prepare import prepare
from .report import report
from .restart import restart
from .submit import submit
from .transform import transform

_LOG = logging.getLogger(__name__)


def _init_submission_driver(config_file, **kwargs):
    """Initialize runtime environment.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Batch Processing Service configuration.
    """
    config = BpsConfig(config_file, BPS_SEARCH_ORDER)

    # Override config with command-line values.
    # Handle differences between pipetask argument names and bps YAML keys.
    translation = {
        "input": "inCollection",
        "output_run": "outputRun",
        "qgraph": "qgraphFile",
        "pipeline": "pipelineYaml",
        "wms_service": "wmsServiceClass",
    }
    for key, value in kwargs.items():
        # Don't override config with None or empty string values.
        if value:
            # The pipetask argument parser converts some values to lists,
            # but bps wants strings.
            if not isinstance(value, str) and isinstance(value, Iterable):
                value = ",".join(value)
            new_key = translation.get(key, re.sub(r"_(\S)", lambda match: match.group(1).upper(), key))
            config[f".bps_cmdline.{new_key}"] = value
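
    # Illustrative example (hypothetical values) of the mapping above:
    # output_run="u/jdoe/test" is stored as config[".bps_cmdline.outputRun"]
    # via the translation table, while an unknown key such as
    # my_extra_option=("a", "b") is a non-string iterable, so it would first
    # be joined to "a,b" and then stored as
    # config[".bps_cmdline.myExtraOption"] via the snake_case-to-camelCase
    # regexp.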

    # If the WMS service class was not defined either on the command line or
    # explicitly in the config file, use the value of the environment
    # variable BPS_WMS_SERVICE_CLASS. If the variable is not set, stick with
    # the package default.
    wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", None)
    if wms_service is not None and "wmsServiceClass" not in config[".bps_cmdline"]:
        default_config = BpsConfig(BPS_DEFAULTS)
        if config["wmsServiceClass"] == default_config["wmsServiceClass"]:
            config["wmsServiceClass"] = wms_service

    # Set some initial values.
    config[".bps_defined.timestamp"] = Instrument.makeCollectionTimestamp()
    if "operator" not in config:
        config[".bps_defined.operator"] = getpass.getuser()

    if "outCollection" in config:
        raise KeyError("outCollection is deprecated. Replace all outCollection references with outputRun.")

    if "outputRun" not in config:
        raise KeyError("Must specify the output run collection using outputRun")

    if "uniqProcName" not in config:
        config[".bps_defined.uniqProcName"] = config["outputRun"].replace("/", "_")

    if "submitPath" not in config:
        raise KeyError("Must specify the submit-side run directory using submitPath")

    # If requested, run WMS plugin checks early in the submission process to
    # ensure the WMS has what it will need for prepare() or submit().
    if kwargs.get("runWmsSubmissionChecks", False):
        found, wms_class = config.search("wmsServiceClass")
        if not found:
            raise KeyError("Missing wmsServiceClass in bps config. Aborting.")

        # Check that the WMS service class can be imported.
        wms_service_class = doImport(wms_class)
        wms_service = wms_service_class(config)

        try:
            wms_service.run_submission_checks()
        except NotImplementedError:
            # Plugins implement this only when they need to do extra checks.
            _LOG.debug("run_submission_checks is not implemented in %s.", wms_class)
    else:
        _LOG.debug("Skipping submission checks.")
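
    # Illustrative example: with a config entry such as
    #
    #     wmsServiceClass: lsst.ctrl.bps.htcondor.HTCondorService
    #
    # the doImport() call above would load that plugin class and, when the
    # checks are requested, run them against it. The exact class path depends
    # on which WMS plugin package is installed, so treat the value shown here
    # as a placeholder.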

    # Make the submit directory to contain all outputs.
    submit_path = Path(config["submitPath"])
    try:
        submit_path.mkdir(parents=True, exist_ok=False)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            reason = "Directory already exists"
        else:
            reason = exc.strerror
        raise type(exc)(f"cannot create submit directory '{submit_path}': {reason}") from None
    config[".bps_defined.submitPath"] = str(submit_path)
    print(f"Submit dir: {submit_path}")

    # Save copies of the configs (original and expanded).
    shutil.copy2(config_file, submit_path)
    with open(f"{submit_path}/{config['uniqProcName']}_config.yaml", "w") as fh:
        config.dump(fh)

    # Dump information about the runtime environment and software versions in use.
    _dump_env_info(f"{submit_path}/{config['uniqProcName']}.env.info.yaml")
    _dump_pkg_info(f"{submit_path}/{config['uniqProcName']}.pkg.info.yaml")

    return config
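
# Illustrative sketch of the config values this function requires (key names
# come from the checks above; the sample values are hypothetical):
#
#     outputRun: u/jdoe/test_run          # required; also seeds uniqProcName
#     submitPath: submit/{uniqProcName}   # required; created on disk above
#     wmsServiceClass: ...                # optional; falls back to the
#                                         # BPS_WMS_SERVICE_CLASS env var or
#                                         # the package default
#
# _init_submission_driver() raises KeyError if outputRun or submitPath is
# missing and derives uniqProcName from outputRun when it is not given.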


def acquire_qgraph_driver(config_file, **kwargs):
    """Read a quantum graph from a file or create one from a pipeline
    definition.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        A graph representing quanta.
    """
    _LOG.info("Initializing execution environment")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Initializing execution environment completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        config = _init_submission_driver(config_file, **kwargs)
        submit_path = config[".bps_defined.submitPath"]
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )

    _LOG.info("Starting acquire stage (generating and/or reading quantum graph)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Acquire stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        qgraph_file, qgraph, execution_butler_dir = acquire_quantum_graph(config, out_prefix=submit_path)
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )

    config[".bps_defined.executionButlerDir"] = execution_butler_dir
    config[".bps_defined.runQgraphFile"] = qgraph_file
    return config, qgraph
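
# Illustrative usage sketch (file names are hypothetical): the acquire stage
# either reads an existing graph, e.g. with a config containing
#
#     qgraphFile: /path/to/existing.qgraph
#
# or generates one from a pipeline definition given via pipelineYaml. Either
# way, the resulting graph file and execution butler directory are recorded
# back into the config as .bps_defined.runQgraphFile and
# .bps_defined.executionButlerDir.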


def cluster_qgraph_driver(config_file, **kwargs):
    """Group quanta into clusters.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    clustered_qgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        A graph representing clustered quanta.
    """
    config, qgraph = acquire_qgraph_driver(config_file, **kwargs)

    _LOG.info("Starting cluster stage (grouping quanta into jobs)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Cluster stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        clustered_qgraph = cluster_quanta(config, qgraph, config["uniqProcName"])
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )
    _LOG.info("ClusteredQuantumGraph contains %d cluster(s)", len(clustered_qgraph))

    submit_path = config[".bps_defined.submitPath"]
    _, save_clustered_qgraph = config.search("saveClusteredQgraph", opt={"default": False})
    if save_clustered_qgraph:
        clustered_qgraph.save(os.path.join(submit_path, "bps_clustered_qgraph.pickle"))
    _, save_dot = config.search("saveDot", opt={"default": False})
    if save_dot:
        clustered_qgraph.draw(os.path.join(submit_path, "bps_clustered_qgraph.dot"))
    return config, clustered_qgraph
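
# Illustrative sketch (using the config keys read above): enabling the
# optional debugging outputs
#
#     saveClusteredQgraph: true
#     saveDot: true
#
# writes bps_clustered_qgraph.pickle and bps_clustered_qgraph.dot into the
# submit directory.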


def transform_driver(config_file, **kwargs):
    """Create a generic workflow from the clustered quantum graph.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Representation of the abstract/scientific workflow, independent of
        any particular workflow management system.
    """
    config, clustered_qgraph = cluster_qgraph_driver(config_file, **kwargs)
    submit_path = config[".bps_defined.submitPath"]

    _LOG.info("Starting transform stage (creating generic workflow)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Transform stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        generic_workflow, generic_workflow_config = transform(config, clustered_qgraph, submit_path)
        _LOG.info("Generic workflow name '%s'", generic_workflow.name)
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )
    num_jobs = sum(generic_workflow.job_counts.values())
    _LOG.info("GenericWorkflow contains %d job(s) (including final)", num_jobs)

    _, save_workflow = config.search("saveGenericWorkflow", opt={"default": False})
    if save_workflow:
        with open(os.path.join(submit_path, "bps_generic_workflow.pickle"), "wb") as outfh:
            generic_workflow.save(outfh, "pickle")
    _, save_dot = config.search("saveDot", opt={"default": False})
    if save_dot:
        with open(os.path.join(submit_path, "bps_generic_workflow.dot"), "w") as outfh:
            generic_workflow.draw(outfh, "dot")
    return generic_workflow_config, generic_workflow


def prepare_driver(config_file, **kwargs):
    """Create a workflow for a specific workflow management system.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    wms_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the workflow specific to a given workflow
        management system.
    """
    kwargs.setdefault("runWmsSubmissionChecks", True)
    generic_workflow_config, generic_workflow = transform_driver(config_file, **kwargs)
    submit_path = generic_workflow_config[".bps_defined.submitPath"]

    _LOG.info("Starting prepare stage (creating specific implementation of workflow)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Prepare stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        wms_workflow = prepare(generic_workflow_config, generic_workflow, submit_path)
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )

    wms_workflow_config = generic_workflow_config
    return wms_workflow_config, wms_workflow


def submit_driver(config_file, **kwargs):
    """Submit workflow for execution.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    """
    kwargs.setdefault("runWmsSubmissionChecks", True)

    _LOG.info(
        "DISCLAIMER: All values regarding memory consumption reported below are approximate and may "
        "not accurately reflect actual memory usage by the bps process."
    )

    _LOG.info("Starting submission process")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Completed entire submission process",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        wms_workflow_config, wms_workflow = prepare_driver(config_file, **kwargs)

        _LOG.info("Starting submit stage")
        with time_this(
            log=_LOG,
            level=logging.INFO,
            prefix=None,
            msg="Completed submit stage",
            mem_usage=True,
            mem_unit=DEFAULT_MEM_UNIT,
            mem_fmt=DEFAULT_MEM_FMT,
        ):
            submit(wms_workflow_config, wms_workflow)
            _LOG.info("Run '%s' submitted for execution with id '%s'", wms_workflow.name, wms_workflow.run_id)
        if _LOG.isEnabledFor(logging.INFO):
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
            )

    print(f"Run Id: {wms_workflow.run_id}")
    print(f"Run Name: {wms_workflow.name}")
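
# Sketch of how the driver functions chain together (each stage is timed and
# its peak memory reported by the time_this blocks above):
#
#     submit_driver
#       -> prepare_driver              (WMS-specific workflow)
#         -> transform_driver          (generic workflow)
#           -> cluster_qgraph_driver   (quanta grouped into clusters)
#             -> acquire_qgraph_driver (quantum graph read or generated)
#               -> _init_submission_driver (config, submit directory)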


def restart_driver(wms_service, run_id):
    """Restart a failed workflow.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.
    run_id : `str`
        ID or path of the workflow that needs to be restarted.
    """
    if wms_service is None:
        default_config = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", default_config["wmsServiceClass"])

    new_run_id, run_name, message = restart(wms_service, run_id)
    if new_run_id is not None:
        path = Path(run_id)
        if path.exists():
            _dump_env_info(f"{run_id}/{run_name}.env.info.yaml")
            _dump_pkg_info(f"{run_id}/{run_name}.pkg.info.yaml")
        print(f"Run Id: {new_run_id}")
        print(f"Run Name: {run_name}")
    else:
        if message:
            print(f"Restart failed: {message}")
        else:
            print("Restart failed: Unknown error")
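
# Resolution order for the WMS service class used by restart, report, cancel,
# and ping (taken from the fallbacks in these functions): an explicit
# wms_service argument wins, then the BPS_WMS_SERVICE_CLASS environment
# variable, then the package default from BPS_DEFAULTS. For example
# (illustrative shell setting, value is a placeholder):
#
#     export BPS_WMS_SERVICE_CLASS=<your.wms.plugin.ServiceClass>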


def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
    """Print out a summary of the jobs submitted for execution.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.
    run_id : `str`
        A run id the report will be restricted to.
    user : `str`
        A user name the report will be restricted to.
    hist_days : `int`
        Number of days of job history to search.
    pass_thru : `str`
        A string to pass directly to the WMS service class.
    is_global : `bool`, optional
        If set, all available job queues will be queried for job information.
        Defaults to False, which means that only a local job queue will be
        queried for information.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        default_config = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", default_config["wmsServiceClass"])
    report(wms_service, run_id, user, hist_days, pass_thru, is_global=is_global)


def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):
    """Cancel submitted workflows.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.
    run_id : `str`
        ID or path of job that should be canceled.
    user : `str`
        User whose submitted jobs should be canceled.
    require_bps : `bool`
        Whether to require given run_id/user to be a bps submitted job.
    pass_thru : `str`
        Information to pass through to WMS.
    is_global : `bool`, optional
        If set, all available job queues will be checked for jobs to cancel.
        Defaults to False, which means that only a local job queue will be
        checked.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        default_config = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", default_config["wmsServiceClass"])
    cancel(wms_service, run_id, user, require_bps, pass_thru, is_global=is_global)


def ping_driver(wms_service=None, pass_thru=None):
    """Check whether WMS services are up and reachable, and that any needed
    authentication succeeds.

    The services checked are those needed for submit, report, cancel, and
    restart, but ping cannot guarantee that jobs would actually run
    successfully.

    Parameters
    ----------
    wms_service : `str`, optional
        Name of the Workload Management System service class.
    pass_thru : `str`, optional
        Information to pass through to WMS.

    Returns
    -------
    success : `int`
        Whether services are up and usable (0) or not (non-zero).
    """
    if wms_service is None:
        default_config = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", default_config["wmsServiceClass"])
    status, message = ping(wms_service, pass_thru)

    if message:
        if not status:
            _LOG.info(message)
        else:
            _LOG.error(message)

    # Log overall status message.
    if not status:
        _LOG.info("Ping successful.")
    else:
        _LOG.error("Ping failed (%d).", status)

    return status
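
# Illustrative sketch (hypothetical wiring, not part of this module): because
# ping_driver() returns 0 on success and non-zero on failure, a command-line
# entry point could map the result directly to a process exit code:
#
#     import sys
#     sys.exit(ping_driver())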