Coverage for python/lsst/ctrl/bps/drivers.py: 12%

177 statements  

« prev     ^ index     » next       coverage.py v6.4.1, created at 2022-06-15 02:15 -0700

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Driver functions for each subcommand. 

23 

24Driver functions ensure that all setup work is done before running 

25the subcommand method. 

26""" 

27 

28 

29__all__ = [ 

30 "acquire_qgraph_driver", 

31 "cluster_qgraph_driver", 

32 "transform_driver", 

33 "prepare_driver", 

34 "submit_driver", 

35 "report_driver", 

36 "restart_driver", 

37 "cancel_driver", 

38] 

39 

40 

41import errno 

42import getpass 

43import logging 

44import os 

45import re 

46import shutil 

47from collections.abc import Iterable 

48from pathlib import Path 

49 

50from lsst.pipe.base import Instrument 

51from lsst.utils import doImport 

52from lsst.utils.timer import time_this 

53from lsst.utils.usage import get_peak_mem_usage 

54 

55from . import BPS_DEFAULTS, BPS_SEARCH_ORDER, DEFAULT_MEM_FMT, DEFAULT_MEM_UNIT, BpsConfig 

56from .bps_utils import _dump_env_info, _dump_pkg_info 

57from .cancel import cancel 

58from .pre_transform import acquire_quantum_graph, cluster_quanta 

59from .prepare import prepare 

60from .report import report 

61from .restart import restart 

62from .submit import submit 

63from .transform import transform 

64 

65_LOG = logging.getLogger(__name__) 

66 

67 

def _init_submission_driver(config_file, **kwargs):
    """Build the submission configuration and create the submit directory.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line overrides. Every truthy value is stored under the
        ``.bps_cmdline`` section of the configuration. The special key
        ``runWmsSubmissionChecks`` additionally controls whether the WMS
        plugin's submission checks are executed.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Batch Processing Service configuration.

    Raises
    ------
    KeyError
        Raised if ``outCollection`` is present in the configuration, if
        ``outputRun`` or ``submitPath`` is missing, or if submission checks
        were requested but ``wmsServiceClass`` cannot be found.
    OSError
        Raised if the submit directory cannot be created (this includes the
        case where it already exists).
    """
    config = BpsConfig(config_file, BPS_SEARCH_ORDER)

    # pipetask option names whose bps yaml counterpart is not just the
    # camel-cased version of the option name.
    cmdline_to_yaml = {
        "input": "inCollection",
        "output_run": "outputRun",
        "qgraph": "qgraphFile",
        "pipeline": "pipelineYaml",
        "wms_service": "wmsServiceClass",
    }
    for option, setting in kwargs.items():
        # None and empty values must never override the config.
        if not setting:
            continue
        # The pipetask argument parser may hand over a list where bps
        # expects a comma-separated string.
        if isinstance(setting, Iterable) and not isinstance(setting, str):
            setting = ",".join(setting)
        if option in cmdline_to_yaml:
            yaml_key = cmdline_to_yaml[option]
        else:
            yaml_key = re.sub(r"_(\S)", lambda found: found.group(1).upper(), option)
        config[f".bps_cmdline.{yaml_key}"] = setting

    # The environment variable BPS_WMS_SERVICE_CLASS is honored only when
    # the WMS service class was given neither on the command line nor
    # explicitly in the config file (i.e. it still equals the package
    # default).
    env_wms_class = os.environ.get("BPS_WMS_SERVICE_CLASS", None)
    if env_wms_class is not None and "wmsServiceClass" not in config[".bps_cmdline"]:
        package_defaults = BpsConfig(BPS_DEFAULTS)
        if config["wmsServiceClass"] == package_defaults["wmsServiceClass"]:
            config["wmsServiceClass"] = env_wms_class

    # Values bps defines on behalf of the user.
    config[".bps_defined.timestamp"] = Instrument.makeCollectionTimestamp()
    if "operator" not in config:
        config[".bps_defined.operator"] = getpass.getuser()

    if "outCollection" in config:
        raise KeyError("outCollection is deprecated. Replace all outCollection references with outputRun.")

    if "outputRun" not in config:
        raise KeyError("Must specify the output run collection using outputRun")

    if "uniqProcName" not in config:
        config[".bps_defined.uniqProcName"] = config["outputRun"].replace("/", "_")

    if "submitPath" not in config:
        raise KeyError("Must specify the submit-side run directory using submitPath")

    # When requested, let the WMS plugin verify early that it has what it
    # will need for prepare() or submit().
    if not kwargs.get("runWmsSubmissionChecks", False):
        _LOG.debug("Skipping submission checks.")
    else:
        found, wms_class = config.search("wmsServiceClass")
        if not found:
            raise KeyError("Missing wmsServiceClass in bps config. Aborting.")

        # Importing also verifies that the service class is available.
        service = doImport(wms_class)(config)

        try:
            service.run_submission_checks()
        except NotImplementedError:
            # Plugins implement the checks only when they need extra ones.
            _LOG.debug("run_submission_checks is not implemented in %s.", wms_class)

    # The submit directory holds all outputs and must not exist yet.
    submit_path = Path(config["submitPath"])
    try:
        submit_path.mkdir(parents=True, exist_ok=False)
    except OSError as exc:
        reason = "Directory already exists" if exc.errno == errno.EEXIST else exc.strerror
        raise type(exc)(f"cannot create submit directory '{submit_path}': {reason}") from None
    config[".bps_defined.submitPath"] = str(submit_path)
    print(f"Submit dir: {submit_path}")

    # Keep both the original and the expanded configuration.
    shutil.copy2(config_file, submit_path)
    with open(f"{submit_path}/{config['uniqProcName']}_config.yaml", "w") as expanded:
        config.dump(expanded)

    # Record the runtime environment and the software versions in use.
    _dump_env_info(f"{submit_path}/{config['uniqProcName']}.env.info.yaml")
    _dump_pkg_info(f"{submit_path}/{config['uniqProcName']}.pkg.info.yaml")

    return config

172 

173 

def acquire_qgraph_driver(config_file, **kwargs):
    """Set up the submission and obtain the quantum graph for the run.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Additional settings forwarded to the submission initialization.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        A graph representing quanta.
    """
    # Options common to both timed stages; only the message differs.
    timer_opts = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }

    _LOG.info("Initializing execution environment")
    with time_this(msg="Initializing execution environment completed", **timer_opts):
        config = _init_submission_driver(config_file, **kwargs)
        out_dir = config[".bps_defined.submitPath"]
    if _LOG.isEnabledFor(logging.INFO):
        peaks = [f"{usage.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for usage in get_peak_mem_usage()]
        _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    _LOG.info("Starting acquire stage (generating and/or reading quantum graph)")
    with time_this(msg="Acquire stage completed", **timer_opts):
        qgraph_file, qgraph, execution_butler_dir = acquire_quantum_graph(config, out_prefix=out_dir)
    if _LOG.isEnabledFor(logging.INFO):
        peaks = [f"{usage.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for usage in get_peak_mem_usage()]
        _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    # Record where the acquire stage left its products.
    config[".bps_defined.executionButlerDir"] = execution_butler_dir
    config[".bps_defined.runQgraphFile"] = qgraph_file
    return config, qgraph

227 

228 

def cluster_qgraph_driver(config_file, **kwargs):
    """Acquire the quantum graph and group its quanta into clusters.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Additional settings forwarded to the acquire stage.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    clustered_qgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        A graph representing clustered quanta.
    """
    config, qgraph = acquire_qgraph_driver(config_file, **kwargs)

    timer_opts = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "msg": "Cluster stage completed",
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }
    _LOG.info("Starting cluster stage (grouping quanta into jobs)")
    with time_this(**timer_opts):
        clustered_qgraph = cluster_quanta(config, qgraph, config["uniqProcName"])
    if _LOG.isEnabledFor(logging.INFO):
        peaks = [f"{usage.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for usage in get_peak_mem_usage()]
        _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)
    _LOG.info("ClusteredQuantumGraph contains %d cluster(s)", len(clustered_qgraph))

    # Optional artifacts, each written only if enabled in the config.
    out_dir = config[".bps_defined.submitPath"]
    _, want_pickle = config.search("saveClusteredQgraph", opt={"default": False})
    if want_pickle:
        pickle_name = os.path.join(out_dir, "bps_clustered_qgraph.pickle")
        clustered_qgraph.save(pickle_name)
    _, want_dot = config.search("saveDot", opt={"default": False})
    if want_dot:
        dot_name = os.path.join(out_dir, "bps_clustered_qgraph.dot")
        clustered_qgraph.draw(dot_name)
    return config, clustered_qgraph

272 

273 

def transform_driver(config_file, **kwargs):
    """Run the cluster stage and turn its result into a generic workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Additional settings forwarded to the cluster stage.

    Returns
    -------
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    generic_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    config, clustered_qgraph = cluster_qgraph_driver(config_file, **kwargs)
    out_dir = config[".bps_defined.submitPath"]

    timer_opts = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "msg": "Transform stage completed",
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }
    _LOG.info("Starting transform stage (creating generic workflow)")
    with time_this(**timer_opts):
        generic_workflow, generic_workflow_config = transform(config, clustered_qgraph, out_dir)
        _LOG.info("Generic workflow name '%s'", generic_workflow.name)
    if _LOG.isEnabledFor(logging.INFO):
        peaks = [f"{usage.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for usage in get_peak_mem_usage()]
        _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)
    job_total = sum(generic_workflow.job_counts.values())
    _LOG.info("GenericWorkflow contains %d job(s) (including final)", job_total)

    # Optional artifacts, each written only if enabled in the config.
    _, want_pickle = config.search("saveGenericWorkflow", opt={"default": False})
    if want_pickle:
        pickle_name = os.path.join(out_dir, "bps_generic_workflow.pickle")
        with open(pickle_name, "wb") as stream:
            generic_workflow.save(stream, "pickle")
    _, want_dot = config.search("saveDot", opt={"default": False})
    if want_dot:
        dot_name = os.path.join(out_dir, "bps_generic_workflow.dot")
        with open(dot_name, "w") as stream:
            generic_workflow.draw(stream, "dot")
    return generic_workflow_config, generic_workflow

322 

323 

def prepare_driver(config_file, **kwargs):
    """Run the transform stage and prepare the WMS-specific workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Additional settings forwarded to the transform stage. Unless the
        caller sets ``runWmsSubmissionChecks``, it is turned on here.

    Returns
    -------
    wms_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    if "runWmsSubmissionChecks" not in kwargs:
        kwargs["runWmsSubmissionChecks"] = True
    generic_workflow_config, generic_workflow = transform_driver(config_file, **kwargs)
    out_dir = generic_workflow_config[".bps_defined.submitPath"]

    timer_opts = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "msg": "Prepare stage completed",
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }
    _LOG.info("Starting prepare stage (creating specific implementation of workflow)")
    with time_this(**timer_opts):
        wms_workflow = prepare(generic_workflow_config, generic_workflow, out_dir)
    if _LOG.isEnabledFor(logging.INFO):
        peaks = [f"{usage.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for usage in get_peak_mem_usage()]
        _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    # The generic workflow configuration doubles as the WMS one.
    return generic_workflow_config, wms_workflow

363 

364 

def submit_driver(config_file, **kwargs):
    """Prepare a workflow and submit it for execution.

    The id and the name of the submitted run are printed on completion.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Additional settings forwarded to the prepare stage. Unless the
        caller sets ``runWmsSubmissionChecks``, it is turned on here.
    """
    if "runWmsSubmissionChecks" not in kwargs:
        kwargs["runWmsSubmissionChecks"] = True

    _LOG.info(
        "DISCLAIMER: All values regarding memory consumption reported below are approximate and may "
        "not accurately reflect actual memory usage by the bps process."
    )

    # Options common to both timers; only the message differs.
    timer_opts = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }

    _LOG.info("Starting submission process")
    # The outer timer spans the whole process, the inner one only the
    # submit stage.
    with time_this(msg="Completed entire submission process", **timer_opts):
        wms_workflow_config, wms_workflow = prepare_driver(config_file, **kwargs)

        _LOG.info("Starting submit stage")
        with time_this(msg="Completed submit stage", **timer_opts):
            submit(wms_workflow_config, wms_workflow)
            _LOG.info("Run '%s' submitted for execution with id '%s'", wms_workflow.name, wms_workflow.run_id)
    if _LOG.isEnabledFor(logging.INFO):
        peaks = [f"{usage.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for usage in get_peak_mem_usage()]
        _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    print(f"Run Id: {wms_workflow.run_id}")
    print(f"Run Name: {wms_workflow.name}")

412 

413 

def restart_driver(wms_service, run_id):
    """Restart a failed workflow and print the outcome.

    Parameters
    ----------
    wms_service : `str`
        Name of the class. If `None`, the value of the environment variable
        BPS_WMS_SERVICE_CLASS is used, falling back to the package default.
    run_id : `str`
        Id or path of the workflow that needs to be restarted.
    """
    service_class = wms_service
    if service_class is None:
        fallback = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        service_class = os.environ.get("BPS_WMS_SERVICE_CLASS", fallback)

    new_run_id, run_name, message = restart(service_class, run_id)
    if new_run_id is None:
        if message:
            print(f"Restart failed: {message}")
        else:
            print("Restart failed: Unknown error")
        return

    # Environment and package info can only be dumped when run_id is an
    # existing path.
    if Path(run_id).exists():
        _dump_env_info(f"{run_id}/{run_name}.env.info.yaml")
        _dump_pkg_info(f"{run_id}/{run_name}.pkg.info.yaml")
    print(f"Run Id: {new_run_id}")
    print(f"Run Name: {run_name}")

441 

442 

def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
    """Print out summary of jobs submitted for execution.

    Parameters
    ----------
    wms_service : `str`
        Name of the class. If `None`, the value of the environment variable
        BPS_WMS_SERVICE_CLASS is used, falling back to the package default.
    run_id : `str`
        A run id the report will be restricted to.
    user : `str`
        A user name the report will be restricted to.
    hist_days : `int`
        Number of days.
    pass_thru : `str`
        A string to pass directly to the WMS service class.
    is_global : `bool`, optional
        If set, all available job queues will be queried for job information.
        Defaults to False which means that only a local job queue will be
        queried for information.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    service_class = wms_service
    if service_class is None:
        fallback = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        service_class = os.environ.get("BPS_WMS_SERVICE_CLASS", fallback)
    report(service_class, run_id, user, hist_days, pass_thru, is_global=is_global)

470 

471 

def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):
    """Cancel submitted workflows.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class. If `None`,
        the value of the environment variable BPS_WMS_SERVICE_CLASS is used,
        falling back to the package default.
    run_id : `str`
        ID or path of job that should be canceled.
    user : `str`
        User whose submitted jobs should be canceled.
    require_bps : `bool`
        Whether to require given run_id/user to be a bps submitted job.
    pass_thru : `str`
        Information to pass through to WMS.
    is_global : `bool`, optional
        If set, all available job queues will be checked for jobs to cancel.
        Defaults to False which means that only a local job queue will be
        checked.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    service_class = wms_service
    if service_class is None:
        fallback = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        service_class = os.environ.get("BPS_WMS_SERVICE_CLASS", fallback)
    cancel(service_class, run_id, user, require_bps, pass_thru, is_global=is_global)