Coverage for python/lsst/ctrl/bps/drivers.py: 13%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

174 statements  

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Driver functions for each subcommand. 

23 

24Driver functions ensure that all setup work is done before running 

25the subcommand method. 

26""" 

27 

28 

# Public API of this module: one driver per bps subcommand, exported for
# ``from lsst.ctrl.bps.drivers import *``.
__all__ = [
    "acquire_qgraph_driver",
    "cluster_qgraph_driver",
    "transform_driver",
    "prepare_driver",
    "submit_driver",
    "report_driver",
    "restart_driver",
    "cancel_driver",
]

39 

40 

41import errno 

42import getpass 

43import logging 

44import os 

45import re 

46import shutil 

47from collections.abc import Iterable 

48from pathlib import Path 

49 

50from lsst.obs.base import Instrument 

51from lsst.utils import doImport 

52from lsst.utils.timer import time_this 

53from lsst.utils.usage import get_peak_mem_usage 

54 

55from . import BPS_DEFAULTS, BPS_SEARCH_ORDER, BpsConfig, DEFAULT_MEM_UNIT, DEFAULT_MEM_FMT 

56from .pre_transform import acquire_quantum_graph, cluster_quanta 

57from .transform import transform 

58from .prepare import prepare 

59from .submit import submit 

60from .cancel import cancel 

61from .report import report 

62from .restart import restart 

63from .bps_utils import _dump_env_info, _dump_pkg_info 

64 

65 

# Module-level logger shared by all driver functions.
_LOG = logging.getLogger(__name__)

67 

68 

def _init_submission_driver(config_file, **kwargs):
    """Initialize runtime environment.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line overrides for config values.  Keys are translated from
        pipetask-style names to bps YAML names and stored under the
        ``.bps_cmdline`` section of the returned config.  The special key
        ``runWmsSubmissionChecks`` enables early WMS plugin checks.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Batch Processing Service configuration.

    Raises
    ------
    KeyError
        Raised if the config uses the deprecated ``outCollection`` key, is
        missing ``outputRun`` or ``submitPath``, or — when submission checks
        are requested — is missing ``wmsServiceClass``.
    OSError
        Raised if the submit directory cannot be created (including when it
        already exists).
    """
    config = BpsConfig(config_file, BPS_SEARCH_ORDER)

    # Override config with command-line values.
    # Handle diffs between pipetask argument names vs bps yaml
    translation = {
        "input": "inCollection",
        "output_run": "outputRun",
        "qgraph": "qgraphFile",
        "pipeline": "pipelineYaml",
        "wms_service": "wmsServiceClass",
    }
    for key, value in kwargs.items():
        # Don't want to override config with None or empty string values.
        if value:
            # pipetask argument parser converts some values to list,
            # but bps will want string.
            if not isinstance(value, str) and isinstance(value, Iterable):
                value = ",".join(value)
            # Keys without an explicit translation are converted from
            # snake_case to camelCase to match bps YAML conventions.
            new_key = translation.get(key, re.sub(r"_(\S)", lambda match: match.group(1).upper(), key))
            config[f".bps_cmdline.{new_key}"] = value

    # If the WMS service class was not defined neither at the command line nor
    # explicitly in config file, use the value provided by the environmental
    # variable BPS_WMS_SERVICE_CLASS. If the variable is not set, stick to
    # the package default.
    wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", None)
    if wms_service is not None and "wmsServiceClass" not in config[".bps_cmdline"]:
        default_config = BpsConfig(BPS_DEFAULTS)
        # Matching the package default means the user did not set it in the
        # config file either, so the environment variable may take effect.
        if config["wmsServiceClass"] == default_config["wmsServiceClass"]:
            config["wmsServiceClass"] = wms_service

    # Set some initial values
    config[".bps_defined.timestamp"] = Instrument.makeCollectionTimestamp()
    if "operator" not in config:
        config[".bps_defined.operator"] = getpass.getuser()

    if "outCollection" in config:
        raise KeyError("outCollection is deprecated. Replace all outCollection references with outputRun.")

    if "outputRun" not in config:
        raise KeyError("Must specify the output run collection using outputRun")

    # Derive a unique process name from the output run if not given
    # explicitly; slashes are not valid in the names used downstream.
    if "uniqProcName" not in config:
        config[".bps_defined.uniqProcName"] = config["outputRun"].replace("/", "_")

    if "submitPath" not in config:
        raise KeyError("Must specify the submit-side run directory using submitPath")

    # If requested, run WMS plugin checks early in submission process to
    # ensure WMS has what it will need for prepare() or submit().
    if kwargs.get("runWmsSubmissionChecks", False):
        found, wms_class = config.search("wmsServiceClass")
        if not found:
            raise KeyError("Missing wmsServiceClass in bps config. Aborting.")

        # Check that can import wms service class.
        wms_service_class = doImport(wms_class)
        wms_service = wms_service_class(config)

        try:
            wms_service.run_submission_checks()
        except NotImplementedError:
            # Allow various plugins to implement only when needed to do extra
            # checks.
            _LOG.debug("run_submission_checks is not implemented in %s.", wms_class)
    else:
        _LOG.debug("Skipping submission checks.")

    # Make submit directory to contain all outputs.
    submit_path = Path(config["submitPath"])
    try:
        submit_path.mkdir(parents=True, exist_ok=False)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            reason = "Directory already exists"
        else:
            reason = exc.strerror
        # Re-raise the same OSError subclass with a clearer message;
        # `from None` suppresses the original traceback context.
        raise type(exc)(f"cannot create submit directory '{submit_path}': {reason}") from None
    config[".bps_defined.submitPath"] = str(submit_path)
    print(f"Submit dir: {submit_path}")

    # save copy of configs (orig and expanded config)
    shutil.copy2(config_file, submit_path)
    with open(f"{submit_path}/{config['uniqProcName']}_config.yaml", "w") as fh:
        config.dump(fh)

    # Dump information about runtime environment and software versions in use.
    _dump_env_info(f"{submit_path}/{config['uniqProcName']}.env.info.yaml")
    _dump_pkg_info(f"{submit_path}/{config['uniqProcName']}.pkg.info.yaml")

    return config

173 

174 

def acquire_qgraph_driver(config_file, **kwargs):
    """Read a quantum graph from a file or create one from pipeline definition.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line overrides passed through to submission initialization.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        A graph representing quanta.
    """
    # Shared keyword arguments for the stage timers.
    timer_opts = dict(log=_LOG, level=logging.INFO, prefix=None, mem_usage=True,
                      mem_unit=DEFAULT_MEM_UNIT, mem_fmt=DEFAULT_MEM_FMT)

    _LOG.info("Initializing execution environment")
    with time_this(msg="Initializing execution environment completed", **timer_opts):
        config = _init_submission_driver(config_file, **kwargs)
        submit_path = config[".bps_defined.submitPath"]
        if _LOG.isEnabledFor(logging.INFO):
            peaks = tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage())
            _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    _LOG.info("Starting acquire stage (generating and/or reading quantum graph)")
    with time_this(msg="Acquire stage completed", **timer_opts):
        qgraph_file, qgraph, execution_butler_dir = acquire_quantum_graph(config, out_prefix=submit_path)
        if _LOG.isEnabledFor(logging.INFO):
            peaks = tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage())
            _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    # Record acquire-stage outputs for later stages.
    config[".bps_defined.executionButlerDir"] = execution_butler_dir
    config[".bps_defined.runQgraphFile"] = qgraph_file
    return config, qgraph

211 

212 

def cluster_qgraph_driver(config_file, **kwargs):
    """Group quanta into clusters.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line overrides passed through to the acquire stage.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    clustered_qgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        A graph representing clustered quanta.
    """
    config, qgraph = acquire_qgraph_driver(config_file, **kwargs)

    _LOG.info("Starting cluster stage (grouping quanta into jobs)")
    with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Cluster stage completed",
                   mem_usage=True, mem_unit=DEFAULT_MEM_UNIT, mem_fmt=DEFAULT_MEM_FMT):
        clustered_qgraph = cluster_quanta(config, qgraph, config["uniqProcName"])
        if _LOG.isEnabledFor(logging.INFO):
            peaks = tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage())
            _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    # Optionally persist debugging artifacts of the clustered graph.
    submit_path = config[".bps_defined.submitPath"]
    if config.search("saveClusteredQgraph", opt={"default": False})[1]:
        clustered_qgraph.save(os.path.join(submit_path, "bps_clustered_qgraph.pickle"))
    if config.search("saveDot", opt={"default": False})[1]:
        clustered_qgraph.draw(os.path.join(submit_path, "bps_clustered_qgraph.dot"))
    return config, clustered_qgraph

246 

247 

def transform_driver(config_file, **kwargs):
    """Create a workflow for a specific workflow management system.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line overrides passed through to earlier stages.

    Returns
    -------
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    generic_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    config, clustered_qgraph = cluster_qgraph_driver(config_file, **kwargs)
    submit_path = config[".bps_defined.submitPath"]

    _LOG.info("Starting transform stage (creating generic workflow)")
    with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Transform stage completed",
                   mem_usage=True, mem_unit=DEFAULT_MEM_UNIT, mem_fmt=DEFAULT_MEM_FMT):
        generic_workflow, generic_workflow_config = transform(config, clustered_qgraph, submit_path)
        _LOG.info("Generic workflow name '%s'", generic_workflow.name)
        if _LOG.isEnabledFor(logging.INFO):
            peaks = tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage())
            _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    # Optionally persist debugging artifacts of the generic workflow.
    if config.search("saveGenericWorkflow", opt={"default": False})[1]:
        with open(os.path.join(submit_path, "bps_generic_workflow.pickle"), "wb") as outfh:
            generic_workflow.save(outfh, "pickle")
    if config.search("saveDot", opt={"default": False})[1]:
        with open(os.path.join(submit_path, "bps_generic_workflow.dot"), "w") as outfh:
            generic_workflow.draw(outfh, "dot")
    return generic_workflow_config, generic_workflow

285 

286 

def prepare_driver(config_file, **kwargs):
    """Create a representation of the generic workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line overrides passed through to earlier stages.

    Returns
    -------
    wms_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    # Run the (cheap) WMS checks up front unless the caller opted out.
    kwargs.setdefault("runWmsSubmissionChecks", True)
    wf_config, generic_workflow = transform_driver(config_file, **kwargs)
    submit_path = wf_config[".bps_defined.submitPath"]

    _LOG.info("Starting prepare stage (creating specific implementation of workflow)")
    with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Prepare stage completed",
                   mem_usage=True, mem_unit=DEFAULT_MEM_UNIT, mem_fmt=DEFAULT_MEM_FMT):
        wms_workflow = prepare(wf_config, generic_workflow, submit_path)
        if _LOG.isEnabledFor(logging.INFO):
            peaks = tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage())
            _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    # The WMS-specific workflow shares the config produced by the transform.
    return wf_config, wms_workflow

317 

318 

def submit_driver(config_file, **kwargs):
    """Submit workflow for execution.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line overrides passed through to earlier stages.
    """
    # Run the (cheap) WMS checks up front unless the caller opted out.
    kwargs.setdefault("runWmsSubmissionChecks", True)

    _LOG.info("DISCLAIMER: All values regarding memory consumption reported below are approximate and may "
              "not accurately reflect actual memory usage by the bps process.")

    timer_opts = dict(log=_LOG, level=logging.INFO, prefix=None, mem_usage=True,
                      mem_unit=DEFAULT_MEM_UNIT, mem_fmt=DEFAULT_MEM_FMT)

    _LOG.info("Starting submission process")
    with time_this(msg="Completed entire submission process", **timer_opts):
        wms_workflow_config, wms_workflow = prepare_driver(config_file, **kwargs)

        _LOG.info("Starting submit stage")
        with time_this(msg="Completed submit stage", **timer_opts):
            submit(wms_workflow_config, wms_workflow)
            _LOG.info("Run '%s' submitted for execution with id '%s'", wms_workflow.name, wms_workflow.run_id)
            if _LOG.isEnabledFor(logging.INFO):
                peaks = tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}"
                              for val in get_peak_mem_usage())
                _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    print(f"Run Id: {wms_workflow.run_id}")
    print(f"Run Name: {wms_workflow.name}")

348 

349 

def restart_driver(wms_service, run_id):
    """Restart a failed workflow.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.
    run_id : `str`
        Id or path of workflow that need to be restarted.
    """
    # Fall back to the environment variable, then the package default.
    if wms_service is None:
        fallback = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", fallback)

    new_run_id, run_name, message = restart(wms_service, run_id)
    if new_run_id is None:
        print(f"Restart failed: {message}" if message else "Restart failed: Unknown error")
        return

    # When the run id is a submit directory, refresh the environment and
    # package info dumps to reflect the restart.
    if Path(run_id).exists():
        _dump_env_info(f"{run_id}/{run_name}.env.info.yaml")
        _dump_pkg_info(f"{run_id}/{run_name}.pkg.info.yaml")
    print(f"Run Id: {new_run_id}")
    print(f"Run Name: {run_name}")

377 

378 

def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
    """Print out summary of jobs submitted for execution.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.
    run_id : `str`
        A run id the report will be restricted to.
    user : `str`
        A user name the report will be restricted to.
    hist_days : `int`
        Number of days.
    pass_thru : `str`
        A string to pass directly to the WMS service class.
    is_global : `bool`, optional
        If set, all available job queues will be queried for job information.
        Defaults to False which means that only a local job queue will be
        queried for information.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    # Fall back to the environment variable, then the package default.
    if wms_service is None:
        fallback = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", fallback)
    report(wms_service, run_id, user, hist_days, pass_thru, is_global=is_global)

407 

def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):
    """Cancel submitted workflows.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.
    run_id : `str`
        ID or path of job that should be canceled.
    user : `str`
        User whose submitted jobs should be canceled.
    require_bps : `bool`
        Whether to require given run_id/user to be a bps submitted job.
    pass_thru : `str`
        Information to pass through to WMS.
    is_global : `bool`, optional
        If set, all available job queues will be checked for jobs to cancel.
        Defaults to False which means that only a local job queue will be
        checked.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    # Fall back to the environment variable, then the package default.
    if wms_service is None:
        fallback = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", fallback)
    cancel(wms_service, run_id, user, require_bps, pass_thru, is_global=is_global)