Coverage for python/lsst/ctrl/bps/drivers.py: 16%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

158 statements  

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Driver functions for each subcommand. 

23 

24Driver functions ensure that all setup work is done before running 

25the subcommand method. 

26""" 

27 

28 

# Public API of this module: the driver functions for the subcommands.
__all__ = [
    "acquire_qgraph_driver",
    "cluster_qgraph_driver",
    "transform_driver",
    "prepare_driver",
    "submit_driver",
    "report_driver",
    "restart_driver",
    "cancel_driver",
]

39 

40 

41import errno 

42import getpass 

43import logging 

44import os 

45import re 

46import shutil 

47from collections.abc import Iterable 

48from pathlib import Path 

49 

50from lsst.obs.base import Instrument 

51from lsst.utils import doImport 

52from lsst.utils.timer import time_this 

53 

54from . import BPS_DEFAULTS, BPS_SEARCH_ORDER, BpsConfig 

55from .pre_transform import acquire_quantum_graph, cluster_quanta 

56from .transform import transform 

57from .prepare import prepare 

58from .submit import submit 

59from .cancel import cancel 

60from .report import report 

61from .restart import restart 

62from .bps_utils import _dump_env_info, _dump_pkg_info 

63 

64 

65_LOG = logging.getLogger(__name__) 

66 

67 

def _init_submission_driver(config_file, **kwargs):
    """Initialize runtime environment.

    Reads the configuration, applies command-line and environment
    overrides, validates required settings, creates the submit directory
    and saves the configuration and runtime information in it.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line values.  Truthy values are stored in the
        ``bps_cmdline`` section of the configuration.  If
        ``runWmsSubmissionChecks`` is truthy, the submission checks of the
        WMS service class are run as well.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Batch Processing Service configuration.

    Raises
    ------
    KeyError
        Raised if the deprecated ``outCollection`` is present, if
        ``outputRun`` or ``submitPath`` is missing, or if submission checks
        were requested but ``wmsServiceClass`` cannot be found.
    OSError
        Raised if the submit directory cannot be created, which includes
        the case where it already exists.
    """
    config = BpsConfig(config_file, BPS_SEARCH_ORDER)

    _apply_cmdline_overrides(config, kwargs)
    _apply_wms_service_from_env(config)

    # Set some initial values
    config[".bps_defined.timestamp"] = Instrument.makeCollectionTimestamp()
    if "operator" not in config:
        config[".bps_defined.operator"] = getpass.getuser()

    if "outCollection" in config:
        raise KeyError("outCollection is deprecated. Replace all outCollection references with outputRun.")

    if "outputRun" not in config:
        raise KeyError("Must specify the output run collection using outputRun")

    if "uniqProcName" not in config:
        config[".bps_defined.uniqProcName"] = config["outputRun"].replace("/", "_")

    if "submitPath" not in config:
        raise KeyError("Must specify the submit-side run directory using submitPath")

    # If requested, run WMS plugin checks early in submission process to
    # ensure WMS has what it will need for prepare() or submit().
    if kwargs.get("runWmsSubmissionChecks", False):
        _run_wms_submission_checks(config)
    else:
        _LOG.debug("Skipping submission checks.")

    # Make submit directory to contain all outputs.
    submit_path = _make_submit_dir(config["submitPath"])
    config[".bps_defined.submitPath"] = str(submit_path)

    # Save copies of the configs (original and expanded).
    shutil.copy2(config_file, submit_path)
    with open(f"{submit_path}/{config['uniqProcName']}_config.yaml", "w") as fh:
        config.dump(fh)

    # Dump information about runtime environment and software versions in use.
    _dump_env_info(f"{submit_path}/{config['uniqProcName']}.env.info.yaml")
    _dump_pkg_info(f"{submit_path}/{config['uniqProcName']}.pkg.info.yaml")

    return config


def _apply_cmdline_overrides(config, cmdline_values):
    """Store truthy command-line values in the ``bps_cmdline`` config section."""
    # Handle diffs between pipetask argument names vs bps yaml
    translation = {
        "input": "inCollection",
        "output_run": "outputRun",
        "qgraph": "qgraphFile",
        "pipeline": "pipelineYaml",
        "wms_service": "wmsServiceClass",
    }
    for key, value in cmdline_values.items():
        # Don't want to override config with None or empty string values.
        if not value:
            continue
        # pipetask argument parser converts some values to list,
        # but bps will want string.
        if not isinstance(value, str) and isinstance(value, Iterable):
            value = ",".join(value)
        # Keys without an explicit translation go from snake_case to
        # camelCase.
        new_key = translation.get(key, re.sub(r"_(\S)", lambda match: match.group(1).upper(), key))
        config[f".bps_cmdline.{new_key}"] = value


def _apply_wms_service_from_env(config):
    """Use BPS_WMS_SERVICE_CLASS if the WMS class was not set elsewhere."""
    # If the WMS service class was defined neither at the command line nor
    # explicitly in the config file, use the value provided by the
    # environment variable BPS_WMS_SERVICE_CLASS.  If the variable is not
    # set, stick to the package default.
    env_wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", None)
    if env_wms_service is not None and "wmsServiceClass" not in config[".bps_cmdline"]:
        default_config = BpsConfig(BPS_DEFAULTS)
        # A value equal to the package default is treated as "not set
        # explicitly in the config file".
        if config["wmsServiceClass"] == default_config["wmsServiceClass"]:
            config["wmsServiceClass"] = env_wms_service


def _run_wms_submission_checks(config):
    """Instantiate the configured WMS service class and run its checks."""
    found, wms_class = config.search("wmsServiceClass")
    if not found:
        raise KeyError("Missing wmsServiceClass in bps config. Aborting.")

    # Check that can import wms service class.
    wms_service_class = doImport(wms_class)
    wms_service = wms_service_class(config)

    try:
        wms_service.run_submission_checks()
    except NotImplementedError:
        # Allow various plugins to implement only when needed to do extra
        # checks.
        _LOG.debug("run_submission_checks is not implemented in %s.", wms_class)


def _make_submit_dir(submit_dir):
    """Create a new submit directory and return it as a `pathlib.Path`."""
    submit_path = Path(submit_dir)
    try:
        # An already existing directory is an error (exist_ok=False).
        submit_path.mkdir(parents=True, exist_ok=False)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            reason = "Directory already exists"
        else:
            reason = exc.strerror
        # Raise the same exception type with a message naming the
        # directory; "from None" drops the original exception context.
        raise type(exc)(f"cannot create submit directory '{submit_path}': {reason}") from None
    return submit_path

171 

172 

def acquire_qgraph_driver(config_file, **kwargs):
    """Initialize the submission and obtain its quantum graph.

    The quantum graph is either read from a file or created from a
    pipeline definition.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Additional values forwarded to the submission initialization.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        A graph representing quanta.
    """
    config = _init_submission_driver(config_file, **kwargs)
    out_dir = config[".bps_defined.submitPath"]

    _LOG.info("Starting acquire stage (generating and/or reading quantum graph)")
    with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Acquire stage completed"):
        graph_file, graph, butler_dir = acquire_quantum_graph(config, out_prefix=out_dir)

    # Record where the acquire stage put its products.
    config[".bps_defined.executionButlerDir"] = butler_dir
    config[".bps_defined.runQgraphFile"] = graph_file
    return config, graph

198 

199 

def cluster_qgraph_driver(config_file, **kwargs):
    """Run the acquire stage, then group the quanta into clusters.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Additional values forwarded to the acquire stage.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    clustered_qgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        A graph representing clustered quanta.
    """
    config, qgraph = acquire_qgraph_driver(config_file, **kwargs)

    _LOG.info("Starting cluster stage (grouping quanta into jobs)")
    with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Cluster stage completed"):
        clustered = cluster_quanta(config, qgraph, config["uniqProcName"])

    out_dir = config[".bps_defined.submitPath"]

    # Optional outputs; both are off unless enabled in the configuration.
    want_pickle = config.search("saveClusteredQgraph", opt={"default": False})[1]
    if want_pickle:
        pickle_name = os.path.join(out_dir, "bps_clustered_qgraph.pickle")
        clustered.save(pickle_name)

    want_dot = config.search("saveDot", opt={"default": False})[1]
    if want_dot:
        dot_name = os.path.join(out_dir, "bps_clustered_qgraph.dot")
        clustered.draw(dot_name)

    return config, clustered

229 

230 

def transform_driver(config_file, **kwargs):
    """Run the cluster stage, then create the generic workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Additional values forwarded to the cluster stage.

    Returns
    -------
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    generic_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Workflow created by the transform stage.
        NOTE(review): the type is carried over from the previous docstring,
        while the log messages call this a generic workflow -- confirm the
        actual class.
    """
    config, clustered_qgraph = cluster_qgraph_driver(config_file, **kwargs)
    out_dir = config[".bps_defined.submitPath"]

    _LOG.info("Starting transform stage (creating generic workflow)")
    with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Transform stage completed"):
        workflow, workflow_config = transform(config, clustered_qgraph, out_dir)
        _LOG.info("Generic workflow name '%s'", workflow.name)

    # Optional outputs; both are off unless enabled in the configuration.
    want_pickle = config.search("saveGenericWorkflow", opt={"default": False})[1]
    if want_pickle:
        pickle_name = os.path.join(out_dir, "bps_generic_workflow.pickle")
        with open(pickle_name, "wb") as outfh:
            workflow.save(outfh, "pickle")

    want_dot = config.search("saveDot", opt={"default": False})[1]
    if want_dot:
        dot_name = os.path.join(out_dir, "bps_generic_workflow.dot")
        with open(dot_name, "w") as outfh:
            workflow.draw(outfh, "dot")

    return workflow_config, workflow

264 

265 

def prepare_driver(config_file, **kwargs):
    """Run the transform stage, then create the WMS-specific workflow.

    Prints the submit directory of the prepared workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Additional values forwarded to the transform stage.  Unless given
        explicitly, ``runWmsSubmissionChecks`` is set to `True`.

    Returns
    -------
    wms_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    # Submission checks are on by default from this stage onward.
    if "runWmsSubmissionChecks" not in kwargs:
        kwargs["runWmsSubmissionChecks"] = True

    workflow_config, generic_workflow = transform_driver(config_file, **kwargs)
    out_dir = workflow_config[".bps_defined.submitPath"]

    _LOG.info("Starting prepare stage (creating specific implementation of workflow)")
    with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Prepare stage completed"):
        wms_workflow = prepare(workflow_config, generic_workflow, out_dir)

    print(f"Submit dir: {wms_workflow.submit_path}")

    # The configuration returned by the transform stage is handed back as is.
    return workflow_config, wms_workflow

293 

294 

def submit_driver(config_file, **kwargs):
    """Prepare a workflow and submit it for execution.

    Prints the id and the name of the submitted run.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Additional values forwarded to the prepare stage.  Unless given
        explicitly, ``runWmsSubmissionChecks`` is set to `True`.
    """
    if "runWmsSubmissionChecks" not in kwargs:
        kwargs["runWmsSubmissionChecks"] = True

    _LOG.info("Starting submission process")
    # The outer timer spans every stage; the inner one only the submit stage.
    with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed entire submission process"):
        wms_config, workflow = prepare_driver(config_file, **kwargs)

        _LOG.info("Starting submit stage")
        with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed submit stage"):
            submit(wms_config, workflow)
            _LOG.info("Run '%s' submitted for execution with id '%s'", workflow.name, workflow.run_id)

    print(f"Run Id: {workflow.run_id}")
    print(f"Run Name: {workflow.name}")

316 

317 

def restart_driver(wms_service, run_id):
    """Restart a failed workflow.

    Prints the new run id and run name on success, or the reason of the
    failure otherwise.

    Parameters
    ----------
    wms_service : `str`
        Name of the WMS service class.  If `None`, the value of the
        environment variable BPS_WMS_SERVICE_CLASS is used or, when that is
        not set, ``wmsServiceClass`` from the package defaults.
    run_id : `str`
        Id or path of the workflow that needs to be restarted.
    """
    if wms_service is None:
        default_config = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", default_config["wmsServiceClass"])

    new_run_id, run_name, message = restart(wms_service, run_id)

    if new_run_id is None:
        print(f"Restart failed: {message}" if message else "Restart failed: Unknown error")
        return

    # Runtime information can only be written when ``run_id`` is a
    # directory.  Checking mere existence is not enough: a regular file that
    # happens to be named like the run id would make the dumps below fail
    # after the restart itself has already succeeded.
    if Path(run_id).is_dir():
        _dump_env_info(f"{run_id}/{run_name}.env.info.yaml")
        _dump_pkg_info(f"{run_id}/{run_name}.pkg.info.yaml")
    print(f"Run Id: {new_run_id}")
    print(f"Run Name: {run_name}")

345 

346 

def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
    """Print out summary of jobs submitted for execution.

    Parameters
    ----------
    wms_service : `str`
        Name of the WMS service class.  If `None`, the value of the
        environment variable BPS_WMS_SERVICE_CLASS is used or, when that is
        not set, ``wmsServiceClass`` from the package defaults.
    run_id : `str`
        A run id the report will be restricted to.
    user : `str`
        A user name the report will be restricted to.
    hist_days : `int`
        Number of days; forwarded unchanged to the report function.
    pass_thru : `str`
        A string to pass directly to the WMS service class.
    is_global : `bool`, optional
        If set, all available job queues will be queried for job information.
        Defaults to False which means that only a local job queue will be
        queried for information.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        # Environment variable first, package default as the fallback.
        package_default = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", package_default)

    report(wms_service, run_id, user, hist_days, pass_thru, is_global=is_global)

374 

375 

def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):
    """Cancel submitted workflows.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.  If `None`,
        the value of the environment variable BPS_WMS_SERVICE_CLASS is used
        or, when that is not set, ``wmsServiceClass`` from the package
        defaults.
    run_id : `str`
        ID or path of job that should be canceled.
    user : `str`
        User whose submitted jobs should be canceled.
    require_bps : `bool`
        Whether to require given run_id/user to be a bps submitted job.
    pass_thru : `str`
        Information to pass through to WMS.
    is_global : `bool`, optional
        If set, all available job queues will be checked for jobs to cancel.
        Defaults to False which means that only a local job queue will be
        checked.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        # Environment variable first, package default as the fallback.
        package_default = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", package_default)

    cancel(wms_service, run_id, user, require_bps, pass_thru, is_global=is_global)