Coverage for python/lsst/ctrl/bps/drivers.py: 16%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

158 statements  

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Driver functions for each subcommand. 

23 

24Driver functions ensure that all setup work is done before running 

25the subcommand method. 

26""" 

27 

28 

29__all__ = [ 

30 "acquire_qgraph_driver", 

31 "cluster_qgraph_driver", 

32 "transform_driver", 

33 "prepare_driver", 

34 "submit_driver", 

35 "report_driver", 

36 "restart_driver", 

37 "cancel_driver", 

38] 

39 

40 

41import errno 

42import getpass 

43import logging 

44import os 

45import re 

46import shutil 

47from collections.abc import Iterable 

48from pathlib import Path 

49 

50from lsst.obs.base import Instrument 

51from lsst.utils import doImport 

52from lsst.utils.timer import time_this 

53 

54from . import BPS_DEFAULTS, BPS_SEARCH_ORDER, BpsConfig 

55from .pre_transform import acquire_quantum_graph, cluster_quanta 

56from .transform import transform 

57from .prepare import prepare 

58from .submit import submit 

59from .cancel import cancel 

60from .report import report 

61from .restart import restart 

62from .bps_utils import _dump_env_info, _dump_pkg_info 

63 

64 

65_LOG = logging.getLogger(__name__) 

66 

67 

def _apply_cmdline_overrides(config, cmdline_values):
    """Copy non-empty command-line values into the ``.bps_cmdline`` section.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration to update in place.
    cmdline_values : `dict` [`str`, `typing.Any`]
        Values given on the command line, keyed by argument name.
    """
    # Handle diffs between pipetask argument names vs bps yaml
    translation = {
        "input": "inCollection",
        "output_run": "outputRun",
        "qgraph": "qgraphFile",
        "pipeline": "pipelineYaml",
        "wms_service": "wmsServiceClass",
    }
    for key, value in cmdline_values.items():
        # Don't want to override config with None or empty string values.
        if not value:
            continue
        # pipetask argument parser converts some values to list,
        # but bps will want string.
        if not isinstance(value, str) and isinstance(value, Iterable):
            value = ",".join(value)
        # Keys without an explicit translation are converted from snake_case
        # to camelCase.
        new_key = translation.get(key, re.sub(r"_(\S)", lambda match: match.group(1).upper(), key))
        config[f".bps_cmdline.{new_key}"] = value


def _apply_wms_service_env_override(config):
    """Apply the BPS_WMS_SERVICE_CLASS environment variable, if appropriate.

    The variable is used only when the WMS service class was not given on
    the command line and the configured value still equals the package
    default; otherwise the configuration is left untouched.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration to update in place.
    """
    wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", None)
    if wms_service is not None and "wmsServiceClass" not in config[".bps_cmdline"]:
        default_config = BpsConfig(BPS_DEFAULTS)
        if config["wmsServiceClass"] == default_config["wmsServiceClass"]:
            config["wmsServiceClass"] = wms_service


def _run_wms_submission_checks(config):
    """Import the configured WMS service class and run its submission checks.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration naming the WMS service class; it is also passed to
        the service class constructor.

    Raises
    ------
    KeyError
        Raised if ``wmsServiceClass`` cannot be found in the configuration.
    """
    found, wms_class = config.search("wmsServiceClass")
    if not found:
        raise KeyError("Missing wmsServiceClass in bps config.  Aborting.")

    # Check that can import wms service class.
    wms_service_class = doImport(wms_class)
    wms_service = wms_service_class(config)

    try:
        wms_service.run_submission_checks()
    except NotImplementedError:
        # Allow various plugins to implement only when needed to do extra
        # checks.
        _LOG.debug("run_submission_checks is not implemented in %s.", wms_class)


def _create_submit_dir(submit_path):
    """Create the submit directory, refusing to reuse an existing one.

    Parameters
    ----------
    submit_path : `pathlib.Path`
        Directory to create (missing parents are created as well).

    Raises
    ------
    OSError
        Raised (as the same subclass the OS reported) if the directory
        already exists or cannot be created.
    """
    try:
        submit_path.mkdir(parents=True, exist_ok=False)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            reason = "Directory already exists"
        else:
            reason = exc.strerror
        # Re-raise the same exception type with a friendlier message and
        # without the original traceback context.
        raise type(exc)(f"cannot create submit directory '{submit_path}': {reason}") from None


def _init_submission_driver(config_file, **kwargs):
    """Initialize runtime environment.

    Loads the configuration, applies command-line and environment overrides,
    validates required settings, optionally runs the WMS plugin's submission
    checks, creates the submit directory, and saves copies of the
    configuration and runtime information into it.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Command-line values. Every non-empty value is stored under
        ``.bps_cmdline``; ``runWmsSubmissionChecks`` additionally controls
        whether the WMS submission checks are run (default: not run).

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Batch Processing Service configuration.

    Raises
    ------
    KeyError
        Raised if ``outCollection`` is present, if ``outputRun`` or
        ``submitPath`` is missing, or if submission checks were requested
        but ``wmsServiceClass`` cannot be found.
    OSError
        Raised if the submit directory already exists or cannot be created.
    """
    config = BpsConfig(config_file, BPS_SEARCH_ORDER)

    # Override config with command-line values.
    _apply_cmdline_overrides(config, kwargs)

    # If the WMS service class was not defined neither at the command line nor
    # explicitly in config file, use the value provided by the environmental
    # variable BPS_WMS_SERVICE_CLASS. If the variable is not set, stick to
    # the package default.
    _apply_wms_service_env_override(config)

    # Set some initial values
    config[".bps_defined.timestamp"] = Instrument.makeCollectionTimestamp()
    if "operator" not in config:
        config[".bps_defined.operator"] = getpass.getuser()

    if "outCollection" in config:
        raise KeyError("outCollection is deprecated.  Replace all outCollection references with outputRun.")

    if "outputRun" not in config:
        raise KeyError("Must specify the output run collection using outputRun")

    if "uniqProcName" not in config:
        config[".bps_defined.uniqProcName"] = config["outputRun"].replace("/", "_")

    if "submitPath" not in config:
        raise KeyError("Must specify the submit-side run directory using submitPath")

    # If requested, run WMS plugin checks early in submission process to
    # ensure WMS has what it will need for prepare() or submit().
    if kwargs.get("runWmsSubmissionChecks", False):
        _run_wms_submission_checks(config)
    else:
        _LOG.debug("Skipping submission checks.")

    # Make submit directory to contain all outputs.
    submit_path = Path(config["submitPath"])
    _create_submit_dir(submit_path)
    config[".bps_defined.submitPath"] = str(submit_path)
    print(f"Submit dir: {submit_path}")

    # save copy of configs (orig and expanded config)
    shutil.copy2(config_file, submit_path)
    with open(f"{submit_path}/{config['uniqProcName']}_config.yaml", "w") as fh:
        config.dump(fh)

    # Dump information about runtime environment and software versions in use.
    _dump_env_info(f"{submit_path}/{config['uniqProcName']}.env.info.yaml")
    _dump_pkg_info(f"{submit_path}/{config['uniqProcName']}.pkg.info.yaml")

    return config

172 

173 

def acquire_qgraph_driver(config_file, **kwargs):
    """Initialize the submission and obtain the quantum graph for the run.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Command-line values forwarded to the submission initialization.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration; the execution butler directory and the
        quantum graph file are recorded under ``.bps_defined``.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        A graph representing quanta.
    """
    bps_config = _init_submission_driver(config_file, **kwargs)
    out_dir = bps_config[".bps_defined.submitPath"]

    _LOG.info("Starting acquire stage (generating and/or reading quantum graph)")
    with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Acquire stage completed"):
        graph_file, quantum_graph, butler_dir = acquire_quantum_graph(bps_config, out_prefix=out_dir)

    # Record where the acquire stage put its products so that later stages
    # can find them through the configuration.
    bps_config[".bps_defined.executionButlerDir"] = butler_dir
    bps_config[".bps_defined.runQgraphFile"] = graph_file
    return bps_config, quantum_graph

199 

200 

def cluster_qgraph_driver(config_file, **kwargs):
    """Run the acquire stage, then group the resulting quanta into clusters.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Command-line values forwarded to `acquire_qgraph_driver`.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    clustered_qgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        A graph representing clustered quanta.
    """
    bps_config, quantum_graph = acquire_qgraph_driver(config_file, **kwargs)

    _LOG.info("Starting cluster stage (grouping quanta into jobs)")
    with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Cluster stage completed"):
        clustered = cluster_quanta(bps_config, quantum_graph, bps_config["uniqProcName"])

    out_dir = bps_config[".bps_defined.submitPath"]

    # Optional outputs; both are off unless enabled in the configuration.
    _, want_pickle = bps_config.search("saveClusteredQgraph", opt={"default": False})
    if want_pickle:
        pickle_path = os.path.join(out_dir, "bps_clustered_qgraph.pickle")
        clustered.save(pickle_path)

    _, want_dot = bps_config.search("saveDot", opt={"default": False})
    if want_dot:
        dot_path = os.path.join(out_dir, "bps_clustered_qgraph.dot")
        clustered.draw(dot_path)

    return bps_config, clustered

230 

231 

def transform_driver(config_file, **kwargs):
    """Run the cluster stage, then create the generic workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Command-line values forwarded to `cluster_qgraph_driver`.

    Returns
    -------
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        The generic workflow produced by the transform stage from the
        clustered quantum graph.
    """
    bps_config, clustered = cluster_qgraph_driver(config_file, **kwargs)
    out_dir = bps_config[".bps_defined.submitPath"]

    _LOG.info("Starting transform stage (creating generic workflow)")
    with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Transform stage completed"):
        workflow, workflow_config = transform(bps_config, clustered, out_dir)
        _LOG.info("Generic workflow name '%s'", workflow.name)

    # Optional outputs; both are off unless enabled in the configuration.
    _, want_pickle = bps_config.search("saveGenericWorkflow", opt={"default": False})
    if want_pickle:
        pickle_path = os.path.join(out_dir, "bps_generic_workflow.pickle")
        with open(pickle_path, "wb") as outfh:
            workflow.save(outfh, "pickle")

    _, want_dot = bps_config.search("saveDot", opt={"default": False})
    if want_dot:
        dot_path = os.path.join(out_dir, "bps_generic_workflow.dot")
        with open(dot_path, "w") as outfh:
            workflow.draw(outfh, "dot")

    return workflow_config, workflow

265 

266 

def prepare_driver(config_file, **kwargs):
    """Run the transform stage, then create the WMS-specific workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Command-line values forwarded to `transform_driver`. Unless the
        caller sets ``runWmsSubmissionChecks``, it defaults to `True` here.

    Returns
    -------
    wms_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    kwargs.setdefault("runWmsSubmissionChecks", True)
    workflow_config, generic_workflow = transform_driver(config_file, **kwargs)
    out_dir = workflow_config[".bps_defined.submitPath"]

    _LOG.info("Starting prepare stage (creating specific implementation of workflow)")
    with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Prepare stage completed"):
        wms_workflow = prepare(workflow_config, generic_workflow, out_dir)

    # The generic workflow configuration is returned unchanged as the WMS
    # workflow configuration.
    return workflow_config, wms_workflow

293 

294 

def submit_driver(config_file, **kwargs):
    """Prepare a workflow and submit it for execution.

    The id and name of the submitted run are printed to standard output.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Command-line values forwarded to `prepare_driver`. Unless the
        caller sets ``runWmsSubmissionChecks``, it defaults to `True` here.
    """
    kwargs.setdefault("runWmsSubmissionChecks", True)

    _LOG.info("Starting submission process")
    # The outer timer covers every stage; the inner one only the submit stage.
    with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed entire submission process"):
        workflow_config, workflow = prepare_driver(config_file, **kwargs)

        _LOG.info("Starting submit stage")
        with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed submit stage"):
            submit(workflow_config, workflow)
            _LOG.info("Run '%s' submitted for execution with id '%s'", workflow.name, workflow.run_id)

    print(f"Run Id: {workflow.run_id}")
    print(f"Run Name: {workflow.name}")

316 

317 

def restart_driver(wms_service, run_id):
    """Restart a failed workflow and print the outcome.

    Parameters
    ----------
    wms_service : `str`
        Name of the class. If `None`, the value of the environment variable
        BPS_WMS_SERVICE_CLASS is used, falling back to the package default.
    run_id : `str`
        Id or path of workflow that need to be restarted.
    """
    if wms_service is None:
        fallback = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", fallback)

    new_run_id, run_name, message = restart(wms_service, run_id)

    if new_run_id is None:
        print(f"Restart failed: {message}" if message else "Restart failed: Unknown error")
        return

    # Runtime information is dumped only when run_id is an existing path.
    if Path(run_id).exists():
        _dump_env_info(f"{run_id}/{run_name}.env.info.yaml")
        _dump_pkg_info(f"{run_id}/{run_name}.pkg.info.yaml")
    print(f"Run Id: {new_run_id}")
    print(f"Run Name: {run_name}")

345 

346 

def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
    """Print out summary of jobs submitted for execution.

    Parameters
    ----------
    wms_service : `str`
        Name of the class. If `None`, the value of the environment variable
        BPS_WMS_SERVICE_CLASS is used, falling back to the package default.
    run_id : `str`
        A run id the report will be restricted to.
    user : `str`
        A user name the report will be restricted to.
    hist_days : `int`
        Number of days.
    pass_thru : `str`
        A string to pass directly to the WMS service class.
    is_global : `bool`, optional
        If set, all available job queues will be queried for job information.
        Defaults to False which means that only a local job queue will be
        queried for information.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        fallback = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", fallback)

    report(wms_service, run_id, user, hist_days, pass_thru, is_global=is_global)

374 

375 

def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):
    """Cancel submitted workflows.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class. If `None`,
        the value of the environment variable BPS_WMS_SERVICE_CLASS is
        used, falling back to the package default.
    run_id : `str`
        ID or path of job that should be canceled.
    user : `str`
        User whose submitted jobs should be canceled.
    require_bps : `bool`
        Whether to require given run_id/user to be a bps submitted job.
    pass_thru : `str`
        Information to pass through to WMS.
    is_global : `bool`, optional
        If set, all available job queues will be checked for jobs to cancel.
        Defaults to False which means that only a local job queue will be
        checked.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        fallback = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", fallback)

    cancel(wms_service, run_id, user, require_bps, pass_thru, is_global=is_global)