# This file is part of ctrl_bps.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

22"""Interface between generic workflow to HTCondor workflow system. 

23""" 

24 

25__all__ = ["HTCondorService", "HTCondorWorkflow"] 

26 

27 

28import dataclasses 

29import os 

30import re 

31import logging 

32from datetime import datetime, timedelta 

33from pathlib import Path 

34 

35import htcondor 

36 

37from ... import ( 

38 BaseWmsWorkflow, 

39 BaseWmsService, 

40 GenericWorkflow, 

41 GenericWorkflowJob, 

42 WmsRunReport, 

43 WmsJobReport, 

44 WmsStates 

45) 

46from ...bps_utils import chdir 

47from .lssthtc import ( 

48 HTCDag, 

49 HTCJob, 

50 MISSING_ID, 

51 JobStatus, 

52 NodeStatus, 

53 htc_check_dagman_output, 

54 htc_escape, 

55 htc_submit_dag, 

56 read_node_status, 

57 read_dag_log, 

58 read_dag_status, 

59 condor_q, 

60 condor_history, 

61 pegasus_name_to_label, 

62 summary_from_dag, 

63) 

64 

65 

66_LOG = logging.getLogger(__name__) 

67 

68 

69class HTCondorService(BaseWmsService): 

70 """HTCondor version of WMS service. 

71 """ 

72 def prepare(self, config, generic_workflow, out_prefix=None): 

73 """Convert generic workflow to an HTCondor DAG ready for submission. 

74 

75 Parameters 

76 ---------- 

77 config : `lsst.ctrl.bps.BpsConfig` 

78 BPS configuration that includes necessary submit/runtime 

79 information. 

80 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

81 The generic workflow (e.g., has executable name and arguments). 

82 out_prefix : `str` 

83 The root directory into which all WMS-specific files are written. 

84 

85 Returns 

86 ---------- 

87 workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow` 

88 HTCondor workflow ready to be run. 

89 """ 

90 _LOG.debug("out_prefix = '%s'", out_prefix) 

91 workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix, 

92 f"{self.__class__.__module__}." 

93 f"{self.__class__.__name__}") 

94 workflow.write(out_prefix) 

95 return workflow 

96 

97 def submit(self, workflow): 

98 """Submit a single HTCondor workflow. 

99 

100 Parameters 

101 ---------- 

102 workflow : `lsst.ctrl.bps.BaseWorkflow` 

103 A single HTCondor workflow to submit. run_id is updated after 

104 successful submission to WMS. 

105 """ 

106 # For workflow portability, internal paths are all relative. Hence 

107 # the DAG needs to be submitted to HTCondor from inside the submit 

108 # directory. 

109 with chdir(workflow.submit_path): 

110 _LOG.info("Submitting from directory: %s", os.getcwd()) 

111 htc_submit_dag(workflow.dag, dict()) 

112 workflow.run_id = workflow.dag.run_id 

113 

114 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None): 

115 """Query WMS for list of submitted WMS workflows/jobs. 

116 

117 This should be a quick lookup function to create list of jobs for 

118 other functions. 

119 

120 Parameters 

121 ---------- 

122 wms_id : `int` or `str`, optional 

123 Id or path that can be used by WMS service to look up job. 

124 user : `str`, optional 

125 User whose submitted jobs should be listed. 

126 require_bps : `bool`, optional 

127 Whether to require jobs returned in list to be bps-submitted jobs. 

128 pass_thru : `str`, optional 

129 Information to pass through to WMS. 

130 

131 Returns 

132 ------- 

133 job_ids : `list` [`Any`] 

134 Only job ids to be used by cancel and other functions. Typically 

135 this means top-level jobs (i.e., not children jobs). 

136 """ 

137 _LOG.debug("list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s", 

138 wms_id, user, require_bps, pass_thru) 

139 constraint = "" 

140 

141 if wms_id is None: 

142 if user is not None: 

143 constraint = f'(Owner == "{user}")' 

144 else: 

145 cluster_id = _wms_id_to_cluster(wms_id) 

146 if cluster_id != 0: 

147 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})" 

148 

149 if require_bps: 

150 constraint += ' && (bps_isjob == "True")' 

151 

152 if pass_thru: 

153 if "-forcex" in pass_thru: 

154 pass_thru_2 = pass_thru.replace("-forcex", "") 

155 if pass_thru_2 and not pass_thru_2.isspace(): 

156 constraint += f"&& ({pass_thru_2})" 

157 else: 

158 constraint += f" && ({pass_thru})" 

159 

160 _LOG.debug("constraint = %s", constraint) 

161 jobs = condor_q(constraint) 

162 

163 # Prune child jobs where DAG job is in queue (i.e., aren't orphans). 

164 job_ids = [] 

165 for job_id, job_info in jobs.items(): 

166 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_info.get("DAGManJobId", "None")) 

167 if "DAGManJobId" not in job_info: # orphaned job 

168 job_ids.append(job_id) 

169 else: 

170 _LOG.debug("Looking for %s", f"{job_info['DAGManJobId']}.0") 

171 _LOG.debug("\tin jobs.keys() = %s", jobs.keys()) 

172 if f"{job_info['DAGManJobId']}.0" not in jobs: 

173 job_ids.append(job_id) 

174 

175 _LOG.debug("job_ids = %s", job_ids) 

176 return job_ids 

177 

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None):
        """Return run information based upon given constraints.

        Parameters
        ----------
        wms_workflow_id : `str`
            Limit to specific run based on id.
        user : `str`
            Limit results to runs for this user.
        hist : `float`
            Limit history search to this many days.
        pass_thru : `str`
            Constraints to pass through to HTCondor.

        Returns
        -------
        runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
            Information about runs from given job information.
        message : `str`
            Extra message for report command to print. This could be pointers
            to documentation or to WMS specific commands.
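
        Examples
        --------
        A hypothetical sketch of typical usage (``service`` is an
        `HTCondorService` instance; the id and fields shown are invented)::

            runs, message = service.report(wms_workflow_id="1234.0", hist=1)
            for run in runs:
                print(run.wms_id, run.run, run.state)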

199 """ 

200 message = "" 

201 

202 if wms_workflow_id: 

203 # Explicitly checking if wms_workflow_id can be converted to a 

204 # float instead of using try/except to avoid catching a different 

205 # ValueError from _report_from_id 

206 try: 

207 float(wms_workflow_id) 

208 is_float = True 

209 except ValueError: # Don't need TypeError here as None goes to else branch. 

210 is_float = False 

211 

212 if is_float: 

213 run_reports, message = _report_from_id(float(wms_workflow_id), hist) 

214 else: 

215 run_reports, message = _report_from_path(wms_workflow_id) 

216 else: 

217 run_reports, message = _summary_report(user, hist, pass_thru) 

218 _LOG.debug("report: %s, %s", run_reports, message) 

219 

220 return list(run_reports.values()), message 

221 

222 def cancel(self, wms_id, pass_thru=None): 

223 """Cancel submitted workflows/jobs. 

224 

225 Parameters 

226 ---------- 

227 wms_id : `str` 

228 ID or path of job that should be canceled. 

229 pass_thru : `str`, optional 

230 Information to pass through to WMS. 

231 

232 Returns 

233 -------- 

234 deleted : `bool` 

235 Whether successful deletion or not. Currently, if any doubt or any 

236 individual jobs not deleted, return False. 

237 message : `str` 

238 Any message from WMS (e.g., error details). 

239 """ 

240 _LOG.debug("Canceling wms_id = %s", wms_id) 

241 

242 cluster_id = _wms_id_to_cluster(wms_id) 

243 if cluster_id == 0: 

244 deleted = False 

245 message = "Invalid id" 

246 else: 

247 _LOG.debug("Canceling cluster_id = %s", cluster_id) 

248 schedd = htcondor.Schedd() 

249 constraint = f"ClusterId == {cluster_id}" 

250 if pass_thru is not None and "-forcex" in pass_thru: 

251 pass_thru_2 = pass_thru.replace("-forcex", "") 

252 if pass_thru_2 and not pass_thru_2.isspace(): 

253 constraint += f"&& ({pass_thru_2})" 

254 _LOG.debug("JobAction.RemoveX constraint = %s", constraint) 

255 results = schedd.act(htcondor.JobAction.RemoveX, constraint) 

256 else: 

257 if pass_thru: 

258 constraint += f"&& ({pass_thru})" 

259 _LOG.debug("JobAction.Remove constraint = %s", constraint) 

260 results = schedd.act(htcondor.JobAction.Remove, constraint) 

261 _LOG.debug("Remove results: %s", results) 

262 

263 if results["TotalSuccess"] > 0 and results["TotalError"] == 0: 

264 deleted = True 

265 message = "" 

266 else: 

267 deleted = False 

268 if results["TotalSuccess"] == 0 and results["TotalError"] == 0: 

269 message = "no such bps job in batch queue" 

270 else: 

271 message = f"unknown problems deleting: {results}" 

272 

273 _LOG.debug("deleted: %s; message = %s", deleted, message) 

274 return deleted, message 

275 

276 

class HTCondorWorkflow(BaseWmsWorkflow):
    """Single HTCondor workflow.

    Parameters
    ----------
    name : `str`
        Unique name for Workflow used when naming files.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """
    def __init__(self, name, config=None):
        super().__init__(name, config)
        self.dag = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited
        htc_workflow = cls(generic_workflow.name, config)
        htc_workflow.dag = HTCDag(name=generic_workflow.name)

        _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs({"bps_wms_service": service_class,
                                      "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}"})

        # Create all DAG jobs
        for job_name in generic_workflow:
            gwjob = generic_workflow.get_job(job_name)
            htc_job = HTCondorWorkflow._create_job(config, generic_workflow, gwjob, out_prefix)
            htc_workflow.dag.add_job(htc_job)

        # Add job dependencies to the DAG
        for job_name in generic_workflow:
            htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))

        # If final job exists in generic workflow, create DAG final job
        final = generic_workflow.get_final()
        if final and isinstance(final, GenericWorkflowJob):
            final_htjob = HTCondorWorkflow._create_job(config, generic_workflow, final, out_prefix)
            if "post" not in final_htjob.dagcmds:
                final_htjob.dagcmds["post"] = f"{os.path.dirname(__file__)}/final_post.sh" \
                                              f" {final.name} $DAG_STATUS $RETURN"
            htc_workflow.dag.add_final_job(final_htjob)
        elif final and isinstance(final, GenericWorkflow):
            raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
        elif final:
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

        return htc_workflow

    @staticmethod
    def _create_job(config, generic_workflow, gwjob, out_prefix):
        """Convert GenericWorkflow job nodes to DAG jobs.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow that is being converted.
        gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
            The generic job to convert to a HTCondor job.
        out_prefix : `str`
            Directory prefix for HTCondor files.

        Returns
        -------
        htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
            The HTCondor job equivalent to the given generic job.
        """
        htc_job = HTCJob(gwjob.name, label=gwjob.label)

        curvals = dataclasses.asdict(gwjob)
        if gwjob.tags:
            curvals.update(gwjob.tags)
        found, subdir = config.search("subDirTemplate", opt={'curvals': curvals})
        if not found:
            subdir = "jobs"
        htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"

        htc_job_cmds = {
            "universe": "vanilla",
            "should_transfer_files": "YES",
            "when_to_transfer_output": "ON_EXIT_OR_EVICT",
            "transfer_executable": "False",
            "getenv": "True",

            # Exceeding memory sometimes triggers a SIGBUS error.
            # Tell htcondor to put SIGBUS jobs on hold.
            "on_exit_hold": "(ExitBySignal == true) && (ExitSignal == 7)",
            "on_exit_hold_reason": '"Job raised a signal 7. Usually means job has gone over memory limit."',
            "on_exit_hold_subcode": "34"
        }

        htc_job_cmds.update(_translate_job_cmds(config, generic_workflow, gwjob))

        # job stdout, stderr, htcondor user log.
        for key in ("output", "error", "log"):
            htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
            _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])

        _, use_shared = config.search("bpsUseShared", opt={"default": False})
        htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwjob.name, use_shared, out_prefix))

        # Add the job cmds dict to the job object.
        htc_job.add_job_cmds(htc_job_cmds)

        htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))

        # Add run level attributes to job.
        htc_job.add_job_attrs(generic_workflow.run_attrs)

        # Add job attributes to job.
        _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
        htc_job.add_job_attrs(gwjob.attrs)
        if gwjob.tags:
            htc_job.add_job_attrs({"bps_job_quanta": gwjob.tags.get("quanta_summary", "")})
        htc_job.add_job_attrs({"bps_job_name": gwjob.name,
                               "bps_job_label": gwjob.label})

        return htc_job

    def write(self, out_prefix):
        """Output HTCondor DAGMan files needed for workflow submission.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for HTCondor files.
        """
        self.submit_path = out_prefix
        os.makedirs(out_prefix, exist_ok=True)

        # Write down the workflow in HTCondor format.
        self.dag.write(out_prefix, "jobs/{self.label}")


def _translate_job_cmds(config, generic_workflow, gwjob):
    """Translate the job data that have a one-to-one mapping to HTCondor submit commands.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains the job being converted.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Generic workflow job to be converted.

    Returns
    -------
    htc_job_commands : `dict` [`str`, `Any`]
        Contains commands which can appear in the HTCondor submit description
        file.
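
    Examples
    --------
    A rough sketch, assuming the generic job carries the attributes shown
    (the values are invented)::

        gwjob.request_memory = 2048
        gwjob.request_disk = 20480
        gwjob.priority = 10
        # _translate_job_cmds(config, generic_workflow, gwjob) would then
        # include {"request_memory": "2048MB", "request_disk": "20480MB",
        #          "priority": 10}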

433 """ 

434 # Values in the job script that just are name mappings. 

435 job_translation = {"mail_to": "notify_user", 

436 "when_to_mail": "notification", 

437 "request_cpus": "request_cpus", 

438 "priority": "priority", 

439 "category": "category"} 

440 

441 jobcmds = {} 

442 for gwkey, htckey in job_translation.items(): 

443 jobcmds[htckey] = getattr(gwjob, gwkey, None) 

444 

445 # job commands that need modification 

446 if gwjob.request_disk: 

447 jobcmds["request_disk"] = f"{gwjob.request_disk}MB" 

448 

449 if gwjob.request_memory: 

450 jobcmds["request_memory"] = f"{gwjob.request_memory}MB" 

451 

452 # Assume concurrency_limit implemented using HTCondor concurrency limits. 

453 # May need to move to special site-specific implementation if sites use 

454 # other mechanisms. 

455 if gwjob.concurrency_limit: 

456 jobcmds["concurrency_limit"] = ",".join(gwjob.concurrency_limit) 

457 

458 # Handle command line 

459 if gwjob.executable.transfer_executable: 

460 jobcmds["transfer_executable"] = "True" 

461 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri) 

462 else: 

463 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri) 

464 

465 if gwjob.arguments: 

466 arguments = gwjob.arguments 

467 arguments = _replace_cmd_vars(arguments, gwjob) 

468 arguments = _replace_file_vars(config, arguments, generic_workflow, gwjob) 

469 arguments = _fix_env_var_syntax(arguments) 

470 jobcmds["arguments"] = arguments 

471 

472 # Add extra "pass-thru" job commands 

473 if gwjob.profile: 

474 for key, val in gwjob.profile.items(): 

475 jobcmds[key] = htc_escape(val) 

476 

477 return jobcmds 

478 

479 

def _translate_dag_cmds(gwjob):
    """Translate job values into DAGMan commands.

    Parameters
    ----------
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job containing values to be translated.

    Returns
    -------
    dagcmds : `dict` [`str`, `Any`]
        DAGMan commands for the job.
    """
    # Values in the dag script that are just name mappings.
    dag_translation = {"number_of_retries": "retry",
                       "retry_unless_exit": "retry_unless_exit",
                       "abort_on_value": "abort_dag_on",
                       "abort_return_value": "abort_exit"}

    dagcmds = {}
    for gwkey, htckey in dag_translation.items():
        dagcmds[htckey] = getattr(gwjob, gwkey, None)

    # Still to be coded: vars "pre_cmdline", "post_cmdline"
    return dagcmds


def _fix_env_var_syntax(oldstr):
    """Change ENV placeholders to HTCondor Env var syntax.

    Parameters
    ----------
    oldstr : `str`
        String in which environment variable syntax is to be fixed.

    Returns
    -------
    newstr : `str`
        Given string with environment variable syntax fixed.
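
    Examples
    --------
    A small sketch of the substitution this function performs:

    >>> _fix_env_var_syntax("<ENV:HOME>/repo --log <ENV:LOGDIR>/run.log")
    '$ENV(HOME)/repo --log $ENV(LOGDIR)/run.log'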

519 """ 

520 newstr = oldstr 

521 for key in re.findall(r"<ENV:([^>]+)>", oldstr): 

522 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})") 

523 return newstr 

524 

525 

526def _replace_file_vars(config, arguments, workflow, gwjob): 

527 """Replace file placeholders in command line arguments with correct 

528 physical file names. 

529 

530 Parameters 

531 ---------- 

532 config : `lsst.ctrl.bps.BpsConfig` 

533 BPS configuration that includes necessary submit/runtime 

534 information. 

535 arguments : `str` 

536 Arguments string in which to replace file placeholders. 

537 workflow : `lsst.ctrl.bps.GenericWorkflow` 

538 Generic workflow that contains file information. 

539 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

540 The job corresponding to the arguments. 

541 

542 Returns 

543 ------- 

544 arguments : `str` 

545 Given arguments string with file placeholders replaced. 

546 """ 

547 _, use_shared = config.search("bpsUseShared", opt={"default": False}) 

548 

549 # Replace input file placeholders with paths. 

550 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False): 

551 if gwfile.wms_transfer and not use_shared or not gwfile.job_shared: 

552 uri = os.path.basename(gwfile.src_uri) 

553 else: 

554 uri = gwfile.src_uri 

555 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

556 

557 # Replace output file placeholders with paths. 

558 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False): 

559 if gwfile.wms_transfer and not use_shared or not gwfile.job_shared: 

560 uri = os.path.basename(gwfile.src_uri) 

561 else: 

562 uri = gwfile.src_uri 

563 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

564 return arguments 

565 

566 

567def _replace_cmd_vars(arguments, gwjob): 

568 """Replace format-style placeholders in arguments. 

569 

570 Parameters 

571 ---------- 

572 arguments : `str` 

573 Arguments string in which to replace placeholders. 

574 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

575 Job containing values to be used to replace placeholders 

576 (in particular gwjob.cmdvals). 

577 

578 Returns 

579 ------- 

580 arguments : `str` 

581 Given arguments string with placeholders replaced. 
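
    Examples
    --------
    A minimal sketch; ``SimpleNamespace`` merely stands in for a
    `~lsst.ctrl.bps.GenericWorkflowJob`, and the placeholder name is invented:

    >>> from types import SimpleNamespace
    >>> gwjob = SimpleNamespace(cmdvals={"qgraphFile": "run.qgraph"})
    >>> _replace_cmd_vars("pipetask run -g {qgraphFile}", gwjob)
    'pipetask run -g run.qgraph'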

582 """ 

583 try: 

584 arguments = arguments.format(**gwjob.cmdvals) 

585 except (KeyError, TypeError): # TypeError in case None instead of {} 

586 _LOG.error("Could not replace command variables:\n" 

587 "arguments: %s\n" 

588 "cmdvals: %s", arguments, gwjob.cmdvals) 

589 raise 

590 return arguments 

591 

592 

593def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str): 

594 """Add job input files from generic workflow to job. 

595 

596 Parameters 

597 ---------- 

598 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

599 The generic workflow (e.g., has executable name and arguments). 

600 job_name : `str` 

601 Unique name for the job. 

602 use_shared : `bool` 

603 Whether job has access to files via shared filesystem. 

604 out_prefix : `str` 

605 The root directory into which all WMS-specific files are written. 

606 

607 Returns 

608 ------- 

609 htc_commands : `dict` [`str`, `str`] 

610 HTCondor commands for the job submission script. 

611 """ 

612 htc_commands = {} 

613 inputs = [] 

614 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True): 

615 _LOG.debug("src_uri=%s", gwf_file.src_uri) 

616 if not use_shared or not gwf_file.job_shared: 

617 inputs.append(os.path.relpath(gwf_file.src_uri, out_prefix)) 

618 

619 if inputs: 

620 htc_commands["transfer_input_files"] = ",".join(inputs) 

621 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"]) 

622 return htc_commands 

623 

624 

def _report_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        The directory containing the submit side files (e.g., HTCondor files).

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
    if wms_workflow_id == MISSING_ID:
        run_reports = {}
    else:
        run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
    return run_reports, message


def _report_from_id(wms_workflow_id, hist):
    """Gather run information for a given run id.

    Parameters
    ----------
    wms_workflow_id : `int` or `str`
        Limit to specific run based on id.
    hist : `float`
        Limit history search to this many days.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    constraint = f"(DAGManJobId == {int(float(wms_workflow_id))} || ClusterId == " \
                 f"{int(float(wms_workflow_id))})"
    jobs = condor_q(constraint)
    if hist:
        epoch = (datetime.now() - timedelta(days=hist)).timestamp()
        constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
        hist_jobs = condor_history(constraint)
        _update_jobs(jobs, hist_jobs)

    # Keys in the dictionary will be strings of format "ClusterId.ProcId".
    wms_workflow_id = str(wms_workflow_id)
    if not wms_workflow_id.endswith(".0"):
        wms_workflow_id += ".0"

    if wms_workflow_id in jobs:
        _, path_jobs, message = _get_info_from_path(jobs[wms_workflow_id]["Iwd"])
        _update_jobs(jobs, path_jobs)
        run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
    else:
        run_reports = {}
        message = f"Found 0 records for run id {wms_workflow_id}"
    return run_reports, message

def _get_info_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        Directory containing HTCondor files.

    Returns
    -------
    wms_workflow_id : `str`
        The run id which is a DAGman job id.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Information about jobs read from files in the given directory.
        The key is the HTCondor id and the value is a dictionary of HTCondor
        keys and values.
    message : `str`
        Message to be printed with the summary report.
    """
    try:
        wms_workflow_id, jobs = read_dag_log(wms_path)
        _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
        _update_jobs(jobs, read_node_status(wms_path))
        _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)

        # Add more info for DAGman job
        job = jobs[wms_workflow_id]
        job.update(read_dag_status(wms_path))
        job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
        if "bps_run" not in job:
            _add_run_info(wms_path, job)

        message = htc_check_dagman_output(wms_path)
        _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id,
                   jobs[wms_workflow_id]["total_jobs"])
    except StopIteration:
        message = f"Could not find HTCondor files in {wms_path}"
        _LOG.warning(message)
        wms_workflow_id = MISSING_ID
        jobs = {}

    return wms_workflow_id, jobs, message

def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
    """Gather run information to be used in generating detailed reports.

    Parameters
    ----------
    wms_workflow_id : `str`
        HTCondor id of the DAGMan job for the run.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Mapping of HTCondor id to job information for the run.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the given HTCondor
        id and the value is a collection of report information for that run.
    """
    _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
    dag_job = jobs[wms_workflow_id]
    if "total_jobs" not in dag_job or "DAGNodeName" in dag_job:
        _LOG.error("Job ID %s is not a DAG job.", wms_workflow_id)
        return {}
    report = WmsRunReport(wms_id=wms_workflow_id,
                          path=dag_job["Iwd"],
                          label=dag_job.get("bps_job_label", "MISS"),
                          run=dag_job.get("bps_run", "MISS"),
                          project=dag_job.get("bps_project", "MISS"),
                          campaign=dag_job.get("bps_campaign", "MISS"),
                          payload=dag_job.get("bps_payload", "MISS"),
                          operator=_get_owner(dag_job),
                          run_summary=_get_run_summary(dag_job),
                          state=_htc_status_to_wms_state(dag_job),
                          jobs=[],
                          total_number_jobs=dag_job["total_jobs"],
                          job_state_counts=dag_job["state_counts"])

    try:
        for job in jobs.values():
            if job["ClusterId"] != int(float(wms_workflow_id)):
                job_report = WmsJobReport(wms_id=job["ClusterId"],
                                          name=job.get("DAGNodeName", str(job["ClusterId"])),
                                          label=job.get("bps_job_label",
                                                        pegasus_name_to_label(job["DAGNodeName"])),
                                          state=_htc_status_to_wms_state(job))
                if job_report.label == "init":
                    job_report.label = "pipetaskInit"
                report.jobs.append(job_report)
    except KeyError as ex:
        _LOG.error("Job missing key '%s': %s", str(ex), job)
        raise

    run_reports = {report.wms_id: report}
    _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
    return run_reports

def _summary_report(user, hist, pass_thru):
    """Gather run information to be used in generating summary reports.

    Parameters
    ----------
    user : `str`
        Run lookup restricted to given user.
    hist : `float`
        How many previous days to search for run information.
    pass_thru : `str`
        Advanced users can define the HTCondor constraint to be used
        when searching queue and history.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the summary report. The keys are HTCondor ids and
        the values are collections of report information for each run.
    message : `str`
        Message to be printed with the summary report.
    """
    # Only doing a summary report, so only look for DAGMan jobs.
    if pass_thru:
        constraint = pass_thru
    else:
        # Notes:
        # * bps_isjob == 'True' isn't getting set for DAG jobs that are
        #   manually restarted.
        # * Any job with DAGManJobID isn't a DAG job.
        constraint = 'bps_isjob == "True" && JobUniverse == 7'
        if user:
            constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'

    # Check runs in queue.
    jobs = condor_q(constraint)

    if hist:
        epoch = (datetime.now() - timedelta(days=hist)).timestamp()
        constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
        hist_jobs = condor_history(constraint)
        _update_jobs(jobs, hist_jobs)

    _LOG.debug("Job ids from queue and history %s", jobs.keys())

    # Have list of DAGMan jobs, need to get run_report info.
    run_reports = {}
    for job in jobs.values():
        total_jobs, state_counts = _get_state_counts_from_dag_job(job)
        # If the queue information was incomplete (e.g., Kerberos bug),
        # try reading from file.
        if total_jobs == 0:
            try:
                job.update(read_dag_status(job["Iwd"]))
                total_jobs, state_counts = _get_state_counts_from_dag_job(job)
            except StopIteration:
                pass  # Don't kill the report if the HTCondor files can't be found.

        if "bps_run" not in job:
            _add_run_info(job["Iwd"], job)
        report = WmsRunReport(wms_id=str(job.get("ClusterId", MISSING_ID)),
                              path=job["Iwd"],
                              label=job.get("bps_job_label", "MISS"),
                              run=job.get("bps_run", "MISS"),
                              project=job.get("bps_project", "MISS"),
                              campaign=job.get("bps_campaign", "MISS"),
                              payload=job.get("bps_payload", "MISS"),
                              operator=_get_owner(job),
                              run_summary=_get_run_summary(job),
                              state=_htc_status_to_wms_state(job),
                              jobs=[],
                              total_number_jobs=total_jobs,
                              job_state_counts=state_counts)

        run_reports[report.wms_id] = report

    return run_reports, ""

def _add_run_info(wms_path, job):
    """Find BPS run information elsewhere for runs without bps attributes.

    Parameters
    ----------
    wms_path : `str`
        Path to submit files for the run.
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Raises
    ------
    StopIteration
        If the file it is looking for cannot be found. Permission errors are
        caught and the job's run is marked with an error.
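
    Examples
    --------
    A sketch of the kind of line this function picks out of a ``*.sub`` file
    (the attribute value is invented); it ends up in the job dictionary as
    ``job["bps_run"] = "u_someuser_run1"``::

        +bps_run = "u_someuser_run1"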

883 """ 

884 path = Path(wms_path) / "jobs" 

885 try: 

886 jobdir = next(path.glob("*"), Path(wms_path)) 

887 try: 

888 subfile = next(jobdir.glob("*.sub")) 

889 _LOG.debug("_add_run_info: subfile = %s", subfile) 

890 with open(subfile, "r") as fh: 

891 for line in fh: 

892 if line.startswith("+bps_"): 

893 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line) 

894 if m: 

895 _LOG.debug("Matching line: %s", line) 

896 job[m.group(1)] = m.group(2).replace('"', "") 

897 else: 

898 _LOG.debug("Could not parse attribute: %s", line) 

899 except StopIteration: 

900 job["bps_run"] = "Missing" 

901 

902 except PermissionError: 

903 job["bps_run"] = "PermissionError" 

904 _LOG.debug("After adding job = %s", job) 

905 

906 

907def _get_owner(job): 

908 """Get the owner of a dag job. 

909 

910 Parameters 

911 ---------- 

912 job : `dict` [`str`, `Any`] 

913 HTCondor dag job information. 

914 

915 Returns 

916 ------- 

917 owner : `str` 

918 Owner of the dag job. 

919 """ 

920 owner = job.get("bps_operator", None) 

921 if not owner: 

922 owner = job.get("Owner", None) 

923 if not owner: 

924 _LOG.warning("Could not get Owner from htcondor job: %s", job) 

925 owner = "MISS" 

926 return owner 

927 

928 

929def _get_run_summary(job): 

930 """Get the run summary for a job. 

931 

932 Parameters 

933 ---------- 

934 job : `dict` [`str`, `Any`] 

935 HTCondor dag job information. 

936 

937 Returns 

938 ------- 

939 summary : `str` 

940 Number of jobs per PipelineTask label in approximate pipeline order. 

941 Format: <label>:<count>[;<label>:<count>]+ 
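
    Examples
    --------
    A run summary string looks roughly like this (labels and counts are
    invented)::

        "pipetaskInit:1;isr:10;characterizeImage:10;calibrate:10"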

942 """ 

943 summary = job.get("bps_run_summary", None) 

944 if not summary: 

945 summary, _ = summary_from_dag(job["Iwd"]) 

946 if not summary: 

947 _LOG.warning("Could not get run summary for htcondor job: %s", job) 

948 _LOG.debug("_get_run_summary: summary=%s", summary) 

949 

950 # Workaround sometimes using init vs pipetaskInit 

951 summary = summary.replace("init:", "pipetaskInit:") 

952 

953 if "pegasus_version" in job and "pegasus" not in summary: 

954 summary += ";pegasus:0" 

955 

956 return summary 

957 

958 

959def _get_state_counts_from_jobs(wms_workflow_id, jobs): 

960 """Count number of jobs per WMS state. 

961 

962 Parameters 

963 ---------- 

964 wms_workflow_id : `str` 

965 HTCondor job id. 

966 jobs : `dict` [`str`, `Any`] 

967 HTCondor dag job information. 

968 

969 Returns 

970 ------- 

971 total_count : `int` 

972 Total number of dag nodes. 

973 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

974 Keys are the different WMS states and values are counts of jobs 

975 that are in that WMS state. 
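
    Examples
    --------
    A small sketch, assuming the ``JobStatus`` values follow the standard
    HTCondor status codes (2 = running, 4 = completed); the ids are invented:

    >>> jobs = {"100.0": {"NodesTotal": 3},
    ...         "101.0": {"ClusterId": 101, "JobStatus": 4},
    ...         "102.0": {"ClusterId": 102, "JobStatus": 2}}
    >>> total, counts = _get_state_counts_from_jobs("100.0", jobs)
    >>> total, counts[WmsStates.SUCCEEDED], counts[WmsStates.RUNNING], counts[WmsStates.UNREADY]
    (3, 1, 1, 1)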

976 """ 

977 state_counts = dict.fromkeys(WmsStates, 0) 

978 

979 for jid, jinfo in jobs.items(): 

980 if jid != wms_workflow_id: 

981 state_counts[_htc_status_to_wms_state(jinfo)] += 1 

982 

983 total_counted = sum(state_counts.values()) 

984 if "NodesTotal" in jobs[wms_workflow_id]: 

985 total_count = jobs[wms_workflow_id]["NodesTotal"] 

986 else: 

987 total_count = total_counted 

988 

989 state_counts[WmsStates.UNREADY] += total_count - total_counted 

990 

991 return total_count, state_counts 

992 

993 

994def _get_state_counts_from_dag_job(job): 

995 """Count number of jobs per WMS state. 

996 

997 Parameters 

998 ---------- 

999 job : `dict` [`str`, `Any`] 

1000 HTCondor dag job information. 

1001 

1002 Returns 

1003 ------- 

1004 total_count : `int` 

1005 Total number of dag nodes. 

1006 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1007 Keys are the different WMS states and values are counts of jobs 

1008 that are in that WMS state. 

1009 """ 

1010 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job)) 

1011 state_counts = dict.fromkeys(WmsStates, 0) 

1012 if "DAG_NodesReady" in job: 

1013 state_counts = { 

1014 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0), 

1015 WmsStates.READY: job.get("DAG_NodesReady", 0), 

1016 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1017 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0), 

1018 WmsStates.FAILED: job.get("DAG_NodesFailed", 0), 

1019 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)} 

1020 total_jobs = job.get("DAG_NodesTotal") 

1021 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs) 

1022 elif "NodesFailed" in job: 

1023 state_counts = { 

1024 WmsStates.UNREADY: job.get("NodesUnready", 0), 

1025 WmsStates.READY: job.get("NodesReady", 0), 

1026 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1027 WmsStates.SUCCEEDED: job.get("NodesDone", 0), 

1028 WmsStates.FAILED: job.get("NodesFailed", 0), 

1029 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)} 

1030 try: 

1031 total_jobs = job.get("NodesTotal") 

1032 except KeyError as ex: 

1033 _LOG.error("Job missing %s. job = %s", str(ex), job) 

1034 raise 

1035 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs) 

1036 else: 

1037 # With Kerberos job auth and Kerberos bug, if warning would be printed 

1038 # for every DAG. 

1039 _LOG.debug("Can't get job state counts %s", job["Iwd"]) 

1040 total_jobs = 0 

1041 

1042 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts) 

1043 return total_jobs, state_counts 

1044 

1045 

def _htc_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `WmsStates`
        The equivalent WmsState to given job's status.
    """
    wms_state = WmsStates.MISFIT
    if "JobStatus" in job:
        wms_state = _htc_job_status_to_wms_state(job)
    elif "NodeStatus" in job:
        wms_state = _htc_node_status_to_wms_state(job)
    return wms_state

def _htc_job_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given job's status.
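
    Examples
    --------
    A rough sketch, assuming `JobStatus` mirrors the standard HTCondor status
    codes (1 = idle, 4 = completed, 5 = held); the cluster ids are invented::

        _htc_job_status_to_wms_state({"ClusterId": 1, "JobStatus": 1})
        # -> WmsStates.PENDING
        _htc_job_status_to_wms_state({"ClusterId": 2, "JobStatus": 4, "ExitCode": 1})
        # -> WmsStates.FAILED
        _htc_job_status_to_wms_state({"ClusterId": 3, "JobStatus": 5})
        # -> WmsStates.HELD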

1079 """ 

1080 _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], 

1081 type(job["JobStatus"])) 

1082 job_status = int(job["JobStatus"]) 

1083 wms_state = WmsStates.MISFIT 

1084 

1085 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status) 

1086 if job_status == JobStatus.IDLE: 

1087 wms_state = WmsStates.PENDING 

1088 elif job_status == JobStatus.RUNNING: 

1089 wms_state = WmsStates.RUNNING 

1090 elif job_status == JobStatus.REMOVED: 

1091 wms_state = WmsStates.DELETED 

1092 elif job_status == JobStatus.COMPLETED: 

1093 if job.get("ExitBySignal", False) or job.get("ExitCode", 0) or \ 

1094 job.get("ExitSignal", 0) or job.get("DAG_Status", 0) or \ 

1095 job.get("ReturnValue", 0): 

1096 wms_state = WmsStates.FAILED 

1097 else: 

1098 wms_state = WmsStates.SUCCEEDED 

1099 elif job_status == JobStatus.HELD: 

1100 wms_state = WmsStates.HELD 

1101 

1102 return wms_state 

1103 

1104 

def _htc_node_status_to_wms_state(job):
    """Convert HTCondor node status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given node's status.
    """
    wms_state = WmsStates.MISFIT

    status = job["NodeStatus"]
    if status == NodeStatus.NOT_READY:
        wms_state = WmsStates.UNREADY
    elif status == NodeStatus.READY:
        wms_state = WmsStates.READY
    elif status == NodeStatus.PRERUN:
        wms_state = WmsStates.MISFIT
    elif status == NodeStatus.SUBMITTED:
        if job["JobProcsHeld"]:
            wms_state = WmsStates.HELD
        elif job["StatusDetails"] == "not_idle":
            wms_state = WmsStates.RUNNING
        elif job["JobProcsQueued"]:
            wms_state = WmsStates.PENDING
    elif status == NodeStatus.POSTRUN:
        wms_state = WmsStates.MISFIT
    elif status == NodeStatus.DONE:
        wms_state = WmsStates.SUCCEEDED
    elif status == NodeStatus.ERROR:
        wms_state = WmsStates.FAILED

    return wms_state

def _update_jobs(jobs1, jobs2):
    """Update jobs1 with info in jobs2.

    (Basically an update for nested dictionaries.)

    Parameters
    ----------
    jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
        HTCondor job information to be updated.
    jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
        Additional HTCondor job information.
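
    Examples
    --------
    A small sketch of the nested update (the ids and keys are invented):

    >>> jobs1 = {"1.0": {"JobStatus": 2}}
    >>> jobs2 = {"1.0": {"Iwd": "/submit"}, "2.0": {"JobStatus": 1}}
    >>> _update_jobs(jobs1, jobs2)
    >>> jobs1 == {"1.0": {"JobStatus": 2, "Iwd": "/submit"}, "2.0": {"JobStatus": 1}}
    True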

1155 """ 

1156 for jid, jinfo in jobs2.items(): 

1157 if jid in jobs1: 

1158 jobs1[jid].update(jinfo) 

1159 else: 

1160 jobs1[jid] = jinfo 

1161 

1162 

1163def _wms_id_to_cluster(wms_id): 

1164 """Convert WMS ID to cluster ID. 

1165 

1166 Parameters 

1167 ---------- 

1168 wms_id : `int` or `float` or `str` 

1169 HTCondor job id or path. 

1170 

1171 Returns 

1172 ------- 

1173 cluster_id : `int` 

1174 HTCondor cluster id. 
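
    Examples
    --------
    Numeric ids (including "ClusterId.ProcId" strings) are truncated to the
    cluster id, and anything that is neither a number nor an existing path
    yields 0:

    >>> _wms_id_to_cluster("1234.0")
    1234
    >>> _wms_id_to_cluster("/no/such/path")
    0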

1175 """ 

1176 # If wms_id represents path, get numeric id. 

1177 try: 

1178 cluster_id = int(float(wms_id)) 

1179 except ValueError: 

1180 wms_path = Path(wms_id) 

1181 if wms_path.exists(): 

1182 try: 

1183 cluster_id, _ = read_dag_log(wms_id) 

1184 cluster_id = int(float(cluster_id)) 

1185 except StopIteration: 

1186 cluster_id = 0 

1187 else: 

1188 cluster_id = 0 

1189 return cluster_id