Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Interface between generic workflow to HTCondor workflow system. 

23""" 

24 

# Public API of this module.
__all__ = ["HTCondorService", "HTCondorWorkflow"]

26 

27 

28import os 

29import re 

30import logging 

31from datetime import datetime, timedelta 

32from pathlib import Path 

33 

34import htcondor 

35 

36from ... import ( 

37 BaseWmsWorkflow, 

38 BaseWmsService, 

39 GenericWorkflow, 

40 WmsRunReport, 

41 WmsJobReport, 

42 WmsStates 

43) 

44from ...bps_utils import chdir 

45from .lssthtc import ( 

46 HTCDag, 

47 HTCJob, 

48 MISSING_ID, 

49 JobStatus, 

50 NodeStatus, 

51 htc_check_dagman_output, 

52 htc_escape, 

53 htc_submit_dag, 

54 read_node_status, 

55 read_dag_log, 

56 read_dag_status, 

57 condor_q, 

58 condor_history, 

59 pegasus_name_to_label, 

60 summary_from_dag, 

61) 

62 

63 

# Module-level logger, named after this module for hierarchical configuration.
_LOG = logging.getLogger(__name__)

65 

66 

class HTCondorService(BaseWmsService):
    """HTCondor version of WMS service.
    """
    def prepare(self, config, generic_workflow, out_prefix=None):
        """Convert generic workflow to an HTCondor DAG ready for submission.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BPSConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The generic workflow (e.g., has executable name and arguments).
        out_prefix : `str`
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow`
            HTCondor workflow ready to be run.
        """
        _LOG.debug("out_prefix = '%s'", out_prefix)
        workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix,
                                                          f"{self.__class__.__module__}."
                                                          f"{self.__class__.__name__}")
        # Persist the DAG and submit files under out_prefix right away so the
        # returned workflow is submit-ready.
        workflow.write(out_prefix)
        return workflow

    def submit(self, workflow):
        """Submit a single HTCondor workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWorkflow`
            A single HTCondor workflow to submit.  run_id is updated after
            successful submission to WMS.
        """
        # For workflow portability, internal paths are all relative. Hence
        # the DAG needs to be submitted to HTCondor from inside the submit
        # directory.
        with chdir(workflow.submit_path):
            _LOG.info("Submitting from directory: %s", os.getcwd())
            htc_submit_dag(workflow.dag, dict())
            # htc_submit_dag fills in the DAG's run_id on success.
            workflow.run_id = workflow.dag.run_id

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions.  Typically
            this means top-level jobs (i.e., not children jobs).
        """
        _LOG.debug("list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s",
                   wms_id, user, require_bps, pass_thru)
        constraint = ""

        if wms_id is None:
            if user is not None:
                constraint = f'(Owner == "{user}")'
        else:
            # An id was given; match the DAGMan job itself and its children.
            cluster_id = _wms_id_to_cluster(wms_id)
            if cluster_id != 0:
                constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"

        if require_bps:
            constraint += ' && (bps_isjob == "True")'

        if pass_thru:
            if "-forcex" in pass_thru:
                # -forcex is only meaningful to cancel(); strip it before
                # using the remainder as an HTCondor constraint.
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f"&& ({pass_thru_2})"
            else:
                constraint += f" && ({pass_thru})"

        _LOG.debug("constraint = %s", constraint)
        jobs = condor_q(constraint)

        # Prune child jobs where DAG job is in queue (i.e., aren't orphans).
        job_ids = []
        for job_id, job_info in jobs.items():
            _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_info.get("DAGManJobId", "None"))
            if "DAGManJobId" not in job_info:  # orphaned job
                job_ids.append(job_id)
            else:
                # Keep a child job only if its parent DAGMan job is absent
                # from the query results.
                _LOG.debug("Looking for %s", f"{job_info['DAGManJobId']}.0")
                _LOG.debug("\tin jobs.keys() = %s", jobs.keys())
                if f"{job_info['DAGManJobId']}.0" not in jobs:
                    job_ids.append(job_id)

        _LOG.debug("job_ids = %s", job_ids)
        return job_ids

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None):
        """Return run information based upon given constraints.

        Parameters
        ----------
        wms_workflow_id : `str`
            Limit to specific run based on id.
        user : `str`
            Limit results to runs for this user.
        hist : `float`
            Limit history search to this many days.
        pass_thru : `str`
            Constraints to pass through to HTCondor.

        Returns
        -------
        runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
            Information about runs from given job information.
        message : `str`
            Extra message for report command to print.  This could be pointers
            to documentation or to WMS specific commands.
        """
        message = ""

        if wms_workflow_id:
            # Explicitly checking if wms_workflow_id can be converted to a
            # float instead of using try/except to avoid catching a different
            # ValueError from _report_from_id
            try:
                float(wms_workflow_id)
                is_float = True
            except ValueError:  # Don't need TypeError here as None goes to else branch.
                is_float = False

            if is_float:
                run_reports, message = _report_from_id(float(wms_workflow_id), hist)
            else:
                # A non-numeric id is treated as a submit directory path.
                run_reports, message = _report_from_path(wms_workflow_id)
        else:
            run_reports, message = _summary_report(user, hist, pass_thru)
        _LOG.debug("report: %s, %s", run_reports, message)

        return list(run_reports.values()), message

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            ID or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether successful deletion or not.  Currently, if any doubt or any
            individual jobs not deleted, return False.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        _LOG.debug("Canceling wms_id = %s", wms_id)

        cluster_id = _wms_id_to_cluster(wms_id)
        if cluster_id == 0:
            deleted = False
            message = "Invalid id"
        else:
            _LOG.debug("Canceling cluster_id = %s", cluster_id)
            schedd = htcondor.Schedd()
            constraint = f"ClusterId == {cluster_id}"
            if pass_thru is not None and "-forcex" in pass_thru:
                # RemoveX forces removal of jobs already in the removed (X)
                # state; strip the flag before using the rest as a constraint.
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f"&& ({pass_thru_2})"
                _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.RemoveX, constraint)
            else:
                if pass_thru:
                    constraint += f"&& ({pass_thru})"
                _LOG.debug("JobAction.Remove constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.Remove, constraint)
            _LOG.debug("Remove results: %s", results)

            # Success requires at least one removal and zero errors.
            if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
                deleted = True
                message = ""
            else:
                deleted = False
                if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
                    message = "no such bps job in batch queue"
                else:
                    message = f"unknown problems deleting: {results}"

        _LOG.debug("deleted: %s; message = %s", deleted, message)
        return deleted, message

273 

274 

class HTCondorWorkflow(BaseWmsWorkflow):
    """Single HTCondor workflow.

    Parameters
    ----------
    name : `str`
        Unique name for Workflow used when naming files.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """
    def __init__(self, name, config=None):
        super().__init__(name, config)
        # The HTCDag is built later by from_generic_workflow().
        self.dag = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited
        htc_workflow = cls(generic_workflow.name, config)
        htc_workflow.dag = HTCDag(name=generic_workflow.name)

        _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs({"bps_wms_service": service_class,
                                      "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}"})

        # Create all DAG jobs
        for job_name in generic_workflow:
            gwf_job = generic_workflow.get_job(job_name)
            htc_job = HTCondorWorkflow._create_job(generic_workflow, gwf_job, generic_workflow.run_attrs,
                                                   out_prefix)
            htc_workflow.dag.add_job(htc_job)

        # Add job dependencies to the DAG
        for job_name in generic_workflow:
            htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))
        return htc_workflow

    @staticmethod
    def _create_job(generic_workflow, gwf_job, run_attrs, out_prefix):
        """Convert GenericWorkflow job nodes to DAG jobs.

        Parameters
        ----------
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow that is being converted.
        gwf_job : `lsst.ctrl.bps.GenericWorkflowJob`
            The generic job to convert to a HTCondor job.
        run_attrs : `dict` [`str`, `str`]
            Attributes common to entire run that should be added to job.
        out_prefix : `str`
            Directory prefix for HTCondor files.

        Returns
        -------
        htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
            The HTCondor job equivalent to the given generic job.
        """
        htc_job = HTCJob(gwf_job.name, label=gwf_job.label)

        htc_job_cmds = {
            "universe": "vanilla",
            "should_transfer_files": "YES",
            "when_to_transfer_output": "ON_EXIT_OR_EVICT",
            "transfer_executable": "False",
            "getenv": "True",
        }

        htc_job_cmds.update(_translate_job_cmds(generic_workflow, gwf_job))

        # Job stdout, stderr, htcondor user log paths, e.g.
        # jobs/<label>/<name>.$(Cluster).out.  (The original also assigned
        # each key once before this loop; those were dead stores, immediately
        # overwritten here, and have been removed.)
        for key in ("output", "error", "log"):
            htc_job_cmds[key] = f"{gwf_job.name}.$(Cluster).{key[:3]}"
            if gwf_job.label:
                htc_job_cmds[key] = os.path.join(gwf_job.label, htc_job_cmds[key])
            htc_job_cmds[key] = os.path.join("jobs", htc_job_cmds[key])
            _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])

        htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwf_job.name, out_prefix))

        # Add the job cmds dict to the job object.
        htc_job.add_job_cmds(htc_job_cmds)

        # Add run level attributes to job.
        htc_job.add_job_attrs(run_attrs)

        # Add job attributes to job.
        _LOG.debug("gwf_job.attrs = %s", gwf_job.attrs)
        htc_job.add_job_attrs(gwf_job.attrs)
        htc_job.add_job_attrs({"bps_job_name": gwf_job.name,
                               "bps_job_label": gwf_job.label,
                               "bps_job_quanta": gwf_job.tags.get("quanta_summary", "")})

        return htc_job

    def write(self, out_prefix):
        """Output HTCondor DAGMan files needed for workflow submission.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for HTCondor files.
        """
        self.submit_path = out_prefix
        os.makedirs(out_prefix, exist_ok=True)

        # Write down the workflow in HTCondor format.
        # NOTE(review): the braces are deliberate — this is a per-job subdir
        # template formatted downstream, not an f-string; confirm against
        # HTCDag.write before "fixing".
        self.dag.write(out_prefix, "jobs/{self.label}")

385 

386 

def _translate_job_cmds(generic_workflow, generic_workflow_job):
    """Translate the job data that map one-to-one onto HTCondor commands.

    Parameters
    ----------
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains the job being converted.
    generic_workflow_job : `lsst.ctrl.bps.GenericWorkflowJob`
        Generic workflow job to be converted.

    Returns
    -------
    htc_job_commands : `dict` [`str`, `Any`]
        Contains commands which can appear in the HTCondor submit description
        file.

    Raises
    ------
    AttributeError
        If the job has no usable cmdline (e.g., it is None).
    """
    jobcmds = {}

    if generic_workflow_job.mail_to:
        jobcmds["notify_user"] = generic_workflow_job.mail_to

    if generic_workflow_job.when_to_mail:
        jobcmds["notification"] = generic_workflow_job.when_to_mail

    if generic_workflow_job.request_cpus:
        jobcmds["request_cpus"] = generic_workflow_job.request_cpus

    if generic_workflow_job.request_disk:
        jobcmds["request_disk"] = f"{generic_workflow_job.request_disk}MB"

    if generic_workflow_job.request_memory:
        jobcmds["request_memory"] = f"{generic_workflow_job.request_memory}MB"

    if generic_workflow_job.priority:
        jobcmds["priority"] = generic_workflow_job.priority

    try:
        # Split executable from the rest of the command line.
        cmd_parts = generic_workflow_job.cmdline.split(" ", 1)
    except AttributeError:
        # Bug fix: use the module logger instead of print() so the failing
        # job is recorded in the normal log stream before re-raising.
        _LOG.error("Could not parse cmdline for job: %s", generic_workflow_job)
        raise
    jobcmds["executable"] = _fix_env_var_syntax(cmd_parts[0])
    if len(cmd_parts) > 1:
        # (The original first stored the raw arguments, then immediately
        # overwrote them with the processed version; the dead store is gone.)
        arguments = cmd_parts[1]
        arguments = _replace_cmd_vars(arguments, generic_workflow_job)
        arguments = _replace_file_vars(arguments, generic_workflow, generic_workflow_job)
        jobcmds["arguments"] = _fix_env_var_syntax(arguments)

    # Add extra "pass-thru" job commands
    if generic_workflow_job.profile:
        for key, val in generic_workflow_job.profile.items():
            jobcmds[key] = htc_escape(val)

    return jobcmds

446 

447 

448def _fix_env_var_syntax(oldstr): 

449 """Change ENV place holders to HTCondor Env var syntax. 

450 

451 Parameters 

452 ---------- 

453 oldstr : `str` 

454 String in which environment variable syntax is to be fixed. 

455 

456 Returns 

457 ------- 

458 newstr : `str` 

459 Given string with environment variable syntax fixed. 

460 """ 

461 newstr = oldstr 

462 for key in re.findall(r"<ENV:([^>]+)>", oldstr): 

463 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})") 

464 return newstr 

465 

466 

467def _replace_file_vars(arguments, workflow, gwjob): 

468 """Replace file placeholders in command line arguments with correct 

469 physical file names. 

470 

471 Parameters 

472 ---------- 

473 arguments : `str` 

474 Arguments string in which to replace file placeholders. 

475 workflow : `lsst.ctrl.bps.GenericWorkflow` 

476 Generic workflow that contains file information. 

477 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

478 The job corresponding to the arguments. 

479 

480 Returns 

481 ------- 

482 arguments : `str` 

483 Given arguments string with file placeholders replaced. 

484 """ 

485 # Replace input file placeholders with paths. 

486 for gwfile in workflow.get_job_inputs(gwjob.name): 

487 if gwfile.wms_transfer: 

488 uri = os.path.basename(gwfile.src_uri) 

489 else: 

490 uri = gwfile.src_uri 

491 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

492 

493 # Replace input file placeholders with paths. 

494 for gwfile in workflow.get_job_outputs(gwjob.name): 

495 if gwfile.wms_transfer: 

496 uri = os.path.basename(gwfile.src_uri) 

497 else: 

498 uri = gwfile.src_uri 

499 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

500 return arguments 

501 

502 

503def _replace_cmd_vars(arguments, gwjob): 

504 """Replace format-style placeholders in arguments. 

505 

506 Parameters 

507 ---------- 

508 arguments : `str` 

509 Arguments string in which to replace placeholders. 

510 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

511 Job containing values to be used to replace placeholders 

512 (in particular gwjob.cmdvals). 

513 

514 Returns 

515 ------- 

516 arguments : `str` 

517 Given arguments string with placeholders replaced. 

518 """ 

519 arguments = arguments.format(**gwjob.cmdvals) 

520 return arguments 

521 

522 

def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, out_prefix):
    """Add job input files from generic workflow to job.

    Parameters
    ----------
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        The generic workflow (e.g., has executable name and arguments).
    job_name : `str`
        Unique name for the job.
    out_prefix : `str`
        The root directory into which all WMS-specific files are written.

    Returns
    -------
    htc_commands : `dict` [`str`, `str`]
        HTCondor commands for the job submission.
    """
    # Submit-side paths are kept relative to the submit directory so the
    # workflow stays portable.
    inputs = [os.path.relpath(gwf_file.src_uri, out_prefix)
              for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True)]

    htc_commands = {}
    if inputs:
        htc_commands["transfer_input_files"] = ",".join(inputs)
        _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
    return htc_commands

549 

550 

def _report_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        The directory containing the submit side files (e.g., HTCondor files).

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report.  The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
    # MISSING_ID means no usable HTCondor files were found under wms_path.
    if wms_workflow_id == MISSING_ID:
        return {}, message
    return _create_detailed_report_from_jobs(wms_workflow_id, jobs), message

573 

574 

def _report_from_id(wms_workflow_id, hist):
    """Gather run information for a specific run id.

    Parameters
    ----------
    wms_workflow_id : `int` or `str`
        Limit to specific run based on id.
    hist : `float`
        Limit history search to this many days.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report.  The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    # Match both the DAGMan job itself (ClusterId) and its children
    # (DAGManJobId) for the given id.
    constraint = f"(DAGManJobId == {int(float(wms_workflow_id))} || ClusterId == " \
                 f"{int(float(wms_workflow_id))})"
    jobs = condor_q(constraint)
    if hist:
        # Also pull matching jobs that already left the queue.
        epoch = (datetime.now() - timedelta(days=hist)).timestamp()
        constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
        hist_jobs = condor_history(constraint)
        _update_jobs(jobs, hist_jobs)

    # keys in dictionary will be strings of format "ClusterId.ProcId"
    wms_workflow_id = str(wms_workflow_id)
    if not wms_workflow_id.endswith(".0"):
        wms_workflow_id += ".0"

    if wms_workflow_id in jobs:
        # Supplement queue/history data with what the submit directory holds.
        _, path_jobs, message = _get_info_from_path(jobs[wms_workflow_id]["Iwd"])
        _update_jobs(jobs, path_jobs)
        run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
    else:
        run_reports = {}
        message = f"Found 0 records for run id {wms_workflow_id}"
    return run_reports, message

615 

616 

def _get_info_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        Directory containing HTCondor files.

    Returns
    -------
    wms_workflow_id : `str`
        The run id which is a DAGman job id (``MISSING_ID`` if the HTCondor
        files could not be found).
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Information about jobs read from files in the given directory.
        The key is the HTCondor id and the value is a dictionary of HTCondor
        keys and values.
    message : `str`
        Message to be printed with the summary report.
    """
    try:
        # The read_* helpers signal missing files via StopIteration, handled
        # below as "no usable HTCondor files in this directory".
        wms_workflow_id, jobs = read_dag_log(wms_path)
        _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
        _update_jobs(jobs, read_node_status(wms_path))
        _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)

        # Add more info for DAGman job
        job = jobs[wms_workflow_id]
        job.update(read_dag_status(wms_path))
        job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
        if "bps_run" not in job:
            # Fall back to scraping bps_* attributes from submit files.
            _add_run_info(wms_path, job)

        message = htc_check_dagman_output(wms_path)
        _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id,
                   jobs[wms_workflow_id]["total_jobs"])
    except StopIteration:
        message = f"Could not find HTCondor files in {wms_path}"
        _LOG.warning(message)
        wms_workflow_id = MISSING_ID
        jobs = {}

    return wms_workflow_id, jobs, message

659 

660 

def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
    """Gather run information to be used in generating a detailed report.

    Parameters
    ----------
    wms_workflow_id : `str`
        HTCondor id of the run's DAGMan job ("ClusterId.ProcId").
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Mapping of HTCondor id to job information; must contain an entry for
        ``wms_workflow_id`` along with the run's payload jobs.
        (The previous docstring described these parameters incorrectly.)

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report.  The key is the given HTCondor
        id and the value is a collection of report information for that run.
    """
    _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
    dag_job = jobs[wms_workflow_id]
    # A DAGMan job carries node totals and has no DAGNodeName of its own.
    if "total_jobs" not in dag_job or "DAGNodeName" in dag_job:
        _LOG.error("Job ID %s is not a DAG job.", wms_workflow_id)
        return {}
    report = WmsRunReport(wms_id=wms_workflow_id,
                          path=dag_job["Iwd"],
                          label=dag_job.get("bps_job_label", "MISS"),
                          run=dag_job.get("bps_run", "MISS"),
                          project=dag_job.get("bps_project", "MISS"),
                          campaign=dag_job.get("bps_campaign", "MISS"),
                          payload=dag_job.get("bps_payload", "MISS"),
                          operator=_get_owner(dag_job),
                          run_summary=_get_run_summary(dag_job),
                          state=_htc_status_to_wms_state(dag_job),
                          jobs=[],
                          total_number_jobs=dag_job["total_jobs"],
                          job_state_counts=dag_job["state_counts"])

    try:
        for job in jobs.values():
            # Skip the DAGMan job itself; report every payload job.
            if job["ClusterId"] != int(float(wms_workflow_id)):
                job_report = WmsJobReport(wms_id=job["ClusterId"],
                                          name=job.get("DAGNodeName", str(job["ClusterId"])),
                                          label=job.get("bps_job_label",
                                                        pegasus_name_to_label(job["DAGNodeName"])),
                                          state=_htc_status_to_wms_state(job))
                if job_report.label == "init":
                    job_report.label = "pipetaskInit"
                report.jobs.append(job_report)
    except KeyError as ex:
        _LOG.error("Job missing key '%s': %s", str(ex), job)
        raise

    run_reports = {report.wms_id: report}
    _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
    return run_reports

714 

715 

def _summary_report(user, hist, pass_thru):
    """Gather run information to be used in generating summary reports.

    Parameters
    ----------
    user : `str`
        Run lookup restricted to given user.
    hist : `float`
        How many previous days to search for run information.
    pass_thru : `str`
        Advanced users can define the HTCondor constraint to be used
        when searching queue and history.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the summary report.  The keys are HTCondor ids and
        the values are collections of report information for each run.
    message : `str`
        Message to be printed with the summary report.
    """
    # only doing summary report so only look for dagman jobs
    if pass_thru:
        constraint = pass_thru
    else:
        # Notes:
        # * bps_isjob == 'True' isn't getting set for DAG jobs that are
        #   manually restarted.
        # * Any job with DAGManJobID isn't a DAG job
        # JobUniverse == 7 selects scheduler-universe (DAGMan) jobs.
        constraint = 'bps_isjob == "True" && JobUniverse == 7'
        if user:
            constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'

    # Check runs in queue.
    jobs = condor_q(constraint)

    if hist:
        # Also gather matching DAGMan jobs that already left the queue.
        epoch = (datetime.now() - timedelta(days=hist)).timestamp()
        constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
        hist_jobs = condor_history(constraint)
        _update_jobs(jobs, hist_jobs)

    _LOG.debug("Job ids from queue and history %s", jobs.keys())

    # Have list of DAGMan jobs, need to get run_report info.
    run_reports = {}
    for job in jobs.values():
        total_jobs, state_counts = _get_state_counts_from_dag_job(job)
        # If didn't get from queue information (e.g., Kerberos bug),
        # try reading from file.
        if total_jobs == 0:
            try:
                job.update(read_dag_status(job["Iwd"]))
                total_jobs, state_counts = _get_state_counts_from_dag_job(job)
            except StopIteration:
                pass  # don't kill the report if it can't find htcondor files

        if "bps_run" not in job:
            _add_run_info(job["Iwd"], job)
        report = WmsRunReport(wms_id=job.get("ClusterId", MISSING_ID),
                              path=job["Iwd"],
                              label=job.get("bps_job_label", "MISS"),
                              run=job.get("bps_run", "MISS"),
                              project=job.get("bps_project", "MISS"),
                              campaign=job.get("bps_campaign", "MISS"),
                              payload=job.get("bps_payload", "MISS"),
                              operator=_get_owner(job),
                              run_summary=_get_run_summary(job),
                              state=_htc_status_to_wms_state(job),
                              jobs=[],
                              total_number_jobs=total_jobs,
                              job_state_counts=state_counts)

        run_reports[report.wms_id] = report

    return run_reports, ""

792 

793 

def _add_run_info(wms_path, job):
    """Find BPS run information elsewhere for runs without bps attributes.

    Parameters
    ----------
    wms_path : `str`
        Path to submit files for the run.
    job : `dict` [`str`, `Any`]
        HTCondor dag job information; updated in place with any ``bps_*``
        attributes found.  Missing submit files mark ``bps_run`` as
        "Missing"; permission problems mark it "PermissionError".

    Notes
    -----
    NOTE(review): an earlier docstring claimed StopIteration could propagate;
    in the code below the inner ``next`` is caught and the outer ``next`` has
    a default, so no StopIteration escapes from this visible code.
    """
    path = Path(wms_path) / "jobs"
    try:
        # Any job subdirectory will do; fall back to the run directory itself.
        jobdir = next(path.glob("*"), Path(wms_path))
        try:
            # Scrape +bps_* classad attributes from the first submit file.
            subfile = next(jobdir.glob("*.sub"))
            _LOG.debug("_add_run_info: subfile = %s", subfile)
            with open(subfile, "r") as fh:
                for line in fh:
                    if line.startswith("+bps_"):
                        m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
                        if m:
                            _LOG.debug("Matching line: %s", line)
                            job[m.group(1)] = m.group(2).replace('"', "")
                        else:
                            _LOG.debug("Could not parse attribute: %s", line)
        except StopIteration:
            job["bps_run"] = "Missing"

    except PermissionError:
        job["bps_run"] = "PermissionError"
    _LOG.debug("After adding job = %s", job)

831 

832 

833def _get_owner(job): 

834 """Get the owner of a dag job. 

835 

836 Parameters 

837 ---------- 

838 job : `dict` [`str`, `Any`] 

839 HTCondor dag job information. 

840 

841 Returns 

842 ------- 

843 owner : `str` 

844 Owner of the dag job. 

845 """ 

846 owner = job.get("bps_operator", None) 

847 if not owner: 

848 owner = job.get("Owner", None) 

849 if not owner: 

850 _LOG.warning("Could not get Owner from htcondor job: %s", job) 

851 owner = "MISS" 

852 return owner 

853 

854 

def _get_run_summary(job):
    """Get the run summary for a job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    summary : `str`
        Number of jobs per PipelineTask label in approximate pipeline order.
        Format: <label>:<count>[;<label>:<count>]+
    """
    summary = job.get("bps_run_summary", None)
    if not summary:
        # Not recorded in the classad; fall back to parsing the DAG on disk.
        summary, _ = summary_from_dag(job["Iwd"])
        if not summary:
            _LOG.warning("Could not get run summary for htcondor job: %s", job)
    _LOG.debug("_get_run_summary: summary=%s", summary)

    # Workaround sometimes using init vs pipetaskInit
    summary = summary.replace("init:", "pipetaskInit:")

    # Pegasus-managed runs get a placeholder entry in the summary.
    if "pegasus_version" in job and "pegasus" not in summary:
        summary += ";pegasus:0"

    return summary

883 

884 

def _get_state_counts_from_jobs(wms_workflow_id, jobs):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    wms_workflow_id : `str`
        HTCondor job id of the DAGMan job.
    jobs : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    state_counts = dict.fromkeys(WmsStates, 0)

    # Tally every job except the DAGMan job itself.
    for job_id, job_info in jobs.items():
        if job_id == wms_workflow_id:
            continue
        state_counts[_htc_status_to_wms_state(job_info)] += 1

    tallied = sum(state_counts.values())
    # Trust the DAG's own node total when available; otherwise use the tally.
    total_count = jobs[wms_workflow_id].get("NodesTotal", tallied)

    # Nodes the DAG knows about that never appeared in the logs are UNREADY.
    state_counts[WmsStates.UNREADY] += total_count - tallied

    return total_count, state_counts

918 

919 

def _get_state_counts_from_dag_job(job):
    """Count number of jobs per WMS state using a DAGMan job's information.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_jobs : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
    state_counts = dict.fromkeys(WmsStates, 0)
    if "DAG_NodesReady" in job:
        # Node counts reported by a DAGMan job still in the queue.
        state_counts = {
            WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
            WmsStates.READY: job.get("DAG_NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
            WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
            WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)}
        total_jobs = job.get("DAG_NodesTotal")
        _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
    elif "NodesFailed" in job:
        # Node counts read back from the dag status file.
        state_counts = {
            WmsStates.UNREADY: job.get("NodesUnready", 0),
            WmsStates.READY: job.get("NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("NodesDone", 0),
            WmsStates.FAILED: job.get("NodesFailed", 0),
            WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)}
        try:
            # Bug fix: the original called job.get("NodesTotal"), which never
            # raises KeyError, making this handler dead code and silently
            # yielding None; subscripting restores the intended
            # "log then re-raise" behavior for a missing key.
            total_jobs = job["NodesTotal"]
        except KeyError as ex:
            _LOG.error("Job missing %s. job = %s", str(ex), job)
            raise
        _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
    else:
        # With Kerberos job auth and Kerberos bug, if warning would be printed
        # for every DAG.
        _LOG.debug("Can't get job state counts %s", job["Iwd"])
        total_jobs = 0

    _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
    return total_jobs, state_counts

970 

971 

def _htc_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `WmsStates`
        The equivalent WmsState to given job's status.
    """
    # Prefer the job-status mapping when available; otherwise try the
    # node-status mapping.
    if "JobStatus" in job:
        return _htc_job_status_to_wms_state(job)
    if "NodeStatus" in job:
        return _htc_node_status_to_wms_state(job)
    # Neither status key is present; the job cannot be classified.
    return WmsStates.MISFIT

991 

992 

def _htc_job_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given job's status.
    """
    _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"],
               type(job["JobStatus"]))
    job_status = int(job["JobStatus"])
    _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)

    if job_status == JobStatus.COMPLETED:
        # A completed job counts as failed if any exit/status indicator
        # is set or nonzero.
        failure_keys = ("ExitBySignal", "ExitCode", "ExitSignal", "DAG_Status", "ReturnValue")
        if any(job.get(key, 0) for key in failure_keys):
            return WmsStates.FAILED
        return WmsStates.SUCCEEDED

    # Direct one-to-one translations; anything unrecognized is MISFIT.
    status_map = {
        JobStatus.IDLE: WmsStates.PENDING,
        JobStatus.RUNNING: WmsStates.RUNNING,
        JobStatus.REMOVED: WmsStates.DELETED,
        JobStatus.HELD: WmsStates.HELD,
    }
    return status_map.get(job_status, WmsStates.MISFIT)

1029 

1030 

def _htc_node_status_to_wms_state(job):
    """Convert HTCondor status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given node's status.
    """
    status = job["NodeStatus"]

    if status == NodeStatus.SUBMITTED:
        # A submitted node may be held, running, or queued; anything else
        # is a misfit.
        if job["JobProcsHeld"]:
            return WmsStates.HELD
        if job["StatusDetails"] == "not_idle":
            return WmsStates.RUNNING
        if job["JobProcsQueued"]:
            return WmsStates.PENDING
        return WmsStates.MISFIT

    # Direct translations for the remaining node statuses; PRERUN and
    # POSTRUN have no generic equivalent and map to MISFIT, as does any
    # unrecognized status.
    status_map = {
        NodeStatus.NOT_READY: WmsStates.UNREADY,
        NodeStatus.READY: WmsStates.READY,
        NodeStatus.PRERUN: WmsStates.MISFIT,
        NodeStatus.POSTRUN: WmsStates.MISFIT,
        NodeStatus.DONE: WmsStates.SUCCEEDED,
        NodeStatus.ERROR: WmsStates.FAILED,
    }
    return status_map.get(status, WmsStates.MISFIT)

1068 

1069 

1070def _update_jobs(jobs1, jobs2): 

1071 """Update jobs1 with info in jobs2. 

1072 

1073 (Basically an update for nested dictionaries.) 

1074 

1075 Parameters 

1076 ---------- 

1077 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]] 

1078 HTCondor job information to be updated. 

1079 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]] 

1080 Additional HTCondor job information. 

1081 """ 

1082 for jid, jinfo in jobs2.items(): 

1083 if jid in jobs1: 

1084 jobs1[jid].update(jinfo) 

1085 else: 

1086 jobs1[jid] = jinfo 

1087 

1088 

1089def _wms_id_to_cluster(wms_id): 

1090 """Convert WMS ID to cluster ID. 

1091 

1092 Parameters 

1093 ---------- 

1094 wms_id : `int` or `float` or `str` 

1095 HTCondor job id or path. 

1096 

1097 Returns 

1098 ------- 

1099 cluster_id : `int` 

1100 HTCondor cluster id. 

1101 """ 

1102 # If wms_id represents path, get numeric id. 

1103 try: 

1104 cluster_id = int(float(wms_id)) 

1105 except ValueError: 

1106 wms_path = Path(wms_id) 

1107 if wms_path.exists(): 

1108 try: 

1109 cluster_id, _ = read_dag_log(wms_id) 

1110 cluster_id = int(float(cluster_id)) 

1111 except StopIteration: 

1112 cluster_id = 0 

1113 else: 

1114 cluster_id = 0 

1115 return cluster_id