Coverage for python/lsst/ctrl/bps/wms/htcondor/htcondor_service.py: 1%


659 statements  

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Interface between generic workflow to HTCondor workflow system. 

23""" 

24 

25__all__ = ["HTCondorService", "HTCondorWorkflow"] 

26 

27 

28import os 

29import re 

30import logging 

31from enum import IntEnum, auto 

32from pathlib import Path 

33from collections import defaultdict 

34 

35import htcondor 

36from packaging import version 

37 

38from lsst.utils.timer import time_this 

39from ... import ( 

40 BaseWmsWorkflow, 

41 BaseWmsService, 

42 GenericWorkflow, 

43 GenericWorkflowJob, 

44 WmsRunReport, 

45 WmsJobReport, 

46 WmsStates 

47) 

48from ...bps_utils import ( 

49 chdir, 

50 create_count_summary 

51) 

52from .lssthtc import ( 

53 HTCDag, 

54 HTCJob, 

55 MISSING_ID, 

56 JobStatus, 

57 NodeStatus, 

58 htc_backup_files, 

59 htc_check_dagman_output, 

60 htc_create_submit_from_cmd, 

61 htc_create_submit_from_dag, 

62 htc_create_submit_from_file, 

63 htc_escape, 

64 htc_submit_dag, 

65 htc_version, 

66 read_dag_info, 

67 read_dag_log, 

68 read_dag_status, 

69 read_node_status, 

70 write_dag_info, 

71 condor_q, 

72 condor_search, 

73 condor_status, 

74 pegasus_name_to_label, 

75 summary_from_dag, 

76) 

77 

78 

79class WmsIdType(IntEnum): 

80 """Type of valid WMS ids. 

81 """ 

82 

83 UNKNOWN = auto() 

84 """The type of id cannot be determined. 

85 """ 

86 

87 LOCAL = auto() 

88 """The id is HTCondor job's ClusterId (with optional '.ProcId'). 

89 """ 

90 

91 GLOBAL = auto() 

92 """Id is a HTCondor's global job id. 

93 """ 

94 

95 PATH = auto() 

96 """Id is a submission path. 

97 """ 

98 

99 

100DEFAULT_HTC_EXEC_PATT = ".*worker.*" 

101"""Default pattern for searching execute machines in an HTCondor pool. 

102""" 

103 

104_LOG = logging.getLogger(__name__) 

105 

106 

107class HTCondorService(BaseWmsService): 

108 """HTCondor version of WMS service. 

109 """ 

110 def prepare(self, config, generic_workflow, out_prefix=None): 

111 """Convert generic workflow to an HTCondor DAG ready for submission. 

112 

113 Parameters 

114 ---------- 

115 config : `lsst.ctrl.bps.BpsConfig` 

116 BPS configuration that includes necessary submit/runtime 

117 information. 

118 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

119 The generic workflow (e.g., has executable name and arguments). 

120 out_prefix : `str` 

121 The root directory into which all WMS-specific files are written. 

122 

123 Returns 

124 ------- 

125 workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow` 

126 HTCondor workflow ready to be run. 

127 """ 

128 _LOG.debug("out_prefix = '%s'", out_prefix) 

129 with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed HTCondor workflow creation"): 

130 workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix, 

131 f"{self.__class__.__module__}." 

132 f"{self.__class__.__name__}") 

133 

134 with time_this(log=_LOG, level=logging.INFO, prefix=None, 

135 msg="Completed writing out HTCondor workflow"): 

136 workflow.write(out_prefix) 

137 return workflow 
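# A minimal usage sketch (added for illustration, not part of the original
# module), assuming a BpsConfig and GenericWorkflow have already been built
# by the rest of ctrl_bps; names and paths below are hypothetical:
#
#     service = HTCondorService(config)
#     workflow = service.prepare(config, generic_workflow, out_prefix="submit/run1")
#     service.submit(workflow)   # on success, workflow.run_id holds "ClusterId.ProcId"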

138 

139 def submit(self, workflow): 

140 """Submit a single HTCondor workflow. 

141 

142 Parameters 

143 ---------- 

144 workflow : `lsst.ctrl.bps.BaseWorkflow` 

145 A single HTCondor workflow to submit. run_id is updated after 

146 successful submission to WMS. 

147 """ 

148 dag = workflow.dag 

149 

150 ver = version.parse(htc_version()) 

151 if ver >= version.parse("8.9.3"): 

152 sub = htc_create_submit_from_dag(dag.graph["dag_filename"], {}) 

153 else: 

154 sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], {}) 

155 

156 # For workflow portability, internal paths are all relative. Hence 

157 # the DAG needs to be submitted to HTCondor from inside the submit 

158 # directory. 

159 with chdir(workflow.submit_path): 

160 _LOG.info("Submitting from directory: %s", os.getcwd()) 

161 schedd_dag_info = htc_submit_dag(sub) 

162 if schedd_dag_info: 

163 write_dag_info(f"{dag.name}.info.json", schedd_dag_info) 

164 

165 _, dag_info = schedd_dag_info.popitem() 

166 _, dag_ad = dag_info.popitem() 

167 

168 dag.run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}" 

169 workflow.run_id = dag.run_id 

170 else: 

171 raise RuntimeError("Submission failed: unable to retrieve DAGMan job information") 

172 

173 def restart(self, wms_workflow_id): 

174 """Restart a failed DAGMan workflow. 

175 

176 Parameters 

177 ---------- 

178 wms_workflow_id : `str` 

179 The directory with HTCondor files. 

180 

181 Returns 

182 ------- 

183 run_id : `str` 

184 HTCondor id of the restarted DAGMan job. If restart failed, it will 

185 be set to None. 

186 run_name : `str` 

187 Name of the restarted workflow. If restart failed, it will be set 

188 to None. 

189 message : `str` 

190 A message describing any issues encountered during the restart. 

191 If there were no issues, an empty string is returned. 

192 """ 

193 wms_path = Path(wms_workflow_id) 

194 if not wms_path.is_dir(): 

195 return None, None, f"Directory '{wms_path}' not found" 

196 

197 _LOG.info("Restarting workflow from directory '%s'", wms_path) 

198 rescue_dags = list(wms_path.glob("*.dag.rescue*")) 

199 if not rescue_dags: 

200 return None, None, f"HTCondor rescue DAG(s) not found in '{wms_path}'" 

201 

202 _LOG.info("Verifying that the workflow is not already in the job queue") 

203 schedd_dag_info = condor_q(constraint=f'regexp("dagman$", Cmd) && Iwd == "{wms_workflow_id}"') 

204 if schedd_dag_info: 

205 _, dag_info = schedd_dag_info.popitem() 

206 _, dag_ad = dag_info.popitem() 

207 id_ = dag_ad["GlobalJobId"] 

208 return None, None, f"Workflow already in the job queue (global job id: '{id_}')" 

209 

210 _LOG.info("Checking execution status of the workflow") 

211 warn = False 

212 dag_ad = read_dag_status(str(wms_path)) 

213 if dag_ad: 

214 nodes_total = dag_ad.get("NodesTotal", 0) 

215 if nodes_total != 0: 

216 nodes_done = dag_ad.get("NodesDone", 0) 

217 if nodes_total == nodes_done: 

218 return None, None, "All jobs in the workflow finished successfully" 

219 else: 

220 warn = True 

221 else: 

222 warn = True 

223 if warn: 

224 _LOG.warning("Cannot determine the execution status of the workflow, " 

225 "continuing with restart regardless") 

226 

227 _LOG.info("Backing up select HTCondor files from previous run attempt") 

228 htc_backup_files(wms_path, subdir='backups') 

229 

230 # For workflow portability, internal paths are all relative. Hence 

231 # the DAG needs to be resubmitted to HTCondor from inside the submit 

232 # directory. 

233 _LOG.info("Adding workflow to the job queue") 

234 run_id, run_name, message = None, None, "" 

235 with chdir(wms_path): 

236 try: 

237 dag_path = next(wms_path.glob('*.dag.condor.sub')) 

238 except StopIteration: 

239 message = f"DAGMan submit description file not found in '{wms_path}'" 

240 else: 

241 sub = htc_create_submit_from_file(dag_path.name) 

242 schedd_dag_info = htc_submit_dag(sub) 

243 

244 # Save select information about the DAGMan job to a file. Use 

245 # the run name (available in the ClassAd) as the filename. 

246 if schedd_dag_info: 

247 dag_info = next(iter(schedd_dag_info.values())) 

248 dag_ad = next(iter(dag_info.values())) 

249 write_dag_info(f"{dag_ad['bps_run']}.info.json", schedd_dag_info) 

250 run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}" 

251 run_name = dag_ad["bps_run"] 

252 else: 

253 message = "DAGMan job information unavailable" 

254 

255 return run_id, run_name, message 

256 

257 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False): 

258 """Query WMS for list of submitted WMS workflows/jobs. 

259 

260 This should be a quick lookup function to create list of jobs for 

261 other functions. 

262 

263 Parameters 

264 ---------- 

265 wms_id : `int` or `str`, optional 

266 Id or path that can be used by WMS service to look up job. 

267 user : `str`, optional 

268 User whose submitted jobs should be listed. 

269 require_bps : `bool`, optional 

270 Whether to require jobs returned in list to be bps-submitted jobs. 

271 pass_thru : `str`, optional 

272 Information to pass through to WMS. 

273 is_global : `bool`, optional 

274 If set, all job queues (and their histories) will be queried for 

275 job information. Defaults to False which means that only the local 

276 job queue will be queried. 

277 

278 Returns 

279 ------- 

280 job_ids : `list` [`Any`] 

281 Only job ids to be used by cancel and other functions. Typically 

282 this means top-level jobs (i.e., not child jobs). 

283 """ 

284 _LOG.debug("list_submitted_jobs params: " 

285 "wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s", 

286 wms_id, user, require_bps, pass_thru, is_global) 

287 

288 # Determine which Schedds will be queried for job information. 

289 coll = htcondor.Collector() 

290 

291 schedd_ads = [] 

292 if is_global: 

293 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

294 else: 

295 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

296 

297 # Construct appropriate constraint expression using provided arguments. 

298 constraint = "False" 

299 if wms_id is None: 

300 if user is not None: 

301 constraint = f'(Owner == "{user}")' 

302 else: 

303 schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id) 

304 if cluster_id is not None: 

305 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})" 

306 

307 # If provided id is either a submission path or a global id, 

308 # make sure the right Schedd will be queried regardless of 

309 # 'is_global' value. 

310 if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}: 

311 schedd_ads = [schedd_ad] 

312 if require_bps: 

313 constraint += ' && (bps_isjob == "True")' 

314 if pass_thru: 

315 if "-forcex" in pass_thru: 

316 pass_thru_2 = pass_thru.replace("-forcex", "") 

317 if pass_thru_2 and not pass_thru_2.isspace(): 

318 constraint += f" && ({pass_thru_2})" 

319 else: 

320 constraint += f" && ({pass_thru})" 

321 

322 # Create a list of scheduler daemons which need to be queried. 

323 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

324 

325 _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds)) 

326 results = condor_q(constraint=constraint, schedds=schedds) 

327 

328 # Prune child jobs where DAG job is in queue (i.e., aren't orphans). 

329 job_ids = [] 

330 for schedd_name, job_info in results.items(): 

331 for job_id, job_ad in job_info.items(): 

332 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None")) 

333 if "DAGManJobId" not in job_ad: 

334 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

335 else: 

336 _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0") 

337 _LOG.debug("\tin jobs.keys() = %s", job_info.keys()) 

338 if f"{job_ad['DAGManJobId']}.0" not in job_info: # orphaned job 

339 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

340 

341 _LOG.debug("job_ids = %s", job_ids) 

342 return job_ids 
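# Illustrative constraint (hypothetical values): for wms_id=1234 with
# require_bps=True and no pass_thru, the expression assembled above is
#
#     (DAGManJobId == 1234 || ClusterId == 1234) && (bps_isjob == "True")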

343 

344 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False): 

345 """Return run information based upon given constraints. 

346 

347 Parameters 

348 ---------- 

349 wms_workflow_id : `str`, optional 

350 Limit to specific run based on id. 

351 user : `str`, optional 

352 Limit results to runs for this user. 

353 hist : `float`, optional 

354 Limit history search to this many days. Defaults to 0. 

355 pass_thru : `str`, optional 

356 Constraints to pass through to HTCondor. 

357 is_global : `bool`, optional 

358 If set, all job queues (and their histories) will be queried for 

359 job information. Defaults to False which means that only the local 

360 job queue will be queried. 

361 

362 Returns 

363 ------- 

364 runs : `list` [`lsst.ctrl.bps.WmsRunReport`] 

365 Information about runs from given job information. 

366 message : `str` 

367 Extra message for report command to print. This could be pointers 

368 to documentation or to WMS specific commands. 

369 """ 

370 if wms_workflow_id: 

371 id_type = _wms_id_type(wms_workflow_id) 

372 if id_type == WmsIdType.LOCAL: 

373 schedulers = _locate_schedds(locate_all=is_global) 

374 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

375 elif id_type == WmsIdType.GLOBAL: 

376 schedulers = _locate_schedds(locate_all=True) 

377 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

378 elif id_type == WmsIdType.PATH: 

379 run_reports, message = _report_from_path(wms_workflow_id) 

380 else: 

381 run_reports, message = {}, 'Invalid job id' 

382 else: 

383 schedulers = _locate_schedds(locate_all=is_global) 

384 run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers) 

385 _LOG.debug("report: %s, %s", run_reports, message) 

386 

387 return list(run_reports.values()), message 

388 

389 def cancel(self, wms_id, pass_thru=None): 

390 """Cancel submitted workflows/jobs. 

391 

392 Parameters 

393 ---------- 

394 wms_id : `str` 

395 Id or path of job that should be canceled. 

396 pass_thru : `str`, optional 

397 Information to pass through to WMS. 

398 

399 Returns 

400 ------- 

401 deleted : `bool` 

402 Whether successful deletion or not. Currently, if any doubt or any 

403 individual jobs not deleted, return False. 

404 message : `str` 

405 Any message from WMS (e.g., error details). 

406 """ 

407 _LOG.debug("Canceling wms_id = %s", wms_id) 

408 

409 schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id) 

410 

411 if cluster_id is None: 

412 deleted = False 

413 message = "invalid id" 

414 else: 

415 _LOG.debug("Canceling job managed by schedd_name = %s with cluster_id = %s", 

416 schedd_ad["Name"], cluster_id) 

417 schedd = htcondor.Schedd(schedd_ad) 

418 

419 constraint = f"ClusterId == {cluster_id}" 

420 if pass_thru is not None and "-forcex" in pass_thru: 

421 pass_thru_2 = pass_thru.replace("-forcex", "") 

422 if pass_thru_2 and not pass_thru_2.isspace(): 

423 constraint += f"&& ({pass_thru_2})" 

424 _LOG.debug("JobAction.RemoveX constraint = %s", constraint) 

425 results = schedd.act(htcondor.JobAction.RemoveX, constraint) 

426 else: 

427 if pass_thru: 

428 constraint += f"&& ({pass_thru})" 

429 _LOG.debug("JobAction.Remove constraint = %s", constraint) 

430 results = schedd.act(htcondor.JobAction.Remove, constraint) 

431 _LOG.debug("Remove results: %s", results) 

432 

433 if results["TotalSuccess"] > 0 and results["TotalError"] == 0: 

434 deleted = True 

435 message = "" 

436 else: 

437 deleted = False 

438 if results["TotalSuccess"] == 0 and results["TotalError"] == 0: 

439 message = "no such bps job in batch queue" 

440 else: 

441 message = f"unknown problems deleting: {results}" 

442 

443 _LOG.debug("deleted: %s; message = %s", deleted, message) 

444 return deleted, message 

445 

446 

447class HTCondorWorkflow(BaseWmsWorkflow): 

448 """Single HTCondor workflow. 

449 

450 Parameters 

451 ---------- 

452 name : `str` 

453 Unique name for Workflow used when naming files. 

454 config : `lsst.ctrl.bps.BpsConfig` 

455 BPS configuration that includes necessary submit/runtime information. 

456 """ 

457 def __init__(self, name, config=None): 

458 super().__init__(name, config) 

459 self.dag = None 

460 

461 @classmethod 

462 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class): 

463 # Docstring inherited 

464 htc_workflow = cls(generic_workflow.name, config) 

465 htc_workflow.dag = HTCDag(name=generic_workflow.name) 

466 

467 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs) 

468 htc_workflow.dag.add_attribs(generic_workflow.run_attrs) 

469 htc_workflow.dag.add_attribs({"bps_wms_service": service_class, 

470 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}", 

471 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts), 

472 "bps_job_summary": create_count_summary(generic_workflow.job_counts)}) 

473 

474 _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""}) 

475 if isinstance(tmp_template, str): 

476 subdir_template = defaultdict(lambda: tmp_template) 

477 else: 

478 subdir_template = tmp_template 

479 

480 # Create all DAG jobs 

481 site_values = {} # cache compute site specific values to reduce config lookups 

482 for job_name in generic_workflow: 

483 gwjob = generic_workflow.get_job(job_name) 

484 if gwjob.compute_site not in site_values: 

485 site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site) 

486 htc_job = _create_job(subdir_template[gwjob.label], site_values[gwjob.compute_site], 

487 generic_workflow, gwjob, out_prefix) 

488 htc_workflow.dag.add_job(htc_job) 

489 

490 # Add job dependencies to the DAG 

491 for job_name in generic_workflow: 

492 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name)) 

493 

494 # If final job exists in generic workflow, create DAG final job 

495 final = generic_workflow.get_final() 

496 if final and isinstance(final, GenericWorkflowJob): 

497 if final.compute_site and final.compute_site not in site_values: 

498 site_values[final.compute_site] = _gather_site_values(config, final.compute_site) 

499 final_htjob = _create_job(subdir_template[final.label], site_values[final.compute_site], 

500 generic_workflow, final, out_prefix) 

501 if "post" not in final_htjob.dagcmds: 

502 final_htjob.dagcmds["post"] = f"{os.path.dirname(__file__)}/final_post.sh" \ 

503 f" {final.name} $DAG_STATUS $RETURN" 

504 htc_workflow.dag.add_final_job(final_htjob) 

505 elif final and isinstance(final, GenericWorkflow): 

506 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job") 

507 elif final: 

508 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})") 

509 

510 return htc_workflow 

511 

512 def write(self, out_prefix): 

513 """Output HTCondor DAGMan files needed for workflow submission. 

514 

515 Parameters 

516 ---------- 

517 out_prefix : `str` 

518 Directory prefix for HTCondor files. 

519 """ 

520 self.submit_path = out_prefix 

521 os.makedirs(out_prefix, exist_ok=True) 

522 

523 # Write down the workflow in HTCondor format. 

524 self.dag.write(out_prefix, "jobs/{self.label}") 

525 

526 

527def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix): 

528 """Convert GenericWorkflow job nodes to DAG jobs. 

529 

530 Parameters 

531 ---------- 

532 subdir_template : `str` 

533 Template for making subdirs. 

534 site_values : `dict` 

535 Site specific values 

536 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

537 Generic workflow that is being converted. 

538 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

539 The generic job to convert to a HTCondor job. 

540 out_prefix : `str` 

541 Directory prefix for HTCondor files. 

542 

543 Returns 

544 ------- 

545 htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob` 

546 The HTCondor job equivalent to the given generic job. 

547 """ 

548 htc_job = HTCJob(gwjob.name, label=gwjob.label) 

549 

550 curvals = defaultdict(str) 

551 curvals["label"] = gwjob.label 

552 if gwjob.tags: 

553 curvals.update(gwjob.tags) 

554 

555 subdir = subdir_template.format_map(curvals) 

556 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub" 

557 

558 htc_job_cmds = { 

559 "universe": "vanilla", 

560 "should_transfer_files": "YES", 

561 "when_to_transfer_output": "ON_EXIT_OR_EVICT", 

562 "transfer_output_files": '""', # Set to empty string to disable 

563 "transfer_executable": "False", 

564 "getenv": "True", 

565 

566 # Exceeding memory sometimes triggering SIGBUS error. Tell htcondor 

567 # to put SIGBUS jobs on hold. 

568 "on_exit_hold": "(ExitBySignal == true) && (ExitSignal == 7)", 

569 "on_exit_hold_reason": '"Job raised a signal 7. Usually means job has gone over memory limit."', 

570 "on_exit_hold_subcode": "34" 

571 } 

572 

573 htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob)) 

574 

575 # job stdout, stderr, htcondor user log. 

576 for key in ("output", "error", "log"): 

577 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}") 

578 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key]) 

579 

580 htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], 

581 out_prefix)) 

582 

583 # Add the job cmds dict to the job object. 

584 htc_job.add_job_cmds(htc_job_cmds) 

585 

586 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob)) 

587 

588 # Add job attributes to job. 

589 _LOG.debug("gwjob.attrs = %s", gwjob.attrs) 

590 htc_job.add_job_attrs(gwjob.attrs) 

591 htc_job.add_job_attrs(site_values["attrs"]) 

592 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)}) 

593 htc_job.add_job_attrs({"bps_job_name": gwjob.name, 

594 "bps_job_label": gwjob.label}) 

595 

596 return htc_job 
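# A short sketch of the subdirectory templating used above (template and tag
# values are hypothetical):
#
#     curvals = defaultdict(str, {"label": "calibrate", "visit": "903334"})
#     subdir = "{label}/{visit}".format_map(curvals)   # -> "calibrate/903334"
#     Path("jobs") / subdir / "job_name.sub"           # -> jobs/calibrate/903334/job_name.sub
#     # Missing template keys render as "" because curvals is a defaultdict(str).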

597 

598 

599def _translate_job_cmds(cached_vals, generic_workflow, gwjob): 

600 """Translate the job data that are one to one mapping 

601 

602 Parameters 

603 ---------- 

604 cached_vals : `dict` [`str`, `Any`] 

605 Config values common to jobs with same label. 

606 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

607 Generic workflow that contains the job being converted. 

608 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

609 Generic workflow job to be converted. 

610 

611 Returns 

612 ------- 

613 htc_job_commands : `dict` [`str`, `Any`] 

614 Contains commands which can appear in the HTCondor submit description 

615 file. 

616 """ 

617 # Values in the job script that are just name mappings. 

618 job_translation = {"mail_to": "notify_user", 

619 "when_to_mail": "notification", 

620 "request_cpus": "request_cpus", 

621 "priority": "priority", 

622 "category": "category"} 

623 

624 jobcmds = {} 

625 for gwkey, htckey in job_translation.items(): 

626 jobcmds[htckey] = getattr(gwjob, gwkey, None) 

627 

628 # job commands that need modification 

629 if gwjob.number_of_retries: 

630 jobcmds["max_retries"] = f"{gwjob.number_of_retries}" 

631 

632 if gwjob.retry_unless_exit: 

633 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}" 

634 

635 if gwjob.request_disk: 

636 jobcmds["request_disk"] = f"{gwjob.request_disk}MB" 

637 

638 if gwjob.request_memory: 

639 jobcmds["request_memory"] = f"{gwjob.request_memory}" 

640 

641 if gwjob.memory_multiplier: 

642 # Do not use try-except! At the moment, BpsConfig returns an empty 

643 # string if it does not contain the key. 

644 memory_limit = cached_vals["memoryLimit"] 

645 if not memory_limit: 

646 raise RuntimeError("Memory autoscaling enabled, but automatic detection of the memory limit " 

647 "failed; setting it explicitly with 'memoryLimit' or changing worker node " 

648 "search pattern 'executeMachinesPattern' might help.") 

649 

650 # Set maximal amount of memory job can ask for. 

651 # 

652 # The check below assumes that 'memory_limit' was set to a value which 

653 # realistically reflects actual physical limitations of a given compute 

654 # resource. 

655 memory_max = memory_limit 

656 if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit: 

657 memory_max = gwjob.request_memory_max 

658 

659 # Make job ask for more memory each time it failed due to insufficient 

660 # memory requirements. 

661 jobcmds["request_memory"] = \ 

662 _create_request_memory_expr(gwjob.request_memory, gwjob.memory_multiplier, memory_max) 

663 

664 # Periodically release jobs which are being held due to exceeding 

665 # memory. Stop doing that (by removing the job from the HTCondor queue) 

666 # after the maximal number of retries has been reached or the job was 

667 # already run at maximal allowed memory. 

668 jobcmds["periodic_release"] = \ 

669 _create_periodic_release_expr(gwjob.request_memory, gwjob.memory_multiplier, memory_max) 

670 jobcmds["periodic_remove"] = \ 

671 _create_periodic_remove_expr(gwjob.request_memory, gwjob.memory_multiplier, memory_max) 

672 

673 # Assume concurrency_limit implemented using HTCondor concurrency limits. 

674 # May need to move to special site-specific implementation if sites use 

675 # other mechanisms. 

676 if gwjob.concurrency_limit: 

677 jobcmds["concurrency_limit"] = gwjob.concurrency_limit 

678 

679 # Handle command line 

680 if gwjob.executable.transfer_executable: 

681 jobcmds["transfer_executable"] = "True" 

682 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri) 

683 else: 

684 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri) 

685 

686 if gwjob.arguments: 

687 arguments = gwjob.arguments 

688 arguments = _replace_cmd_vars(arguments, gwjob) 

689 arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob) 

690 arguments = _fix_env_var_syntax(arguments) 

691 jobcmds["arguments"] = arguments 

692 

693 # Add extra "pass-thru" job commands 

694 if gwjob.profile: 

695 for key, val in gwjob.profile.items(): 

696 jobcmds[key] = htc_escape(val) 

697 for key, val in cached_vals["profile"].items(): 

698 jobcmds[key] = htc_escape(val) 

699 

700 return jobcmds 
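# Illustrative output (hypothetical GenericWorkflowJob values): a job with
# request_cpus=1, request_memory=2048, request_disk=20480 and
# number_of_retries=3 would contribute, among others,
#
#     {"request_cpus": 1, "max_retries": "3",
#      "request_disk": "20480MB", "request_memory": "2048", ...}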

701 

702 

703def _translate_dag_cmds(gwjob): 

704 """Translate job values into DAGMan commands. 

705 

706 Parameters 

707 ---------- 

708 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

709 Job containing values to be translated. 

710 

711 Returns 

712 ------- 

713 dagcmds : `dict` [`str`, `Any`] 

714 DAGMan commands for the job. 

715 """ 

716 # Values in the dag script that are just name mappings. 

717 dag_translation = {"abort_on_value": "abort_dag_on", 

718 "abort_return_value": "abort_exit"} 

719 

720 dagcmds = {} 

721 for gwkey, htckey in dag_translation.items(): 

722 dagcmds[htckey] = getattr(gwjob, gwkey, None) 

723 

724 # Still to be coded: vars "pre_cmdline", "post_cmdline" 

725 return dagcmds 

726 

727 

728def _fix_env_var_syntax(oldstr): 

729 """Change ENV place holders to HTCondor Env var syntax. 

730 

731 Parameters 

732 ---------- 

733 oldstr : `str` 

734 String in which environment variable syntax is to be fixed. 

735 

736 Returns 

737 ------- 

738 newstr : `str` 

739 Given string with environment variable syntax fixed. 

740 """ 

741 newstr = oldstr 

742 for key in re.findall(r"<ENV:([^>]+)>", oldstr): 

743 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})") 

744 return newstr 
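# Doctest-style example of the translation above:
#
#     >>> _fix_env_var_syntax("<ENV:HOME>/repo --user <ENV:USER>")
#     '$ENV(HOME)/repo --user $ENV(USER)'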

745 

746 

747def _replace_file_vars(use_shared, arguments, workflow, gwjob): 

748 """Replace file placeholders in command line arguments with correct 

749 physical file names. 

750 

751 Parameters 

752 ---------- 

753 use_shared : `bool` 

754 Whether HTCondor can assume shared filesystem. 

755 arguments : `str` 

756 Arguments string in which to replace file placeholders. 

757 workflow : `lsst.ctrl.bps.GenericWorkflow` 

758 Generic workflow that contains file information. 

759 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

760 The job corresponding to the arguments. 

761 

762 Returns 

763 ------- 

764 arguments : `str` 

765 Given arguments string with file placeholders replaced. 

766 """ 

767 # Replace input file placeholders with paths. 

768 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False): 

769 if not gwfile.wms_transfer: 

770 # Must assume full URI if in command line and told WMS is not 

771 # responsible for transferring file. 

772 uri = gwfile.src_uri 

773 elif use_shared: 

774 if gwfile.job_shared: 

775 # Have shared filesystems and jobs can share file. 

776 uri = gwfile.src_uri 

777 else: 

778 # Taking advantage of inside knowledge. Not future-proof. 

779 # Temporary fix until there is a job wrapper that pulls files 

780 # within the job. 

781 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml": 

782 uri = "butler.yaml" 

783 else: 

784 uri = os.path.basename(gwfile.src_uri) 

785 else: # Using push transfer 

786 uri = os.path.basename(gwfile.src_uri) 

787 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

788 

789 # Replace output file placeholders with paths. 

790 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False): 

791 if not gwfile.wms_transfer: 

792 # Must assume full URI if in command line and told WMS is not 

793 # responsible for transferring file. 

794 uri = gwfile.src_uri 

795 elif use_shared: 

796 if gwfile.job_shared: 

797 # Have shared filesystems and jobs can share file. 

798 uri = gwfile.src_uri 

799 else: 

800 uri = os.path.basename(gwfile.src_uri) 

801 else: # Using push transfer 

802 uri = os.path.basename(gwfile.src_uri) 

803 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

804 return arguments 
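# Illustrative behavior (hypothetical file): with use_shared=False and
# wms_transfer=True, an input whose src_uri is "/submit/run1/job.qgraph"
# replaces "<FILE:qgraphFile>" with just "job.qgraph", since push transfer
# places the file in the job's scratch directory.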

805 

806 

807def _replace_cmd_vars(arguments, gwjob): 

808 """Replace format-style placeholders in arguments. 

809 

810 Parameters 

811 ---------- 

812 arguments : `str` 

813 Arguments string in which to replace placeholders. 

814 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

815 Job containing values to be used to replace placeholders 

816 (in particular gwjob.cmdvals). 

817 

818 Returns 

819 ------- 

820 arguments : `str` 

821 Given arguments string with placeholders replaced. 

822 """ 

823 try: 

824 arguments = arguments.format(**gwjob.cmdvals) 

825 except (KeyError, TypeError): # TypeError in case None instead of {} 

826 _LOG.error("Could not replace command variables:\n" 

827 "arguments: %s\n" 

828 "cmdvals: %s", arguments, gwjob.cmdvals) 

829 raise 

830 return arguments 
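# Minimal sketch (hypothetical cmdvals) of the str.format substitution above:
#
#     arguments = "run --input {inFile} --id {nodeId}"
#     cmdvals = {"inFile": "job.qgraph", "nodeId": 42}
#     arguments.format(**cmdvals)
#     # -> 'run --input job.qgraph --id 42'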

831 

832 

833def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str): 

834 """Add job input files from generic workflow to job. 

835 

836 Parameters 

837 ---------- 

838 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

839 The generic workflow (e.g., has executable name and arguments). 

840 job_name : `str` 

841 Unique name for the job. 

842 use_shared : `bool` 

843 Whether job has access to files via shared filesystem. 

844 out_prefix : `str` 

845 The root directory into which all WMS-specific files are written. 

846 

847 Returns 

848 ------- 

849 htc_commands : `dict` [`str`, `str`] 

850 HTCondor commands for the job submission script. 

851 """ 

852 htc_commands = {} 

853 inputs = [] 

854 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True): 

855 _LOG.debug("src_uri=%s", gwf_file.src_uri) 

856 

857 uri = Path(gwf_file.src_uri) 

858 

859 # Note if use_shared and job_shared, don't need to transfer file. 

860 

861 if not use_shared: # Copy file using push to job 

862 inputs.append(str(uri.relative_to(out_prefix))) 

863 elif not gwf_file.job_shared: # Jobs require own copy 

864 

865 # Using a shared filesystem, but the job still needs its own copy. Use 

866 # HTCondor's curl plugin for a local copy. 

867 

868 # Execution butler is represented as a directory which the 

869 # curl plugin does not handle. Taking advantage of inside 

870 # knowledge as a temporary fix until there is a job wrapper that pulls 

871 # files within the job. 

872 if gwf_file.name == "butlerConfig": 

873 # The execution butler directory doesn't normally exist until 

874 # the submit phase so checking for suffix instead of using 

875 # is_dir(). If other non-yaml file exists they would have a 

876 # different gwf_file.name. 

877 if uri.suffix == ".yaml": # Single file, so just copy. 

878 inputs.append(f"file://{uri}") 

879 else: 

880 inputs.append(f"file://{uri / 'butler.yaml'}") 

881 inputs.append(f"file://{uri / 'gen3.sqlite3'}") 

882 elif uri.is_dir(): 

883 raise RuntimeError("HTCondor plugin cannot transfer directories locally within job " 

884 f"{gwf_file.src_uri}") 

885 else: 

886 inputs.append(f"file://{uri}") 

887 

888 if inputs: 

889 htc_commands["transfer_input_files"] = ",".join(inputs) 

890 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"]) 

891 return htc_commands 
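# Illustrative result (hypothetical paths): with use_shared=True and a
# 'butlerConfig' input that is not job_shared and points at an execution
# butler directory, the returned commands would resemble
#
#     {"transfer_input_files":
#          "file:///submit/run1/EXEC_REPO/butler.yaml,"
#          "file:///submit/run1/EXEC_REPO/gen3.sqlite3"}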

892 

893 

894def _report_from_path(wms_path): 

895 """Gather run information from a given run directory. 

896 

897 Parameters 

898 ---------- 

899 wms_path : `str` 

900 The directory containing the submit side files (e.g., HTCondor files). 

901 

902 Returns 

903 ------- 

904 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

905 Run information for the detailed report. The key is the HTCondor id 

906 and the value is a collection of report information for that run. 

907 message : `str` 

908 Message to be printed with the summary report. 

909 """ 

910 wms_workflow_id, jobs, message = _get_info_from_path(wms_path) 

911 if wms_workflow_id == MISSING_ID: 

912 run_reports = {} 

913 else: 

914 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs) 

915 return run_reports, message 

916 

917 

918def _report_from_id(wms_workflow_id, hist, schedds=None): 

919 """Gather run information using workflow id. 

920 

921 Parameters 

922 ---------- 

923 wms_workflow_id : `str` 

924 Limit to specific run based on id. 

925 hist : `float` 

926 Limit history search to this many days. 

927 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional 

928 HTCondor schedulers which to query for job information. If None 

929 (default), all queries will be run against the local scheduler only. 

930 

931 Returns 

932 ------- 

933 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

934 Run information for the detailed report. The key is the HTCondor id 

935 and the value is a collection of report information for that run. 

936 message : `str` 

937 Message to be printed with the summary report. 

938 """ 

939 dag_constraint = 'regexp("dagman$", Cmd)' 

940 try: 

941 cluster_id = int(float(wms_workflow_id)) 

942 except ValueError: 

943 dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"' 

944 else: 

945 dag_constraint += f" && ClusterId == {cluster_id}" 

946 

947 # With the current implementation of the condor_* functions the query will 

948 # always return only one match per Scheduler. 

949 # 

950 # Even in the highly unlikely situation where HTCondor history (which 

951 # condor_search queries too) is long enough to have jobs from before the 

952 # cluster ids were rolled over (and as a result there is more than one job 

953 # with the same cluster id) they will not show up in the results. 

954 schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds) 

955 if len(schedd_dag_info) == 0: 

956 run_reports = {} 

957 message = "" 

958 elif len(schedd_dag_info) == 1: 

959 _, dag_info = schedd_dag_info.popitem() 

960 dag_id, dag_ad = dag_info.popitem() 

961 

962 # Create a mapping between jobs and their classads. The keys will be 

963 # of format 'ClusterId.ProcId'. 

964 job_info = {dag_id: dag_ad} 

965 

966 # Find jobs (nodes) belonging to that DAGMan job. 

967 job_constraint = f"DAGManJobId == {int(float(dag_id))}" 

968 schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds) 

969 if schedd_job_info: 

970 _, node_info = schedd_job_info.popitem() 

971 job_info.update(node_info) 

972 

973 # Collect additional pieces of information about jobs using HTCondor 

974 # files in the submission directory. 

975 _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"]) 

976 _update_jobs(job_info, path_jobs) 

977 

978 run_reports = _create_detailed_report_from_jobs(dag_id, job_info) 

979 else: 

980 ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()] 

981 run_reports = {} 

982 message = f"More than one job matches id '{wms_workflow_id}', " \ 

983 f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids" 

984 return run_reports, message 

985 

986 

987def _get_info_from_path(wms_path): 

988 """Gather run information from a given run directory. 

989 

990 Parameters 

991 ---------- 

992 wms_path : `str` 

993 Directory containing HTCondor files. 

994 

995 Returns 

996 ------- 

997 wms_workflow_id : `str` 

998 The run id which is a DAGman job id. 

999 jobs : `dict` [`str`, `dict` [`str`, `Any`]] 

1000 Information about jobs read from files in the given directory. 

1001 The key is the HTCondor id and the value is a dictionary of HTCondor 

1002 keys and values. 

1003 message : `str` 

1004 Message to be printed with the summary report. 

1005 """ 

1006 messages = [] 

1007 try: 

1008 wms_workflow_id, jobs = read_dag_log(wms_path) 

1009 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs) 

1010 _update_jobs(jobs, read_node_status(wms_path)) 

1011 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs) 

1012 

1013 # Add more info for DAGman job 

1014 job = jobs[wms_workflow_id] 

1015 job.update(read_dag_status(wms_path)) 

1016 

1017 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs) 

1018 if "bps_run" not in job: 

1019 _add_run_info(wms_path, job) 

1020 

1021 message = htc_check_dagman_output(wms_path) 

1022 if message: 

1023 messages.append(message) 

1024 _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id, 

1025 jobs[wms_workflow_id]["total_jobs"]) 

1026 

1027 # Add extra pieces of information which cannot be found in HTCondor 

1028 # generated files like 'GlobalJobId'. 

1029 # 

1030 # Do not treat absence of this file as a serious error. Neither runs 

1031 # submitted with earlier versions of the plugin nor the runs submitted 

1032 # with the Pegasus plugin will have it at the moment. However, once 

1033 # the Pegasus plugin has its own report() method (instead of sneakily 

1034 # using HTCondor's), the lack of that file should be treated as 

1035 # seriously as the lack of any other file. 

1036 try: 

1037 job_info = read_dag_info(wms_path) 

1038 except FileNotFoundError as exc: 

1039 message = f"Warn: Some information may not be available: {exc}" 

1040 messages.append(message) 

1041 else: 

1042 schedd_name = next(iter(job_info)) 

1043 job_ad = next(iter(job_info[schedd_name].values())) 

1044 job.update(job_ad) 

1045 except FileNotFoundError: 

1046 message = f"Could not find HTCondor files in '{wms_path}'" 

1047 _LOG.warning(message) 

1048 messages.append(message) 

1049 wms_workflow_id = MISSING_ID 

1050 jobs = {} 

1051 

1052 message = '\n'.join([msg for msg in messages if msg]) 

1053 return wms_workflow_id, jobs, message 

1054 

1055 

1056def _create_detailed_report_from_jobs(wms_workflow_id, jobs): 

1057 """Gather run information to be used in generating summary reports. 

1058 

1059 Parameters 

1060 ---------- 

1061 wms_workflow_id : `str` 

1062 The run id to create the report for. 

1063 jobs : `dict` [`str`, `dict` [`str`, Any]] 

1064 Mapping HTCondor job id to job information. 

1065 

1066 Returns 

1067 ------- 

1068 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1069 Run information for the detailed report. The key is the given HTCondor 

1070 id and the value is a collection of report information for that run. 

1071 """ 

1072 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id]) 

1073 dag_job = jobs[wms_workflow_id] 

1074 report = WmsRunReport(wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}", 

1075 global_wms_id=dag_job.get("GlobalJobId", "MISS"), 

1076 path=dag_job["Iwd"], 

1077 label=dag_job.get("bps_job_label", "MISS"), 

1078 run=dag_job.get("bps_run", "MISS"), 

1079 project=dag_job.get("bps_project", "MISS"), 

1080 campaign=dag_job.get("bps_campaign", "MISS"), 

1081 payload=dag_job.get("bps_payload", "MISS"), 

1082 operator=_get_owner(dag_job), 

1083 run_summary=_get_run_summary(dag_job), 

1084 state=_htc_status_to_wms_state(dag_job), 

1085 jobs=[], 

1086 total_number_jobs=dag_job["total_jobs"], 

1087 job_state_counts=dag_job["state_counts"]) 

1088 

1089 for job_id, job_info in jobs.items(): 

1090 try: 

1091 if job_info["ClusterId"] != int(float(wms_workflow_id)): 

1092 job_report = WmsJobReport(wms_id=job_id, 

1093 name=job_info.get("DAGNodeName", job_id), 

1094 label=job_info.get("bps_job_label", 

1095 pegasus_name_to_label(job_info["DAGNodeName"])), 

1096 state=_htc_status_to_wms_state(job_info)) 

1097 if job_report.label == "init": 

1098 job_report.label = "pipetaskInit" 

1099 report.jobs.append(job_report) 

1100 except KeyError as ex: 

1101 _LOG.error("Job missing key '%s': %s", str(ex), job_info) 

1102 raise 

1103 

1104 run_reports = {report.wms_id: report} 

1105 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports) 

1106 return run_reports 

1107 

1108 

1109def _summary_report(user, hist, pass_thru, schedds=None): 

1110 """Gather run information to be used in generating summary reports. 

1111 

1112 Parameters 

1113 ---------- 

1114 user : `str` 

1115 Run lookup restricted to given user. 

1116 hist : `float` 

1117 How many previous days to search for run information. 

1118 pass_thru : `str` 

1119 Advanced users can define the HTCondor constraint to be used 

1120 when searching queue and history. 

1121 

1122 Returns 

1123 ------- 

1124 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1125 Run information for the summary report. The keys are HTCondor ids and 

1126 the values are collections of report information for each run. 

1127 message : `str` 

1128 Message to be printed with the summary report. 

1129 """ 

1130 # only doing summary report so only look for dagman jobs 

1131 if pass_thru: 

1132 constraint = pass_thru 

1133 else: 

1134 # Notes: 

1135 # * bps_isjob == 'True' isn't getting set for DAG jobs that are 

1136 # manually restarted. 

1137 # * Any job with DAGManJobID isn't a DAG job 

1138 constraint = 'bps_isjob == "True" && JobUniverse == 7' 

1139 if user: 

1140 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")' 

1141 

1142 job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds) 

1143 

1144 # Have list of DAGMan jobs, need to get run_report info. 

1145 run_reports = {} 

1146 for jobs in job_info.values(): 

1147 for job_id, job in jobs.items(): 

1148 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1149 # If didn't get from queue information (e.g., Kerberos bug), 

1150 # try reading from file. 

1151 if total_jobs == 0: 

1152 try: 

1153 job.update(read_dag_status(job["Iwd"])) 

1154 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1155 except StopIteration: 

1156 pass # Don't fail the report if the HTCondor files can't be found. 

1157 

1158 if "bps_run" not in job: 

1159 _add_run_info(job["Iwd"], job) 

1160 report = WmsRunReport(wms_id=job_id, 

1161 global_wms_id=job["GlobalJobId"], 

1162 path=job["Iwd"], 

1163 label=job.get("bps_job_label", "MISS"), 

1164 run=job.get("bps_run", "MISS"), 

1165 project=job.get("bps_project", "MISS"), 

1166 campaign=job.get("bps_campaign", "MISS"), 

1167 payload=job.get("bps_payload", "MISS"), 

1168 operator=_get_owner(job), 

1169 run_summary=_get_run_summary(job), 

1170 state=_htc_status_to_wms_state(job), 

1171 jobs=[], 

1172 total_number_jobs=total_jobs, 

1173 job_state_counts=state_counts) 

1174 run_reports[report.global_wms_id] = report 

1175 

1176 return run_reports, "" 

1177 

1178 

1179def _add_run_info(wms_path, job): 

1180 """Find BPS run information elsewhere for runs without bps attributes. 

1181 

1182 Parameters 

1183 ---------- 

1184 wms_path : `str` 

1185 Path to submit files for the run. 

1186 job : `dict` [`str`, `Any`] 

1187 HTCondor dag job information. 

1188 

1189 Raises 

1190 ------ 

1191 StopIteration 

1192 If cannot find file it is looking for. Permission errors are 

1193 caught and job's run is marked with error. 

1194 """ 

1195 path = Path(wms_path) / "jobs" 

1196 try: 

1197 subfile = next(path.glob("**/*.sub")) 

1198 except (StopIteration, PermissionError): 

1199 job["bps_run"] = "Unavailable" 

1200 else: 

1201 _LOG.debug("_add_run_info: subfile = %s", subfile) 

1202 try: 

1203 with open(subfile, "r", encoding='utf-8') as fh: 

1204 for line in fh: 

1205 if line.startswith("+bps_"): 

1206 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line) 

1207 if m: 

1208 _LOG.debug("Matching line: %s", line) 

1209 job[m.group(1)] = m.group(2).replace('"', "") 

1210 else: 

1211 _LOG.debug("Could not parse attribute: %s", line) 

1212 except PermissionError: 

1213 job["bps_run"] = "PermissionError" 

1214 _LOG.debug("After adding job = %s", job) 

1215 

1216 

1217def _get_owner(job): 

1218 """Get the owner of a dag job. 

1219 

1220 Parameters 

1221 ---------- 

1222 job : `dict` [`str`, `Any`] 

1223 HTCondor dag job information. 

1224 

1225 Returns 

1226 ------- 

1227 owner : `str` 

1228 Owner of the dag job. 

1229 """ 

1230 owner = job.get("bps_operator", None) 

1231 if not owner: 

1232 owner = job.get("Owner", None) 

1233 if not owner: 

1234 _LOG.warning("Could not get Owner from htcondor job: %s", job) 

1235 owner = "MISS" 

1236 return owner 

1237 

1238 

1239def _get_run_summary(job): 

1240 """Get the run summary for a job. 

1241 

1242 Parameters 

1243 ---------- 

1244 job : `dict` [`str`, `Any`] 

1245 HTCondor dag job information. 

1246 

1247 Returns 

1248 ------- 

1249 summary : `str` 

1250 Number of jobs per PipelineTask label in approximate pipeline order. 

1251 Format: <label>:<count>[;<label>:<count>]+ 

1252 """ 

1253 summary = job.get("bps_job_summary", job.get("bps_run_summary", None)) 

1254 if not summary: 

1255 summary, _ = summary_from_dag(job["Iwd"]) 

1256 if not summary: 

1257 _LOG.warning("Could not get run summary for htcondor job: %s", job) 

1258 _LOG.debug("_get_run_summary: summary=%s", summary) 

1259 

1260 # Workaround sometimes using init vs pipetaskInit 

1261 summary = summary.replace("init:", "pipetaskInit:") 

1262 

1263 if "pegasus_version" in job and "pegasus" not in summary: 

1264 summary += ";pegasus:0" 

1265 

1266 return summary 
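# Example of the summary string format documented above (hypothetical counts):
#
#     "pipetaskInit:1;isr:30;characterizeImage:30;calibrate:30;finalJob:1"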

1267 

1268 

1269def _get_state_counts_from_jobs(wms_workflow_id, jobs): 

1270 """Count number of jobs per WMS state. 

1271 

1272 Parameters 

1273 ---------- 

1274 wms_workflow_id : `str` 

1275 HTCondor job id. 

1276 jobs : `dict` [`str`, `Any`] 

1277 HTCondor dag job information. 

1278 

1279 Returns 

1280 ------- 

1281 total_count : `int` 

1282 Total number of dag nodes. 

1283 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1284 Keys are the different WMS states and values are counts of jobs 

1285 that are in that WMS state. 

1286 """ 

1287 state_counts = dict.fromkeys(WmsStates, 0) 

1288 

1289 for jid, jinfo in jobs.items(): 

1290 if jid != wms_workflow_id: 

1291 state_counts[_htc_status_to_wms_state(jinfo)] += 1 

1292 

1293 total_counted = sum(state_counts.values()) 

1294 if "NodesTotal" in jobs[wms_workflow_id]: 

1295 total_count = jobs[wms_workflow_id]["NodesTotal"] 

1296 else: 

1297 total_count = total_counted 

1298 

1299 state_counts[WmsStates.UNREADY] += total_count - total_counted 

1300 

1301 return total_count, state_counts 

1302 

1303 

1304def _get_state_counts_from_dag_job(job): 

1305 """Count number of jobs per WMS state. 

1306 

1307 Parameters 

1308 ---------- 

1309 job : `dict` [`str`, `Any`] 

1310 HTCondor dag job information. 

1311 

1312 Returns 

1313 ------- 

1314 total_count : `int` 

1315 Total number of dag nodes. 

1316 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1317 Keys are the different WMS states and values are counts of jobs 

1318 that are in that WMS state. 

1319 """ 

1320 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job)) 

1321 state_counts = dict.fromkeys(WmsStates, 0) 

1322 if "DAG_NodesReady" in job: 

1323 state_counts = { 

1324 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0), 

1325 WmsStates.READY: job.get("DAG_NodesReady", 0), 

1326 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1327 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0), 

1328 WmsStates.FAILED: job.get("DAG_NodesFailed", 0), 

1329 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)} 

1330 total_jobs = job.get("DAG_NodesTotal") 

1331 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs) 

1332 elif "NodesFailed" in job: 

1333 state_counts = { 

1334 WmsStates.UNREADY: job.get("NodesUnready", 0), 

1335 WmsStates.READY: job.get("NodesReady", 0), 

1336 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1337 WmsStates.SUCCEEDED: job.get("NodesDone", 0), 

1338 WmsStates.FAILED: job.get("NodesFailed", 0), 

1339 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)} 

1340 try: 

1341 total_jobs = job.get("NodesTotal") 

1342 except KeyError as ex: 

1343 _LOG.error("Job missing %s. job = %s", str(ex), job) 

1344 raise 

1345 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs) 

1346 else: 

1347 # With Kerberos job auth and the Kerberos bug, a warning here would be 

1348 # printed for every DAG, so log at debug level instead. 

1349 _LOG.debug("Can't get job state counts %s", job["Iwd"]) 

1350 total_jobs = 0 

1351 

1352 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts) 

1353 return total_jobs, state_counts 

1354 

1355 

1356def _htc_status_to_wms_state(job): 

1357 """Convert HTCondor job status to generic wms state. 

1358 

1359 Parameters 

1360 ---------- 

1361 job : `dict` [`str`, `Any`] 

1362 HTCondor job information. 

1363 

1364 Returns 

1365 ------- 

1366 wms_state : `WmsStates` 

1367 The equivalent WmsState to given job's status. 

1368 """ 

1369 wms_state = WmsStates.MISFIT 

1370 if "JobStatus" in job: 

1371 wms_state = _htc_job_status_to_wms_state(job) 

1372 elif "NodeStatus" in job: 

1373 wms_state = _htc_node_status_to_wms_state(job) 

1374 return wms_state 

1375 

1376 

1377def _htc_job_status_to_wms_state(job): 

1378 """Convert HTCondor job status to generic wms state. 

1379 

1380 Parameters 

1381 ---------- 

1382 job : `dict` [`str`, `Any`] 

1383 HTCondor job information. 

1384 

1385 Returns 

1386 ------- 

1387 wms_state : `lsst.ctrl.bps.WmsStates` 

1388 The equivalent WmsState to given job's status. 

1389 """ 

1390 _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], 

1391 type(job["JobStatus"])) 

1392 job_status = int(job["JobStatus"]) 

1393 wms_state = WmsStates.MISFIT 

1394 

1395 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status) 

1396 if job_status == JobStatus.IDLE: 

1397 wms_state = WmsStates.PENDING 

1398 elif job_status == JobStatus.RUNNING: 

1399 wms_state = WmsStates.RUNNING 

1400 elif job_status == JobStatus.REMOVED: 

1401 wms_state = WmsStates.DELETED 

1402 elif job_status == JobStatus.COMPLETED: 

1403 if job.get("ExitBySignal", False) or job.get("ExitCode", 0) or \ 

1404 job.get("ExitSignal", 0) or job.get("DAG_Status", 0) or \ 

1405 job.get("ReturnValue", 0): 

1406 wms_state = WmsStates.FAILED 

1407 else: 

1408 wms_state = WmsStates.SUCCEEDED 

1409 elif job_status == JobStatus.HELD: 

1410 wms_state = WmsStates.HELD 

1411 

1412 return wms_state 

1413 

1414 

1415def _htc_node_status_to_wms_state(job): 

1416 """Convert HTCondor status to generic wms state. 

1417 

1418 Parameters 

1419 ---------- 

1420 job : `dict` [`str`, `Any`] 

1421 HTCondor job information. 

1422 

1423 Returns 

1424 ------- 

1425 wms_state : `lsst.ctrl.bps.WmsStates` 

1426 The equivalent WmsState to given node's status. 

1427 """ 

1428 wms_state = WmsStates.MISFIT 

1429 

1430 status = job["NodeStatus"] 

1431 if status == NodeStatus.NOT_READY: 

1432 wms_state = WmsStates.UNREADY 

1433 elif status == NodeStatus.READY: 

1434 wms_state = WmsStates.READY 

1435 elif status == NodeStatus.PRERUN: 

1436 wms_state = WmsStates.MISFIT 

1437 elif status == NodeStatus.SUBMITTED: 

1438 if job["JobProcsHeld"]: 

1439 wms_state = WmsStates.HELD 

1440 elif job["StatusDetails"] == "not_idle": 

1441 wms_state = WmsStates.RUNNING 

1442 elif job["JobProcsQueued"]: 

1443 wms_state = WmsStates.PENDING 

1444 elif status == NodeStatus.POSTRUN: 

1445 wms_state = WmsStates.MISFIT 

1446 elif status == NodeStatus.DONE: 

1447 wms_state = WmsStates.SUCCEEDED 

1448 elif status == NodeStatus.ERROR: 

1449 # Use the job exit status instead of the post script exit status 

1450 if "DAGMAN error 0" in job["StatusDetails"]: 

1451 wms_state = WmsStates.SUCCEEDED 

1452 else: 

1453 wms_state = WmsStates.FAILED 

1454 

1455 return wms_state 

1456 

1457 

1458def _update_jobs(jobs1, jobs2): 

1459 """Update jobs1 with info in jobs2. 

1460 

1461 (Basically an update for nested dictionaries.) 

1462 

1463 Parameters 

1464 ---------- 

1465 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]] 

1466 HTCondor job information to be updated. 

1467 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]] 

1468 Additional HTCondor job information. 

1469 """ 

1470 for jid, jinfo in jobs2.items(): 

1471 if jid in jobs1: 

1472 jobs1[jid].update(jinfo) 

1473 else: 

1474 jobs1[jid] = jinfo 
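# Minimal sketch of the nested update performed above:
#
#     jobs1 = {"1.0": {"JobStatus": 2}}
#     jobs2 = {"1.0": {"NodeStatus": 5}, "2.0": {"JobStatus": 1}}
#     _update_jobs(jobs1, jobs2)
#     # jobs1 -> {"1.0": {"JobStatus": 2, "NodeStatus": 5}, "2.0": {"JobStatus": 1}}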

1475 

1476 

1477def _wms_id_type(wms_id): 

1478 """Determine the type of the WMS id. 

1479 

1480 Parameters 

1481 ---------- 

1482 wms_id : `str` 

1483 WMS id identifying a job. 

1484 

1485 Returns 

1486 ------- 

1487 id_type : `lsst.ctrl.bps.wms.htcondor.WmsIdType` 

1488 Type of WMS id. 

1489 """ 

1490 try: 

1491 int(float(wms_id)) 

1492 except ValueError: 

1493 wms_path = Path(wms_id) 

1494 if wms_path.exists(): 

1495 id_type = WmsIdType.PATH 

1496 else: 

1497 id_type = WmsIdType.GLOBAL 

1498 except TypeError: 

1499 id_type = WmsIdType.UNKNOWN 

1500 else: 

1501 id_type = WmsIdType.LOCAL 

1502 return id_type 
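# Classification sketch (hypothetical ids):
#
#     _wms_id_type("1234.0")                  # -> WmsIdType.LOCAL
#     _wms_id_type("/path/to/submit/run1")    # -> WmsIdType.PATH (directory exists)
#     _wms_id_type("sched1#1234.0#1643")      # -> WmsIdType.GLOBAL (non-numeric, not a path)
#     _wms_id_type(None)                      # -> WmsIdType.UNKNOWN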

1503 

1504 

1505def _wms_id_to_cluster(wms_id): 

1506 """Convert WMS id to cluster id. 

1507 

1508 Parameters 

1509 ---------- 

1510 wms_id : `int` or `float` or `str` 

1511 HTCondor job id or path. 

1512 

1513 Returns 

1514 ------- 

1515 schedd_ad : `classad.ClassAd` 

1516 ClassAd describing the scheduler managing the job with the given id. 

1517 cluster_id : `int` 

1518 HTCondor cluster id. 

1519 id_type : `lsst.ctrl.bps.wms.htcondor.WmsIdType` 

1520 The type of the provided id. 

1521 """ 

1522 coll = htcondor.Collector() 

1523 

1524 schedd_ad = None 

1525 cluster_id = None 

1526 id_type = _wms_id_type(wms_id) 

1527 if id_type == WmsIdType.LOCAL: 

1528 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1529 cluster_id = int(float(wms_id)) 

1530 elif id_type == WmsIdType.GLOBAL: 

1531 constraint = f'GlobalJobId == "{wms_id}"' 

1532 schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)} 

1533 schedds = [htcondor.Schedd(ad) for ad in schedd_ads.values()] 

1534 queries = [schedd.xquery(requirements=constraint, projection=["ClusterId"]) for schedd in schedds] 

1535 results = {query.tag(): dict(ads[0]) for query in htcondor.poll(queries) 

1536 if (ads := query.nextAdsNonBlocking())} 

1537 if results: 

1538 schedd_name = next(iter(results)) 

1539 schedd_ad = schedd_ads[schedd_name] 

1540 cluster_id = results[schedd_name]["ClusterId"] 

1541 elif id_type == WmsIdType.PATH: 

1542 try: 

1543 job_info = read_dag_info(wms_id) 

1544 except (FileNotFoundError, PermissionError, IOError): 

1545 pass 

1546 else: 

1547 schedd_name = next(iter(job_info)) 

1548 job_id = next(iter(job_info[schedd_name])) 

1549 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name) 

1550 cluster_id = int(float(job_id)) 

1551 else: 

1552 pass 

1553 return schedd_ad, cluster_id, id_type 

1554 

1555 
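A hedged usage sketch of _wms_id_to_cluster; it needs a reachable HTCondor collector and Scheduler, and the id value is purely illustrative:

    # Requires a working HTCondor pool; "1234.0" is a made-up local id.
    schedd_ad, cluster_id, id_type = _wms_id_to_cluster("1234.0")
    print(id_type.name, cluster_id, schedd_ad["Name"] if schedd_ad else None)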

1556def _create_periodic_release_expr(memory, multiplier, limit): 

1557 """Construct an HTCondorAd expression for releasing held jobs. 

1558 

1559 The expression instructs HTCondor to release any job that was put on hold 

1560 for exceeding its memory requirements back to the job queue, provided it 

1561 satisfies all of the conditions below: 

1562 

1563 * number of run attempts did not reach allowable number of retries, 

1564 * the memory requirements in the last failed run attempt did not reach 

1565 the specified memory limit. 

1566 

1567 Parameters 

1568 ---------- 

1569 memory : `int` 

1570 Requested memory in MB. 

1571 multiplier : `float` 

1572 Memory growth rate between retries. 

1573 limit : `int` 

1574 Memory limit. 

1575 

1576 Returns 

1577 ------- 

1578 expr : `str` 

1579 A string representing an HTCondor ClassAd expression for releasing jobs 

1580 which have been held due to exceeding the memory requirements. 

1581 """ 

1582 is_retry_allowed = "NumJobStarts <= JobMaxRetries" 

1583 was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}" 

1584 

1585 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1586 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1587 # The special comparison operators ensure that all comparisons below will 

1588 # evaluate to FALSE in this case. 

1589 # 

1590 # Note: 

1591 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1592 # the entire expression should evaluate to FALSE when the job is not HELD. 

1593 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1594 # but better safe than sorry. 

1595 was_mem_exceeded = "JobStatus == 5 " \ 

1596 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " \ 

1597 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1598 

1599 expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}" 

1600 return expr 

1601 

1602 
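For concreteness, here is what the expression built above looks like for a hypothetical job requesting 2048 MB with a memory multiplier of 2 and a 32768 MB limit (one string, wrapped here for readability):

    # _create_periodic_release_expr(2048, 2, 32768) returns the single-line expression:
    # JobStatus == 5
    #   && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
    #       || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
    #   && NumJobStarts <= JobMaxRetries
    #   && min({int(2048 * pow(2, NumJobStarts - 1)), 32768}) < 32768

Presumably this string ends up as the job's periodic_release expression elsewhere in this module.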

1603def _create_periodic_remove_expr(memory, multiplier, limit): 

1604 """Construct an HTCondorAd expression for removing jobs from the queue. 

1605 

1606 The expression instructs HTCondor to remove any job that was put on hold 

1607 for exceeding its memory requirements from the job queue, provided it 

1608 satisfies any of the conditions below: 

1609 

1610 * allowable number of retries was reached, 

1611 * the memory requirements during the last failed run attempt reached 

1612 the specified memory limit. 

1613 

1614 Parameters 

1615 ---------- 

1616 memory : `int` 

1617 Requested memory in MB. 

1618 multiplier : `float` 

1619 Memory growth rate between retries. 

1620 limit : `int` 

1621 Memory limit. 

1622 

1623 Returns 

1624 ------- 

1625 expr : `str` 

1626 A string representing an HTCondor ClassAd expression for removing jobs 

1627 which were run at the maximal allowable memory and still exceeded 

1628 the memory requirements. 

1629 """ 

1630 is_retry_disallowed = "NumJobStarts > JobMaxRetries" 

1631 was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}" 

1632 

1633 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1634 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1635 # The special comparison operators ensure that all comparisons below will 

1636 # evaluate to FALSE in this case. 

1637 # 

1638 # Note: 

1639 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1640 # the entire expression should evaluate to FALSE when the job is not HELD. 

1641 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1642 # but better safe than sorry. 

1643 was_mem_exceeded = "JobStatus == 5 " \ 

1644 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " \ 

1645 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1646 

1647 expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})" 

1648 return expr 

1649 

1650 
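And the matching removal expression for the same hypothetical 2048 MB / multiplier 2 / 32768 MB limit case:

    # _create_periodic_remove_expr(2048, 2, 32768) returns the single-line expression:
    # JobStatus == 5
    #   && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
    #       || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
    #   && (NumJobStarts > JobMaxRetries
    #       || min({int(2048 * pow(2, NumJobStarts - 1)), 32768}) == 32768)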

1651def _create_request_memory_expr(memory, multiplier, limit): 

1652 """Construct an HTCondor ClassAd expression for safe memory scaling. 

1653 

1654 Parameters 

1655 ---------- 

1656 memory : `int` 

1657 Requested memory in MB. 

1658 multiplier : `float` 

1659 Memory growth rate between retries. 

1660 limit : `int` 

1661 Memory limit. 

1662 

1663 Returns 

1664 ------- 

1665 expr : `str` 

1666 A string representing an HTCondor ClassAd expression enabling safe 

1667 memory scaling between job retries. 

1668 """ 

1669 # The check if the job was held due to exceeding memory requirements 

1670 # will be made *after* job was released back to the job queue (is in 

1671 # the IDLE state), hence the need to use `Last*` job ClassAds instead of 

1672 # the ones describing job's current state. 

1673 # 

1674 # Also, 'Last*' job ClassAds attributes are UNDEFINED when a job is 

1675 # initially put in the job queue. The special comparison operators ensure 

1676 # that all comparisons below will evaluate to FALSE in this case. 

1677 was_mem_exceeded = "LastJobStatus =?= 5 " \ 

1678 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " \ 

1679 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)" 

1680 

1681 # If job runs the first time or was held for reasons other than exceeding 

1682 # the memory, set the required memory to the requested value or use 

1683 # the memory value measured by HTCondor (MemoryUsage) depending on 

1684 # whichever is greater. 

1685 expr = f"({was_mem_exceeded}) " \ 

1686 f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) " \ 

1687 f": max({{{memory}, MemoryUsage ?: 0}})" 

1688 return expr 

1689 

1690 
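Continuing the same hypothetical 2048 MB / multiplier 2 / 32768 MB limit example, the resulting expression (presumably wired into the job's request_memory elsewhere in this module) grows the request by the multiplier after each memory-triggered hold, capped at the limit, and otherwise keeps the larger of the requested value and the last measured MemoryUsage:

    # _create_request_memory_expr(2048, 2, 32768) returns the single-line expression:
    # (LastJobStatus =?= 5
    #    && (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0
    #        || LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34))
    #  ? min({int(2048 * pow(2, NumJobStarts)), 32768})
    #  : max({2048, MemoryUsage ?: 0})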

1691def _locate_schedds(locate_all=False): 

1692 """Find out Scheduler daemons in an HTCondor pool. 

1693 

1694 Parameters 

1695 ---------- 

1696 locate_all : `bool`, optional 

1697 If True, all available Schedulers in the HTCondor pool will be located. 

1698 False by default, which means that the search will be limited to 

1699 the Scheduler running on the local host. 

1700 

1701 Returns 

1702 ------- 

1703 schedds : `dict` [`str`, `htcondor.Schedd`] 

1704 A mapping between Scheduler names and Python objects allowing for 

1705 interacting with them. 

1706 """ 

1707 coll = htcondor.Collector() 

1708 

1709 schedd_ads = [] 

1710 if locate_all: 

1711 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

1712 else: 

1713 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

1714 return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

1715 

1716 
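A short usage sketch for the helper above; it requires a reachable HTCondor collector:

    # Requires a reachable HTCondor collector.
    local_only = _locate_schedds()                 # only the Schedd on the local host
    all_schedds = _locate_schedds(locate_all=True)
    for name, schedd in all_schedds.items():
        print(name, schedd)  # each value is an htcondor.Schedd handle for queries/submission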

1717def _gather_site_values(config, compute_site): 

1718 """Gather values specific to given site. 

1719 

1720 Parameters 

1721 ---------- 

1722 config : `lsst.ctrl.bps.BpsConfig` 

1723 BPS configuration that includes necessary submit/runtime 

1724 information. 

1725 compute_site : `str` 

1726 Compute site name. 

1727 

1728 Returns 

1729 ------- 

1730 site_values : `dict` [`str`, `Any`] 

1731 Values specific to the given site. 

1732 """ 

1733 site_values = {"attrs": {}, "profile": {}} 

1734 search_opts = {} 

1735 if compute_site: 

1736 search_opts["curvals"] = {"curr_site": compute_site} 

1737 

1738 # Determine the hard limit for the memory requirement. 

1739 found, limit = config.search('memoryLimit', opt=search_opts) 

1740 if not found: 

1741 search_opts["default"] = DEFAULT_HTC_EXEC_PATT 

1742 _, patt = config.search("executeMachinesPattern", opt=search_opts) 

1743 del search_opts["default"] 

1744 

1745 # To reduce the amount of data, ignore dynamic slots (if any) as, 

1746 # by definition, they cannot have more memory than 

1747 # the partitionable slot they are the part of. 

1748 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)' 

1749 pool_info = condor_status(constraint=constraint) 

1750 try: 

1751 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values()) 

1752 except ValueError: 

1753 _LOG.debug("No execute machine in the pool matches %s", patt) 

1754 if limit: 

1755 config[".bps_defined.memory_limit"] = limit 

1756 

1757 _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False}) 

1758 site_values["memoryLimit"] = limit 

1759 

1760 key = f".site.{compute_site}.profile.condor" 

1761 if key in config: 

1762 for key, val in config[key].items(): 

1763 if key.startswith("+"): 

1764 site_values["attrs"][key[1:]] = val 

1765 else: 

1766 site_values["profile"][key] = val 

1767 

1768 return site_values
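To illustrate the shape of the result, a hypothetical site block and the mapping _gather_site_values would roughly return for it (the site name, attribute names, and values are all made up):

    # Hypothetical BPS config fragment (YAML):
    #
    # site:
    #   example_site:
    #     memoryLimit: 32768
    #     profile:
    #       condor:
    #         +JobCategory: "bps"               # keys starting with '+' become job attributes
    #         requirements: 'OpSys == "LINUX"'  # everything else goes into the profile
    #
    # _gather_site_values(config, "example_site") would then return roughly:
    # {
    #     "attrs": {"JobCategory": "bps"},
    #     "profile": {"requirements": 'OpSys == "LINUX"'},
    #     "bpsUseShared": False,
    #     "memoryLimit": 32768,
    # }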