Coverage for python/lsst/ctrl/bps/htcondor/htcondor_service.py: 7%

743 statements  

coverage.py v7.4.4, created at 2024-03-28 03:11 -0700

1# This file is part of ctrl_bps_htcondor. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <https://www.gnu.org/licenses/>. 

27 

28"""Interface between generic workflow to HTCondor workflow system. 

29""" 

30 

31__all__ = ["HTCondorService", "HTCondorWorkflow"] 

32 

33 

34import logging 

35import os 

36import re 

37from collections import defaultdict 

38from enum import IntEnum, auto 

39from pathlib import Path 

40 

41import htcondor 

42from lsst.ctrl.bps import ( 

43 BaseWmsService, 

44 BaseWmsWorkflow, 

45 GenericWorkflow, 

46 GenericWorkflowJob, 

47 WmsJobReport, 

48 WmsRunReport, 

49 WmsStates, 

50) 

51from lsst.ctrl.bps.bps_utils import chdir, create_count_summary 

52from lsst.utils.timer import time_this 

53from packaging import version 

54 

55from .lssthtc import ( 

56 MISSING_ID, 

57 HTCDag, 

58 HTCJob, 

59 JobStatus, 

60 NodeStatus, 

61 condor_history, 

62 condor_q, 

63 condor_search, 

64 condor_status, 

65 htc_backup_files, 

66 htc_check_dagman_output, 

67 htc_create_submit_from_cmd, 

68 htc_create_submit_from_dag, 

69 htc_create_submit_from_file, 

70 htc_escape, 

71 htc_submit_dag, 

72 htc_version, 

73 pegasus_name_to_label, 

74 read_dag_info, 

75 read_dag_log, 

76 read_dag_status, 

77 read_node_status, 

78 summary_from_dag, 

79 write_dag_info, 

80) 

81 

82 

83class WmsIdType(IntEnum): 

84 """Type of valid WMS ids.""" 

85 

86 UNKNOWN = auto() 

87 """The type of id cannot be determined. 

88 """ 

89 

90 LOCAL = auto() 

91 """The id is HTCondor job's ClusterId (with optional '.ProcId'). 

92 """ 

93 

94 GLOBAL = auto() 

95 """Id is a HTCondor's global job id. 

96 """ 

97 

98 PATH = auto() 

99 """Id is a submission path. 

100 """ 

101 

102 

103DEFAULT_HTC_EXEC_PATT = ".*worker.*" 

104"""Default pattern for searching execute machines in an HTCondor pool. 

105""" 

106 

107_LOG = logging.getLogger(__name__) 

108 

109 

110class HTCondorService(BaseWmsService): 

111 """HTCondor version of WMS service.""" 

112 

113 def prepare(self, config, generic_workflow, out_prefix=None): 

114 """Convert generic workflow to an HTCondor DAG ready for submission. 

115 

116 Parameters 

117 ---------- 

118 config : `lsst.ctrl.bps.BpsConfig` 

119 BPS configuration that includes necessary submit/runtime 

120 information. 

121 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

122 The generic workflow (e.g., has executable name and arguments). 

123 out_prefix : `str` 

124 The root directory into which all WMS-specific files are written. 

125 

126 Returns 

127 ------- 

128 workflow : `lsst.ctrl.bps.htcondor.HTCondorWorkflow`

129 HTCondor workflow ready to be run. 

130 """ 

131 _LOG.debug("out_prefix = '%s'", out_prefix) 

132 with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed HTCondor workflow creation"): 

133 workflow = HTCondorWorkflow.from_generic_workflow( 

134 config, 

135 generic_workflow, 

136 out_prefix, 

137 f"{self.__class__.__module__}.{self.__class__.__name__}", 

138 ) 

139 

140 with time_this( 

141 log=_LOG, level=logging.INFO, prefix=None, msg="Completed writing out HTCondor workflow" 

142 ): 

143 workflow.write(out_prefix) 

144 return workflow 

145 

146 def submit(self, workflow): 

147 """Submit a single HTCondor workflow. 

148 

149 Parameters 

150 ---------- 

151 workflow : `lsst.ctrl.bps.BaseWorkflow` 

152 A single HTCondor workflow to submit. run_id is updated after 

153 successful submission to WMS. 

154 """ 

155 dag = workflow.dag 

156 

157 ver = version.parse(htc_version()) 

158 if ver >= version.parse("8.9.3"): 

159 sub = htc_create_submit_from_dag(dag.graph["dag_filename"], {}) 

160 else: 

161 sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], {}) 

162 

163 # For workflow portability, internal paths are all relative. Hence 

164 # the DAG needs to be submitted to HTCondor from inside the submit 

165 # directory. 

166 with chdir(workflow.submit_path): 

167 _LOG.info("Submitting from directory: %s", os.getcwd()) 

168 schedd_dag_info = htc_submit_dag(sub) 

169 if schedd_dag_info: 

170 write_dag_info(f"{dag.name}.info.json", schedd_dag_info) 

171 

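# A successful submission yields exactly one schedd entry holding exactly

# one DAGMan job ad, so popitem() safely extracts it at each level.
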
172 _, dag_info = schedd_dag_info.popitem() 

173 _, dag_ad = dag_info.popitem() 

174 

175 dag.run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}" 

176 workflow.run_id = dag.run_id 

177 else: 

178 raise RuntimeError("Submission failed: unable to retrieve DAGMan job information") 

179 

180 def restart(self, wms_workflow_id): 

181 """Restart a failed DAGMan workflow. 

182 

183 Parameters 

184 ---------- 

185 wms_workflow_id : `str` 

186 Id of the workflow to restart. Can be a local or global HTCondor job id or the run's submit directory.

187 

188 Returns 

189 ------- 

190 run_id : `str` 

191 HTCondor id of the restarted DAGMan job. If restart failed, it will 

192 be set to None. 

193 run_name : `str` 

194 Name of the restarted workflow. If restart failed, it will be set 

195 to None. 

196 message : `str` 

197 A message describing any issues encountered during the restart. 

198 If there were no issues, an empty string is returned. 

199 """ 

200 wms_path, id_type = _wms_id_to_dir(wms_workflow_id) 

201 if wms_path is None: 

202 return ( 

203 None, 

204 None, 

205 ( 

206 f"workflow with run id '{wms_workflow_id}' not found. " 

207 f"Hint: use run's submit directory as the id instead" 

208 ), 

209 ) 

210 

211 if id_type in {WmsIdType.GLOBAL, WmsIdType.LOCAL}: 

212 if not wms_path.is_dir(): 

213 return None, None, f"submit directory '{wms_path}' for run id '{wms_workflow_id}' not found." 

214 

215 _LOG.info("Restarting workflow from directory '%s'", wms_path) 

216 rescue_dags = list(wms_path.glob("*.dag.rescue*")) 

217 if not rescue_dags: 

218 return None, None, f"HTCondor rescue DAG(s) not found in '{wms_path}'" 

219 

220 _LOG.info("Verifying that the workflow is not already in the job queue") 

221 schedd_dag_info = condor_q(constraint=f'regexp("dagman$", Cmd) && Iwd == "{wms_path}"') 

222 if schedd_dag_info: 

223 _, dag_info = schedd_dag_info.popitem() 

224 _, dag_ad = dag_info.popitem() 

225 id_ = dag_ad["GlobalJobId"] 

226 return None, None, f"Workflow already in the job queue (global job id: '{id_}')" 

227 

228 _LOG.info("Checking execution status of the workflow") 

229 warn = False 

230 dag_ad = read_dag_status(str(wms_path)) 

231 if dag_ad: 

232 nodes_total = dag_ad.get("NodesTotal", 0) 

233 if nodes_total != 0: 

234 nodes_done = dag_ad.get("NodesDone", 0) 

235 if nodes_total == nodes_done: 

236 return None, None, "All jobs in the workflow finished successfully" 

237 else: 

238 warn = True 

239 else: 

240 warn = True 

241 if warn: 

242 _LOG.warning( 

243 "Cannot determine the execution status of the workflow, continuing with restart regardless" 

244 ) 

245 

246 _LOG.info("Backing up select HTCondor files from previous run attempt") 

247 htc_backup_files(wms_path, subdir="backups") 

248 

249 # For workflow portability, internal paths are all relative. Hence 

250 # the DAG needs to be resubmitted to HTCondor from inside the submit 

251 # directory. 

252 _LOG.info("Adding workflow to the job queue") 

253 run_id, run_name, message = None, None, "" 

254 with chdir(wms_path): 

255 try: 

256 dag_path = next(wms_path.glob("*.dag.condor.sub")) 

257 except StopIteration: 

258 message = f"DAGMan submit description file not found in '{wms_path}'" 

259 else: 

260 sub = htc_create_submit_from_file(dag_path.name) 

261 schedd_dag_info = htc_submit_dag(sub) 

262 

263 # Save select information about the DAGMan job to a file. Use 

264 # the run name (available in the ClassAd) as the filename. 

265 if schedd_dag_info: 

266 dag_info = next(iter(schedd_dag_info.values())) 

267 dag_ad = next(iter(dag_info.values())) 

268 write_dag_info(f"{dag_ad['bps_run']}.info.json", schedd_dag_info) 

269 run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}" 

270 run_name = dag_ad["bps_run"] 

271 else: 

272 message = "DAGMan job information unavailable" 

273 

274 return run_id, run_name, message 

275 

276 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False): 

277 """Query WMS for list of submitted WMS workflows/jobs. 

278 

279 This should be a quick lookup function to create list of jobs for 

280 other functions. 

281 

282 Parameters 

283 ---------- 

284 wms_id : `int` or `str`, optional 

285 Id or path that can be used by WMS service to look up job. 

286 user : `str`, optional 

287 User whose submitted jobs should be listed. 

288 require_bps : `bool`, optional 

289 Whether to require jobs returned in list to be bps-submitted jobs. 

290 pass_thru : `str`, optional 

291 Information to pass through to WMS. 

292 is_global : `bool`, optional 

293 If set, all job queues (and their histories) will be queried for 

294 job information. Defaults to False which means that only the local 

295 job queue will be queried. 

296 

297 Returns 

298 ------- 

299 job_ids : `list` [`Any`] 

300 Only job ids to be used by cancel and other functions. Typically 

301 this means top-level jobs (i.e., not child jobs).

302 """ 

303 _LOG.debug( 

304 "list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s", 

305 wms_id, 

306 user, 

307 require_bps, 

308 pass_thru, 

309 is_global, 

310 ) 

311 

312 # Determine which Schedds will be queried for job information. 

313 coll = htcondor.Collector() 

314 

315 schedd_ads = [] 

316 if is_global: 

317 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

318 else: 

319 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

320 

321 # Construct appropriate constraint expression using provided arguments. 

322 constraint = "False" 

323 if wms_id is None: 

324 if user is not None: 

325 constraint = f'(Owner == "{user}")' 

326 else: 

327 schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id) 

328 if cluster_id is not None: 

329 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})" 

330 

331 # If provided id is either a submission path or a global id, 

332 # make sure the right Schedd will be queried regardless of 

333 # 'is_global' value. 

334 if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}: 

335 schedd_ads = [schedd_ad] 

336 if require_bps: 

337 constraint += ' && (bps_isjob == "True")' 

338 if pass_thru: 

339 if "-forcex" in pass_thru: 

340 pass_thru_2 = pass_thru.replace("-forcex", "") 

341 if pass_thru_2 and not pass_thru_2.isspace(): 

342 constraint += f" && ({pass_thru_2})" 

343 else: 

344 constraint += f" && ({pass_thru})" 

345 

346 # Create a list of scheduler daemons which need to be queried. 

347 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

348 

349 _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds)) 

350 results = condor_q(constraint=constraint, schedds=schedds) 

351 

352 # Prune child jobs where DAG job is in queue (i.e., aren't orphans). 

353 job_ids = [] 

354 for schedd_name, job_info in results.items(): 

355 for job_id, job_ad in job_info.items(): 

356 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None")) 

357 if "DAGManJobId" not in job_ad: 

358 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

359 else: 

360 _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0") 

361 _LOG.debug("\tin jobs.keys() = %s", job_info.keys()) 

362 if f"{job_ad['DAGManJobId']}.0" not in job_info: # orphaned job 

363 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

364 

365 _LOG.debug("job_ids = %s", job_ids) 

366 return job_ids 

367 

368 def report( 

369 self, 

370 wms_workflow_id=None, 

371 user=None, 

372 hist=0, 

373 pass_thru=None, 

374 is_global=False, 

375 return_exit_codes=False, 

376 ): 

377 """Return run information based upon given constraints. 

378 

379 Parameters 

380 ---------- 

381 wms_workflow_id : `str`, optional 

382 Limit to specific run based on id. 

383 user : `str`, optional 

384 Limit results to runs for this user. 

385 hist : `float`, optional 

386 Limit history search to this many days. Defaults to 0. 

387 pass_thru : `str`, optional 

388 Constraints to pass through to HTCondor. 

389 is_global : `bool`, optional 

390 If set, all job queues (and their histories) will be queried for 

391 job information. Defaults to False which means that only the local 

392 job queue will be queried. 

393 return_exit_codes : `bool`, optional 

394 If set, return exit codes related to jobs with a 

395 non-success status. Defaults to False, which means that only 

396 the summary state is returned. 

397 

398 Only applicable in the context of a WMS with associated 

399 handlers to return exit codes from jobs. 

400 

401 Returns 

402 ------- 

403 runs : `list` [`lsst.ctrl.bps.WmsRunReport`] 

404 Information about runs from given job information. 

405 message : `str` 

406 Extra message for report command to print. This could be pointers 

407 to documentation or to WMS specific commands. 

408 """ 

409 if wms_workflow_id: 

410 id_type = _wms_id_type(wms_workflow_id) 

411 if id_type == WmsIdType.LOCAL: 

412 schedulers = _locate_schedds(locate_all=is_global) 

413 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

414 elif id_type == WmsIdType.GLOBAL: 

415 schedulers = _locate_schedds(locate_all=True) 

416 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

417 elif id_type == WmsIdType.PATH: 

418 run_reports, message = _report_from_path(wms_workflow_id) 

419 else: 

420 run_reports, message = {}, "Invalid job id" 

421 else: 

422 schedulers = _locate_schedds(locate_all=is_global) 

423 run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers) 

424 _LOG.debug("report: %s, %s", run_reports, message) 

425 

426 return list(run_reports.values()), message 

427 

428 def cancel(self, wms_id, pass_thru=None): 

429 """Cancel submitted workflows/jobs. 

430 

431 Parameters 

432 ---------- 

433 wms_id : `str` 

434 Id or path of job that should be canceled. 

435 pass_thru : `str`, optional 

436 Information to pass through to WMS. 

437 

438 Returns 

439 ------- 

440 deleted : `bool` 

441 Whether the deletion was successful. Currently, if there is any doubt

442 or any individual job was not deleted, False is returned.

443 message : `str` 

444 Any message from WMS (e.g., error details). 

445 """ 

446 _LOG.debug("Canceling wms_id = %s", wms_id) 

447 

448 schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id) 

449 

450 if cluster_id is None: 

451 deleted = False 

452 message = "invalid id" 

453 else: 

454 _LOG.debug( 

455 "Canceling job managed by schedd_name = %s with cluster_id = %s", 

456 schedd_ad["Name"],

457 cluster_id,

458 ) 

459 schedd = htcondor.Schedd(schedd_ad) 

460 

461 constraint = f"ClusterId == {cluster_id}" 

462 if pass_thru is not None and "-forcex" in pass_thru: 

463 pass_thru_2 = pass_thru.replace("-forcex", "") 

464 if pass_thru_2 and not pass_thru_2.isspace(): 

465 constraint += f"&& ({pass_thru_2})" 

466 _LOG.debug("JobAction.RemoveX constraint = %s", constraint) 

467 results = schedd.act(htcondor.JobAction.RemoveX, constraint) 

468 else: 

469 if pass_thru: 

470 constraint += f"&& ({pass_thru})" 

471 _LOG.debug("JobAction.Remove constraint = %s", constraint) 

472 results = schedd.act(htcondor.JobAction.Remove, constraint) 

473 _LOG.debug("Remove results: %s", results) 

474 

475 if results["TotalSuccess"] > 0 and results["TotalError"] == 0: 

476 deleted = True 

477 message = "" 

478 else: 

479 deleted = False 

480 if results["TotalSuccess"] == 0 and results["TotalError"] == 0: 

481 message = "no such bps job in batch queue" 

482 else: 

483 message = f"unknown problems deleting: {results}" 

484 

485 _LOG.debug("deleted: %s; message = %s", deleted, message) 

486 return deleted, message 

487 

488 

489class HTCondorWorkflow(BaseWmsWorkflow): 

490 """Single HTCondor workflow. 

491 

492 Parameters 

493 ---------- 

494 name : `str` 

495 Unique name for Workflow used when naming files. 

496 config : `lsst.ctrl.bps.BpsConfig` 

497 BPS configuration that includes necessary submit/runtime information. 

498 """ 

499 

500 def __init__(self, name, config=None): 

501 super().__init__(name, config) 

502 self.dag = None 

503 

504 @classmethod 

505 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class): 

506 # Docstring inherited 

507 htc_workflow = cls(generic_workflow.name, config) 

508 htc_workflow.dag = HTCDag(name=generic_workflow.name) 

509 

510 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs) 

511 htc_workflow.dag.add_attribs(generic_workflow.run_attrs) 

512 htc_workflow.dag.add_attribs( 

513 { 

514 "bps_wms_service": service_class, 

515 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}", 

516 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts), 

517 "bps_job_summary": create_count_summary(generic_workflow.job_counts), 

518 } 

519 ) 

520 

521 _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""}) 

522 if isinstance(tmp_template, str): 

523 subdir_template = defaultdict(lambda: tmp_template) 

524 else: 

525 subdir_template = tmp_template 

526 

527 # Create all DAG jobs 

528 site_values = {} # cache compute site specific values to reduce config lookups 

529 for job_name in generic_workflow: 

530 gwjob = generic_workflow.get_job(job_name) 

531 if gwjob.compute_site not in site_values: 

532 site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site) 

533 htc_job = _create_job( 

534 subdir_template[gwjob.label], 

535 site_values[gwjob.compute_site], 

536 generic_workflow, 

537 gwjob, 

538 out_prefix, 

539 ) 

540 htc_workflow.dag.add_job(htc_job) 

541 

542 # Add job dependencies to the DAG 

543 for job_name in generic_workflow: 

544 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name)) 

545 

546 # If final job exists in generic workflow, create DAG final job 

547 final = generic_workflow.get_final() 

548 if final and isinstance(final, GenericWorkflowJob): 

549 if final.compute_site and final.compute_site not in site_values: 

550 site_values[final.compute_site] = _gather_site_values(config, final.compute_site) 

551 final_htjob = _create_job( 

552 subdir_template[final.label], 

553 site_values[final.compute_site], 

554 generic_workflow, 

555 final, 

556 out_prefix, 

557 ) 

558 if "post" not in final_htjob.dagcmds: 

559 final_htjob.dagcmds["post"] = ( 

560 f"{os.path.dirname(__file__)}/final_post.sh {final.name} $DAG_STATUS $RETURN" 

561 ) 

562 htc_workflow.dag.add_final_job(final_htjob) 

563 elif final and isinstance(final, GenericWorkflow): 

564 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job") 

565 elif final: 

566 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

567 

568 return htc_workflow 

569 

570 def write(self, out_prefix): 

571 """Output HTCondor DAGMan files needed for workflow submission. 

572 

573 Parameters 

574 ---------- 

575 out_prefix : `str` 

576 Directory prefix for HTCondor files. 

577 """ 

578 self.submit_path = out_prefix 

579 os.makedirs(out_prefix, exist_ok=True) 

580 

581 # Write down the workflow in HTCondor format. 

582 self.dag.write(out_prefix, "jobs/{self.label}") 

583 

584 

585def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix): 

586 """Convert GenericWorkflow job nodes to DAG jobs. 

587 

588 Parameters 

589 ---------- 

590 subdir_template : `str` 

591 Template for making subdirs. 

592 site_values : `dict` 

593 Site specific values 

594 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

595 Generic workflow that is being converted. 

596 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

597 The generic job to convert to a HTCondor job. 

598 out_prefix : `str` 

599 Directory prefix for HTCondor files. 

600 

601 Returns 

602 ------- 

603 htc_job : `lsst.ctrl.bps.htcondor.HTCJob`

604 The HTCondor job equivalent to the given generic job. 

605 """ 

606 htc_job = HTCJob(gwjob.name, label=gwjob.label) 

607 

608 curvals = defaultdict(str) 

609 curvals["label"] = gwjob.label 

610 if gwjob.tags: 

611 curvals.update(gwjob.tags) 

612 
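
# Because curvals is a defaultdict(str), any template key without a value

# expands to an empty string instead of raising a KeyError.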

613 subdir = subdir_template.format_map(curvals) 

614 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub" 

615 

616 htc_job_cmds = { 

617 "universe": "vanilla", 

618 "should_transfer_files": "YES", 

619 "when_to_transfer_output": "ON_EXIT_OR_EVICT", 

620 "transfer_output_files": '""', # Set to empty string to disable 

621 "transfer_executable": "False", 

622 "getenv": "True", 

623 # Exceeding memory sometimes triggers a SIGBUS or SIGSEGV error. Tell

624 # HTCondor to put on hold any jobs which exited by a signal.

625 "on_exit_hold": "ExitBySignal == true", 

626 "on_exit_hold_reason": 'strcat("Job raised a signal ", string(ExitSignal), ". ", ' 

627 '"Handling signal as if job has gone over memory limit.")', 

628 "on_exit_hold_subcode": "34", 

629 } 

630 

631 htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob)) 

632 

633 # job stdout, stderr, htcondor user log. 

634 for key in ("output", "error", "log"): 

635 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}") 

636 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key]) 

637 

638 htc_job_cmds.update( 

639 _handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], out_prefix) 

640 ) 

641 

642 # Add the job cmds dict to the job object. 

643 htc_job.add_job_cmds(htc_job_cmds) 

644 

645 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob)) 

646 

647 # Add job attributes to job. 

648 _LOG.debug("gwjob.attrs = %s", gwjob.attrs) 

649 htc_job.add_job_attrs(gwjob.attrs) 

650 htc_job.add_job_attrs(site_values["attrs"]) 

651 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)}) 

652 htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label}) 

653 

654 return htc_job 

655 

656 

657def _translate_job_cmds(cached_vals, generic_workflow, gwjob): 

658 """Translate the job data that are one to one mapping 

659 

660 Parameters 

661 ---------- 

662 cached_vals : `dict` [`str`, `Any`] 

663 Config values common to jobs with the same label.

664 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

665 Generic workflow that contains the job being converted.

666 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

667 Generic workflow job to be converted. 

668 

669 Returns 

670 ------- 

671 htc_job_commands : `dict` [`str`, `Any`] 

672 Contains commands which can appear in the HTCondor submit description 

673 file. 

674 """ 

675 # Values in the job data that are just name mappings.

676 job_translation = { 

677 "mail_to": "notify_user", 

678 "when_to_mail": "notification", 

679 "request_cpus": "request_cpus", 

680 "priority": "priority", 

681 "category": "category", 

682 "accounting_group": "accounting_group", 

683 "accounting_user": "accounting_group_user", 

684 } 

685 

686 jobcmds = {} 

687 for gwkey, htckey in job_translation.items(): 

688 jobcmds[htckey] = getattr(gwjob, gwkey, None) 

689 

690 # If accounting info was not set explicitly, use site settings if any. 

691 if not gwjob.accounting_group: 

692 jobcmds["accounting_group"] = cached_vals.get("accountingGroup") 

693 if not gwjob.accounting_user: 

694 jobcmds["accounting_group_user"] = cached_vals.get("accountingUser") 

695 

696 # job commands that need modification 

697 if gwjob.number_of_retries: 

698 jobcmds["max_retries"] = f"{gwjob.number_of_retries}" 

699 

700 if gwjob.retry_unless_exit: 

701 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}" 

702 

703 if gwjob.request_disk: 

704 jobcmds["request_disk"] = f"{gwjob.request_disk}MB" 

705 

706 if gwjob.request_memory: 

707 jobcmds["request_memory"] = f"{gwjob.request_memory}" 

708 

709 if gwjob.memory_multiplier: 

710 # Do not use try-except! At the moment, BpsConfig returns an empty 

711 # string if it does not contain the key. 

712 memory_limit = cached_vals["memoryLimit"] 

713 if not memory_limit: 

714 raise RuntimeError( 

715 "Memory autoscaling enabled, but automatic detection of the memory limit " 

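# DAGMan writes a *.dag.rescue* file when a run does not finish successfully;

# without one there is nothing to restart.
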
716 "failed; setting it explicitly with 'memoryLimit' or changing worker node " 

717 "search pattern 'executeMachinesPattern' might help." 

718 ) 

719 

720 # Set maximal amount of memory job can ask for. 

721 # 

722 # The check below assumes that 'memory_limit' was set to a value which 

723 # realistically reflects actual physical limitations of a given compute 

724 # resource. 

725 memory_max = memory_limit 

726 if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit: 

727 memory_max = gwjob.request_memory_max 

728 

729 # Make the job ask for more memory each time it fails due to

730 # insufficient memory requirements.

731 jobcmds["request_memory"] = _create_request_memory_expr( 

732 gwjob.request_memory, gwjob.memory_multiplier, memory_max 

733 ) 

734 

735 # Periodically release jobs which are being held due to exceeding 

736 # memory. Stop doing that (by removing the job from the HTCondor queue) 

737 # after the maximal number of retries has been reached or the job was 

738 # already run at maximal allowed memory. 

739 jobcmds["periodic_release"] = _create_periodic_release_expr( 

740 gwjob.request_memory, gwjob.memory_multiplier, memory_max 

741 ) 

742 jobcmds["periodic_remove"] = _create_periodic_remove_expr( 

743 gwjob.request_memory, gwjob.memory_multiplier, memory_max 

744 ) 

745 

746 # Assume concurrency_limit implemented using HTCondor concurrency limits. 

747 # May need to move to special site-specific implementation if sites use 

748 # other mechanisms. 

749 if gwjob.concurrency_limit: 

750 jobcmds["concurrency_limit"] = gwjob.concurrency_limit 

751 

752 # Handle command line 

753 if gwjob.executable.transfer_executable: 

754 jobcmds["transfer_executable"] = "True" 

755 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri) 

756 else: 

757 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri) 

758 

759 if gwjob.arguments: 

760 arguments = gwjob.arguments 

761 arguments = _replace_cmd_vars(arguments, gwjob) 

762 arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob) 

763 arguments = _fix_env_var_syntax(arguments) 

764 jobcmds["arguments"] = arguments 

765 

766 # Add extra "pass-thru" job commands 

767 if gwjob.profile: 

768 for key, val in gwjob.profile.items(): 

769 jobcmds[key] = htc_escape(val) 

770 for key, val in cached_vals["profile"].items(): 

771 jobcmds[key] = htc_escape(val) 

772 

773 return jobcmds 

774 

775 

776def _translate_dag_cmds(gwjob): 

777 """Translate job values into DAGMan commands. 

778 

779 Parameters 

780 ---------- 

781 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

782 Job containing values to be translated. 

783 

784 Returns 

785 ------- 

786 dagcmds : `dict` [`str`, `Any`] 

787 DAGMan commands for the job. 

788 """ 

789 # Values in the dag script that just are name mappings. 

790 dag_translation = {"abort_on_value": "abort_dag_on", "abort_return_value": "abort_exit"} 

791 

792 dagcmds = {} 

793 for gwkey, htckey in dag_translation.items(): 

794 dagcmds[htckey] = getattr(gwjob, gwkey, None) 

795 

796 # Still to be coded: vars "pre_cmdline", "post_cmdline" 

797 return dagcmds 

798 

799 

800def _fix_env_var_syntax(oldstr): 

801 """Change ENV place holders to HTCondor Env var syntax. 

802 

803 Parameters 

804 ---------- 

805 oldstr : `str` 

806 String in which environment variable syntax is to be fixed. 

807 

808 Returns 

809 ------- 

810 newstr : `str` 

811 Given string with environment variable syntax fixed. 

812 """ 

813 newstr = oldstr 

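# e.g., "<ENV:HOME>" becomes "$ENV(HOME)"
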
814 for key in re.findall(r"<ENV:([^>]+)>", oldstr): 

815 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})") 

816 return newstr 

817 

818 

819def _replace_file_vars(use_shared, arguments, workflow, gwjob): 

820 """Replace file placeholders in command line arguments with correct 

821 physical file names. 

822 

823 Parameters 

824 ---------- 

825 use_shared : `bool` 

826 Whether HTCondor can assume shared filesystem. 

827 arguments : `str` 

828 Arguments string in which to replace file placeholders. 

829 workflow : `lsst.ctrl.bps.GenericWorkflow` 

830 Generic workflow that contains file information. 

831 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

832 The job corresponding to the arguments. 

833 

834 Returns 

835 ------- 

836 arguments : `str` 

837 Given arguments string with file placeholders replaced. 

838 """ 

839 # Replace input file placeholders with paths. 

840 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False): 

841 if not gwfile.wms_transfer: 

842 # Must assume full URI if in command line and told WMS is not 

843 # responsible for transferring file. 

844 uri = gwfile.src_uri 

845 elif use_shared: 

846 if gwfile.job_shared: 

847 # Have shared filesystems and jobs can share file. 

848 uri = gwfile.src_uri 

849 else: 

850 # Taking advantage of inside knowledge. Not future-proof. 

851 # Temporary fix until there is a job wrapper that pulls

852 # files within the job.

853 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml": 

854 uri = "butler.yaml" 

855 else: 

856 uri = os.path.basename(gwfile.src_uri) 

857 else: # Using push transfer 

858 uri = os.path.basename(gwfile.src_uri) 

859 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

860 

861 # Replace output file placeholders with paths. 

862 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False): 

863 if not gwfile.wms_transfer: 

864 # Must assume full URI if in command line and told WMS is not 

865 # responsible for transferring file. 

866 uri = gwfile.src_uri 

867 elif use_shared: 

868 if gwfile.job_shared: 

869 # Have shared filesystems and jobs can share file. 

870 uri = gwfile.src_uri 

871 else: 

872 uri = os.path.basename(gwfile.src_uri) 

873 else: # Using push transfer 

874 uri = os.path.basename(gwfile.src_uri) 

875 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

876 return arguments 

877 

878 

879def _replace_cmd_vars(arguments, gwjob): 

880 """Replace format-style placeholders in arguments. 

881 

882 Parameters 

883 ---------- 

884 arguments : `str` 

885 Arguments string in which to replace placeholders. 

886 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

887 Job containing values to be used to replace placeholders 

888 (in particular gwjob.cmdvals). 

889 

890 Returns 

891 ------- 

892 arguments : `str` 

893 Given arguments string with placeholders replaced. 

894 """ 

895 try: 

896 arguments = arguments.format(**gwjob.cmdvals) 

897 except (KeyError, TypeError): # TypeError in case None instead of {} 

898 _LOG.error( 

899 "Could not replace command variables:\narguments: %s\ncmdvals: %s", arguments, gwjob.cmdvals 

900 ) 

901 raise 

902 return arguments 

903 

904 

905def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str): 

906 """Add job input files from generic workflow to job. 

907 

908 Parameters 

909 ---------- 

910 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

911 The generic workflow (e.g., has executable name and arguments). 

912 job_name : `str` 

913 Unique name for the job. 

914 use_shared : `bool` 

915 Whether job has access to files via shared filesystem. 

916 out_prefix : `str` 

917 The root directory into which all WMS-specific files are written. 

918 

919 Returns 

920 ------- 

921 htc_commands : `dict` [`str`, `str`] 

922 HTCondor commands for the job submission script. 

923 """ 

924 htc_commands = {} 

925 inputs = [] 

926 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True): 

927 _LOG.debug("src_uri=%s", gwf_file.src_uri) 

928 

929 uri = Path(gwf_file.src_uri) 

930 

931 # Note if use_shared and job_shared, don't need to transfer file. 

932 

933 if not use_shared: # Copy file using push to job 

934 inputs.append(str(uri.relative_to(out_prefix))) 

935 elif not gwf_file.job_shared: # Jobs require own copy 

936 # if using shared filesystem, but still need copy in job. Use 

937 # HTCondor's curl plugin for a local copy. 

938 

939 # Execution butler is represented as a directory which the 

940 # curl plugin does not handle. Taking advantage of inside 

941 # knowledge as a temporary fix until there is a job wrapper that

942 # pulls files within the job.

943 if gwf_file.name == "butlerConfig": 

944 # The execution butler directory doesn't normally exist until 

945 # the submit phase so checking for suffix instead of using 

946 # is_dir(). If other non-yaml file exists they would have a 

947 # different gwf_file.name. 

948 if uri.suffix == ".yaml": # Single file, so just copy. 

949 inputs.append(f"file://{uri}") 

950 else: 

951 inputs.append(f"file://{uri / 'butler.yaml'}") 

952 inputs.append(f"file://{uri / 'gen3.sqlite3'}") 

953 elif uri.is_dir(): 

954 raise RuntimeError( 

955 f"HTCondor plugin cannot transfer directories locally within job {gwf_file.src_uri}" 

956 ) 

957 else: 

958 inputs.append(f"file://{uri}") 

959 

960 if inputs: 

961 htc_commands["transfer_input_files"] = ",".join(inputs) 

962 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"]) 

963 return htc_commands 

964 

965 

966def _report_from_path(wms_path): 

967 """Gather run information from a given run directory. 

968 

969 Parameters 

970 ---------- 

971 wms_path : `str` 

972 The directory containing the submit side files (e.g., HTCondor files). 

973 

974 Returns 

975 ------- 

976 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

977 Run information for the detailed report. The key is the HTCondor id 

978 and the value is a collection of report information for that run. 

979 message : `str` 

980 Message to be printed with the summary report. 

981 """ 

982 wms_workflow_id, jobs, message = _get_info_from_path(wms_path) 

983 if wms_workflow_id == MISSING_ID: 

984 run_reports = {} 

985 else: 

986 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs) 

987 return run_reports, message 

988 

989 

990def _report_from_id(wms_workflow_id, hist, schedds=None): 

991 """Gather run information using workflow id. 

992 

993 Parameters 

994 ---------- 

995 wms_workflow_id : `str` 

996 Limit to specific run based on id. 

997 hist : `float` 

998 Limit history search to this many days. 

999 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional 

1000 HTCondor schedulers to query for job information. If None

1001 (default), all queries will be run against the local scheduler only. 

1002 

1003 Returns 

1004 ------- 

1005 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1006 Run information for the detailed report. The key is the HTCondor id 

1007 and the value is a collection of report information for that run. 

1008 message : `str` 

1009 Message to be printed with the summary report. 

1010 """ 

1011 messages = [] 

1012 

1013 # Collect information about the job by querying HTCondor schedd and 

1014 # HTCondor history. 

1015 schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds) 

1016 if len(schedd_dag_info) == 1: 

1017 # Extract the DAG info without altering the results of the query. 

1018 schedd_name = next(iter(schedd_dag_info)) 

1019 dag_id = next(iter(schedd_dag_info[schedd_name])) 

1020 dag_ad = schedd_dag_info[schedd_name][dag_id] 

1021 

1022 # If the provided workflow id does not correspond to the one extracted 

1023 # from the DAGMan log file in the submit directory, rerun the query 

1024 # with the id found in the file. 

1025 # 

1026 # This is to cover the situation in which the user provided the old job 

1027 # id of a restarted run. 

1028 try: 

1029 path_dag_id, path_dag_ad = read_dag_log(dag_ad["Iwd"]) 

1030 except FileNotFoundError as exc: 

1031 # At the moment missing DAGMan log is pretty much a fatal error. 

1032 # So empty the DAG info to finish early (see the if statement 

1033 # below). 

1034 schedd_dag_info.clear()

1035 messages.append(f"Cannot create the report for '{dag_id}': {exc}") 

1036 else: 

1037 if path_dag_id != dag_id: 

1038 schedd_dag_info = _get_info_from_schedd(path_dag_id, hist, schedds) 

1039 messages.append( 

1040 f"WARNING: Found newer workflow executions in same submit directory as id '{dag_id}'. " 

1041 "This normally occurs when a run is restarted. The report shown is for the most " 

1042 f"recent status with run id '{path_dag_id}'" 

1043 ) 

1044 

1045 if len(schedd_dag_info) == 0: 

1046 run_reports = {} 

1047 elif len(schedd_dag_info) == 1: 

1048 _, dag_info = schedd_dag_info.popitem() 

1049 dag_id, dag_ad = dag_info.popitem() 

1050 

1051 # Create a mapping between jobs and their classads. The keys will 

1052 # be of format 'ClusterId.ProcId'. 

1053 job_info = {dag_id: dag_ad} 

1054 

1055 # Find jobs (nodes) belonging to that DAGMan job. 
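
# dag_id has the form 'ClusterId.ProcId', so int(float(...)) extracts just

# the ClusterId part.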

1056 job_constraint = f"DAGManJobId == {int(float(dag_id))}" 

1057 schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds) 

1058 if schedd_job_info: 

1059 _, node_info = schedd_job_info.popitem() 

1060 job_info.update(node_info) 

1061 

1062 # Collect additional pieces of information about jobs using HTCondor 

1063 # files in the submission directory. 

1064 _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"]) 

1065 _update_jobs(job_info, path_jobs) 

1066 if message: 

1067 messages.append(message) 

1068 run_reports = _create_detailed_report_from_jobs(dag_id, job_info) 

1069 else: 

1070 ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()] 

1071 message = ( 

1072 f"More than one job matches id '{wms_workflow_id}', " 

1073 f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids" 

1074 ) 

1075 messages.append(message) 

1076 run_reports = {} 

1077 

1078 message = "\n".join(messages) 

1079 return run_reports, message 

1080 

1081 

1082def _get_info_from_schedd(wms_workflow_id, hist, schedds): 

1083 """Gather run information from HTCondor. 

1084 

1085 Parameters 

1086 ---------- 

1087 wms_workflow_id : `str` 

1088 Limit to specific run based on id. 

1089 hist : `int` 

1090 Limit history search to this many days. 

1091 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional 

1092 HTCondor schedulers to query for job information. If None

1093 (default), all queries will be run against the local scheduler only. 

1094 

1095 Returns 

1096 ------- 

1097 schedd_dag_info : `dict` [`str`, `dict` [`str`, `dict` [`str` Any]]] 

1098 Information about jobs satisfying the search criteria where for each 

1099 Scheduler, local HTCondor job ids are mapped to their respective 

1100 classads. 

1101 """ 

1102 dag_constraint = 'regexp("dagman$", Cmd)' 

1103 try: 

1104 cluster_id = int(float(wms_workflow_id)) 

1105 except ValueError: 

1106 dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"' 

1107 else: 

1108 dag_constraint += f" && ClusterId == {cluster_id}" 

1109 

1110 # With the current implementation of the condor_* functions the query 

1111 # will always return only one match per Scheduler. 

1112 # 

1113 # Even in the highly unlikely situation where HTCondor history (which 

1114 # condor_search queries too) is long enough to have jobs from before 

1115 # the cluster ids were rolled over (and as a result there is more than

1116 # one job with the same cluster id) they will not show up in 

1117 # the results. 

1118 schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds) 

1119 return schedd_dag_info 

1120 

1121 

1122def _get_info_from_path(wms_path): 

1123 """Gather run information from a given run directory. 

1124 

1125 Parameters 

1126 ---------- 

1127 wms_path : `str` 

1128 Directory containing HTCondor files. 

1129 

1130 Returns 

1131 ------- 

1132 wms_workflow_id : `str` 

1133 The run id which is a DAGman job id. 

1134 jobs : `dict` [`str`, `dict` [`str`, `Any`]] 

1135 Information about jobs read from files in the given directory. 

1136 The key is the HTCondor id and the value is a dictionary of HTCondor 

1137 keys and values. 

1138 message : `str` 

1139 Message to be printed with the summary report. 

1140 """ 

1141 messages = [] 

1142 try: 

1143 wms_workflow_id, jobs = read_dag_log(wms_path) 

1144 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs) 

1145 _update_jobs(jobs, read_node_status(wms_path)) 

1146 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs) 

1147 

1148 # Add more info for DAGman job 

1149 job = jobs[wms_workflow_id] 

1150 job.update(read_dag_status(wms_path)) 

1151 

1152 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs) 

1153 if "bps_run" not in job: 

1154 _add_run_info(wms_path, job) 

1155 

1156 message = htc_check_dagman_output(wms_path) 

1157 if message: 

1158 messages.append(message) 

1159 _LOG.debug( 

1160 "_get_info: id = %s, total_jobs = %s", wms_workflow_id, jobs[wms_workflow_id]["total_jobs"] 

1161 ) 

1162 

1163 # Add extra pieces of information which cannot be found in HTCondor 

1164 # generated files like 'GlobalJobId'. 

1165 # 

1166 # Do not treat absence of this file as a serious error. Neither runs 

1167 # submitted with earlier versions of the plugin nor the runs submitted 

1168 # with Pegasus plugin will have it at the moment. However, once enough 

1169 # time passes and the Pegasus plugin has its own report() method

1170 # (instead of sneakily using HTCondor's one), the lack of that file 

1171 # should be treated as seriously as lack of any other file. 

1172 try: 

1173 job_info = read_dag_info(wms_path) 

1174 except FileNotFoundError as exc: 

1175 message = f"Warn: Some information may not be available: {exc}" 

1176 messages.append(message) 

1177 else: 

1178 schedd_name = next(iter(job_info)) 

1179 job_ad = next(iter(job_info[schedd_name].values())) 

1180 job.update(job_ad) 

1181 except FileNotFoundError: 

1182 message = f"Could not find HTCondor files in '{wms_path}'" 

1183 _LOG.warning(message) 

1184 messages.append(message) 

1185 wms_workflow_id = MISSING_ID 

1186 jobs = {} 

1187 

1188 message = "\n".join([msg for msg in messages if msg]) 

1189 return wms_workflow_id, jobs, message 

1190 

1191 

1192def _create_detailed_report_from_jobs(wms_workflow_id, jobs): 

1193 """Gather run information to be used in generating summary reports. 

1194 

1195 Parameters 

1196 ---------- 

1197 wms_workflow_id : `str` 

1198 The run id to create the report for. 

1199 jobs : `dict` [`str`, `dict` [`str`, Any]] 

1200 Mapping HTCondor job id to job information. 

1201 

1202 Returns 

1203 ------- 

1204 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1205 Run information for the detailed report. The key is the given HTCondor 

1206 id and the value is a collection of report information for that run. 

1207 """ 

1208 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id]) 

1209 dag_job = jobs.pop(wms_workflow_id) 

1210 report = WmsRunReport( 

1211 wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}", 

1212 global_wms_id=dag_job.get("GlobalJobId", "MISS"), 

1213 path=dag_job["Iwd"], 

1214 label=dag_job.get("bps_job_label", "MISS"), 

1215 run=dag_job.get("bps_run", "MISS"), 

1216 project=dag_job.get("bps_project", "MISS"), 

1217 campaign=dag_job.get("bps_campaign", "MISS"), 

1218 payload=dag_job.get("bps_payload", "MISS"), 

1219 operator=_get_owner(dag_job), 

1220 run_summary=_get_run_summary(dag_job), 

1221 state=_htc_status_to_wms_state(dag_job), 

1222 jobs=[], 

1223 total_number_jobs=dag_job["total_jobs"], 

1224 job_state_counts=dag_job["state_counts"], 

1225 exit_code_summary=_get_exit_code_summary(jobs), 

1226 ) 

1227 

1228 for job_id, job_info in jobs.items(): 

1229 try: 

1230 job_report = WmsJobReport( 

1231 wms_id=job_id, 

1232 name=job_info.get("DAGNodeName", job_id), 

1233 label=job_info.get("bps_job_label", pegasus_name_to_label(job_info["DAGNodeName"])), 

1234 state=_htc_status_to_wms_state(job_info), 

1235 ) 

1236 if job_report.label == "init": 

1237 job_report.label = "pipetaskInit" 

1238 report.jobs.append(job_report) 

1239 except KeyError as ex: 

1240 _LOG.error("Job missing key '%s': %s", str(ex), job_info) 

1241 raise 

1242 

1243 # Add the removed entry to restore the original content of the dictionary. 

1244 # The ordering of keys will be changed permanently though.

1245 jobs.update({wms_workflow_id: dag_job}) 

1246 

1247 run_reports = {report.wms_id: report} 

1248 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports) 

1249 return run_reports 

1250 

1251 

1252def _summary_report(user, hist, pass_thru, schedds=None): 

1253 """Gather run information to be used in generating summary reports. 

1254 

1255 Parameters 

1256 ---------- 

1257 user : `str` 

1258 Run lookup restricted to given user. 

1259 hist : `float` 

1260 How many previous days to search for run information. 

1261 pass_thru : `str` 

1262 Advanced users can define the HTCondor constraint to be used 

1263 when searching queue and history. 
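
schedds : `dict` [ `str`, `htcondor.Schedd` ], optional

HTCondor schedulers to query for job information. If None

(default), all queries will be run against the local scheduler only.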

1264 

1265 Returns 

1266 ------- 

1267 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1268 Run information for the summary report. The keys are HTCondor ids and 

1269 the values are collections of report information for each run. 

1270 message : `str` 

1271 Message to be printed with the summary report. 

1272 """ 

1273 # Only doing a summary report, so only look for DAGMan jobs.

1274 if pass_thru: 

1275 constraint = pass_thru 

1276 else: 

1277 # Notes: 

1278 # * bps_isjob == 'True' isn't getting set for DAG jobs that are 

1279 # manually restarted. 

1280 # * Any job with DAGManJobID isn't a DAG job 

1281 constraint = 'bps_isjob == "True" && JobUniverse == 7' 

1282 if user: 

1283 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")' 

1284 

1285 job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds) 

1286 

1287 # Have list of DAGMan jobs, need to get run_report info. 

1288 run_reports = {} 

1289 for jobs in job_info.values(): 

1290 for job_id, job in jobs.items(): 

1291 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1292 # If the counts could not be obtained from the queue information

1293 # (e.g., Kerberos bug), try reading them from a file.

1294 if total_jobs == 0: 

1295 try: 

1296 job.update(read_dag_status(job["Iwd"])) 

1297 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1298 except StopIteration: 

1299 pass # Don't kill the report if the HTCondor files cannot be found.

1300 

1301 if "bps_run" not in job: 

1302 _add_run_info(job["Iwd"], job) 

1303 report = WmsRunReport( 

1304 wms_id=job_id, 

1305 global_wms_id=job["GlobalJobId"], 

1306 path=job["Iwd"], 

1307 label=job.get("bps_job_label", "MISS"), 

1308 run=job.get("bps_run", "MISS"), 

1309 project=job.get("bps_project", "MISS"), 

1310 campaign=job.get("bps_campaign", "MISS"), 

1311 payload=job.get("bps_payload", "MISS"), 

1312 operator=_get_owner(job), 

1313 run_summary=_get_run_summary(job), 

1314 state=_htc_status_to_wms_state(job), 

1315 jobs=[], 

1316 total_number_jobs=total_jobs, 

1317 job_state_counts=state_counts, 

1318 ) 

1319 run_reports[report.global_wms_id] = report 

1320 

1321 return run_reports, "" 

1322 

1323 

1324def _add_run_info(wms_path, job): 

1325 """Find BPS run information elsewhere for runs without bps attributes. 

1326 

1327 Parameters 

1328 ---------- 

1329 wms_path : `str` 

1330 Path to submit files for the run. 

1331 job : `dict` [`str`, `Any`] 

1332 HTCondor dag job information. 

1333 

1334 Raises 

1335 ------ 

1336 StopIteration 

1337 If the file it is looking for cannot be found. Permission errors are

1338 caught and the job's run is marked with an error.

1339 """ 

1340 path = Path(wms_path) / "jobs" 

1341 try: 

1342 subfile = next(path.glob("**/*.sub")) 

1343 except (StopIteration, PermissionError): 

1344 job["bps_run"] = "Unavailable" 

1345 else: 

1346 _LOG.debug("_add_run_info: subfile = %s", subfile) 

1347 try: 

1348 with open(subfile, encoding="utf-8") as fh: 

1349 for line in fh: 

1350 if line.startswith("+bps_"): 

1351 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line) 

1352 if m: 

1353 _LOG.debug("Matching line: %s", line) 

1354 job[m.group(1)] = m.group(2).replace('"', "") 

1355 else: 

1356 _LOG.debug("Could not parse attribute: %s", line) 

1357 except PermissionError: 

1358 job["bps_run"] = "PermissionError" 

1359 _LOG.debug("After adding job = %s", job) 

1360 

1361 

1362def _get_owner(job): 

1363 """Get the owner of a dag job. 

1364 

1365 Parameters 

1366 ---------- 

1367 job : `dict` [`str`, `Any`] 

1368 HTCondor dag job information. 

1369 

1370 Returns 

1371 ------- 

1372 owner : `str` 

1373 Owner of the dag job. 

1374 """ 

1375 owner = job.get("bps_operator", None) 

1376 if not owner: 

1377 owner = job.get("Owner", None) 

1378 if not owner: 

1379 _LOG.warning("Could not get Owner from htcondor job: %s", job) 

1380 owner = "MISS" 

1381 return owner 

1382 

1383 

1384def _get_run_summary(job): 

1385 """Get the run summary for a job. 

1386 

1387 Parameters 

1388 ---------- 

1389 job : `dict` [`str`, `Any`] 

1390 HTCondor dag job information. 

1391 

1392 Returns 

1393 ------- 

1394 summary : `str` 

1395 Number of jobs per PipelineTask label in approximate pipeline order. 

1396 Format: <label>:<count>[;<label>:<count>]+ 

1397 """ 

1398 summary = job.get("bps_job_summary", job.get("bps_run_summary", None)) 

1399 if not summary: 

1400 summary, _ = summary_from_dag(job["Iwd"]) 

1401 if not summary: 

1402 _LOG.warning("Could not get run summary for htcondor job: %s", job) 

1403 _LOG.debug("_get_run_summary: summary=%s", summary) 

1404 

1405 # Work around the summary sometimes using init vs pipetaskInit.

1406 summary = summary.replace("init:", "pipetaskInit:") 

1407 

1408 if "pegasus_version" in job and "pegasus" not in summary: 

1409 summary += ";pegasus:0" 

1410 

1411 return summary 

1412 

1413 

1414def _get_exit_code_summary(jobs): 

1415 """Get the exit code summary for a run. 

1416 

1417 Parameters 

1418 ---------- 

1419 jobs : `dict` [`str`, `dict` [`str`, Any]] 

1420 Mapping HTCondor job id to job information. 

1421 

1422 Returns 

1423 ------- 

1424 summary : `dict` [`str`, `list` [`int`]] 

1425 Jobs' exit codes per job label. 

1426 """ 

1427 summary = {} 

1428 for job_id, job_ad in jobs.items(): 

1429 job_label = job_ad["bps_job_label"] 

1430 summary.setdefault(job_label, []) 

1431 try: 

1432 exit_code = 0 

1433 job_status = job_ad["JobStatus"] 

1434 match job_status: 

1435 case JobStatus.COMPLETED: 

1436 exit_code = job_ad["ExitSignal"] if job_ad["ExitBySignal"] else job_ad["ExitCode"] 

1437 case JobStatus.HELD: 

1438 exit_code = job_ad["ExitSignal"] if job_ad["ExitBySignal"] else job_ad["HoldReasonCode"] 

1439 case ( 

1440 JobStatus.IDLE 

1441 | JobStatus.RUNNING 

1442 | JobStatus.REMOVED 

1443 | JobStatus.TRANSFERRING_OUTPUT 

1444 | JobStatus.SUSPENDED 

1445 ): 

1446 pass 

1447 case _: 

1448 _LOG.debug("Unknown 'JobStatus' value ('%d') in classad for job '%d'", job_status, job_id) 

1449 if exit_code != 0: 

1450 summary[job_label].append(exit_code) 

1451 except KeyError as ex: 

1452 _LOG.debug("Attribute '%s' not found in the classad for job '%s'", ex, job_id) 

1453 return summary 

1454 

1455 

1456def _get_state_counts_from_jobs(wms_workflow_id, jobs): 

1457 """Count number of jobs per WMS state. 

1458 

1459 Parameters 

1460 ---------- 

1461 wms_workflow_id : `str` 

1462 HTCondor job id. 

1463 jobs : `dict` [`str`, `Any`] 

1464 HTCondor dag job information. 

1465 

1466 Returns 

1467 ------- 

1468 total_count : `int` 

1469 Total number of dag nodes. 

1470 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1471 Keys are the different WMS states and values are counts of jobs 

1472 that are in that WMS state. 

1473 """ 

1474 state_counts = dict.fromkeys(WmsStates, 0) 

1475 

1476 for jid, jinfo in jobs.items(): 

1477 if jid != wms_workflow_id: 

1478 state_counts[_htc_status_to_wms_state(jinfo)] += 1 

1479 

1480 total_counted = sum(state_counts.values()) 

1481 if "NodesTotal" in jobs[wms_workflow_id]: 

1482 total_count = jobs[wms_workflow_id]["NodesTotal"] 

1483 else: 

1484 total_count = total_counted 

1485 

1486 state_counts[WmsStates.UNREADY] += total_count - total_counted 

1487 

1488 return total_count, state_counts 

1489 

1490 

1491def _get_state_counts_from_dag_job(job): 

1492 """Count number of jobs per WMS state. 

1493 

1494 Parameters 

1495 ---------- 

1496 job : `dict` [`str`, `Any`] 

1497 HTCondor dag job information. 

1498 

1499 Returns 

1500 ------- 

1501 total_count : `int` 

1502 Total number of dag nodes. 

1503 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1504 Keys are the different WMS states and values are counts of jobs 

1505 that are in that WMS state. 

1506 """ 

1507 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job)) 

1508 state_counts = dict.fromkeys(WmsStates, 0) 

1509 if "DAG_NodesReady" in job: 

1510 state_counts = { 

1511 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0), 

1512 WmsStates.READY: job.get("DAG_NodesReady", 0), 

1513 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1514 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0), 

1515 WmsStates.FAILED: job.get("DAG_NodesFailed", 0), 

1516 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0), 

1517 } 

1518 total_jobs = job.get("DAG_NodesTotal") 

1519 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs) 

1520 elif "NodesFailed" in job: 

1521 state_counts = { 

1522 WmsStates.UNREADY: job.get("NodesUnready", 0), 

1523 WmsStates.READY: job.get("NodesReady", 0), 

1524 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1525 WmsStates.SUCCEEDED: job.get("NodesDone", 0), 

1526 WmsStates.FAILED: job.get("NodesFailed", 0), 

1527 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0), 

1528 } 

1529 try: 

1530 total_jobs = job["NodesTotal"]

1531 except KeyError as ex: 

1532 _LOG.error("Job missing %s. job = %s", str(ex), job) 

1533 raise 

1534 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs) 

1535 else: 

1536 # With Kerberos job auth and the Kerberos bug, a warning would be

1537 # printed for every DAG, so log at debug level instead.

1538 _LOG.debug("Can't get job state counts %s", job["Iwd"]) 

1539 total_jobs = 0 

1540 

1541 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts) 

1542 return total_jobs, state_counts 

1543 

1544 
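
# Editor's illustrative sketch (not part of the original module): counting node
# states directly from a made-up DAGMan job record that carries the DAG_Nodes*
# attributes.
def _example_get_state_counts_from_dag_job():
    dag_job = {
        "DAG_NodesTotal": 10,
        "DAG_NodesDone": 5,
        "DAG_NodesFailed": 1,
        "DAG_NodesReady": 1,
        "DAG_NodesUnready": 3,
        "JobProcsHeld": 0,
    }
    total, counts = _get_state_counts_from_dag_job(dag_job)
    # total == 10; counts maps SUCCEEDED to 5, FAILED to 1, READY to 1,
    # UNREADY to 3, and HELD and MISFIT to 0.
    return total, counts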

1545def _htc_status_to_wms_state(job): 

1546 """Convert HTCondor job status to generic wms state. 

1547 

1548 Parameters 

1549 ---------- 

1550 job : `dict` [`str`, `Any`] 

1551 HTCondor job information. 

1552 

1553 Returns 

1554 ------- 

1555 wms_state : `WmsStates` 

1556 The equivalent WmsState to given job's status. 

1557 """ 

1558 wms_state = WmsStates.MISFIT 

1559 if "JobStatus" in job: 

1560 wms_state = _htc_job_status_to_wms_state(job) 

1561 elif "NodeStatus" in job: 

1562 wms_state = _htc_node_status_to_wms_state(job) 

1563 return wms_state 

1564 

1565 

1566def _htc_job_status_to_wms_state(job): 

1567 """Convert HTCondor job status to generic wms state. 

1568 

1569 Parameters 

1570 ---------- 

1571 job : `dict` [`str`, `Any`] 

1572 HTCondor job information. 

1573 

1574 Returns 

1575 ------- 

1576 wms_state : `lsst.ctrl.bps.WmsStates` 

1577 The equivalent WmsState to given job's status. 

1578 """ 

1579 _LOG.debug( 

1580 "htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], type(job["JobStatus"]) 

1581 ) 

1582 job_status = int(job["JobStatus"]) 

1583 wms_state = WmsStates.MISFIT 

1584 

1585 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status) 

1586 if job_status == JobStatus.IDLE: 

1587 wms_state = WmsStates.PENDING 

1588 elif job_status == JobStatus.RUNNING: 

1589 wms_state = WmsStates.RUNNING 

1590 elif job_status == JobStatus.REMOVED: 

1591 wms_state = WmsStates.DELETED 

1592 elif job_status == JobStatus.COMPLETED: 

1593 if ( 

1594 job.get("ExitBySignal", False) 

1595 or job.get("ExitCode", 0) 

1596 or job.get("ExitSignal", 0) 

1597 or job.get("DAG_Status", 0) 

1598 or job.get("ReturnValue", 0) 

1599 ): 

1600 wms_state = WmsStates.FAILED 

1601 else: 

1602 wms_state = WmsStates.SUCCEEDED 

1603 elif job_status == JobStatus.HELD: 

1604 wms_state = WmsStates.HELD 

1605 

1606 return wms_state 

1607 

1608 
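
# Editor's illustrative sketch (not part of the original module): a few made-up
# job classads and the generic WMS states they map onto.
def _example_htc_job_status_to_wms_state():
    held = {"ClusterId": 1, "JobStatus": JobStatus.HELD}
    failed = {"ClusterId": 2, "JobStatus": JobStatus.COMPLETED, "ExitCode": 1}
    done = {"ClusterId": 3, "JobStatus": JobStatus.COMPLETED, "ExitCode": 0}
    assert _htc_job_status_to_wms_state(held) == WmsStates.HELD
    assert _htc_job_status_to_wms_state(failed) == WmsStates.FAILED
    assert _htc_job_status_to_wms_state(done) == WmsStates.SUCCEEDED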

1609def _htc_node_status_to_wms_state(job): 

1610 """Convert HTCondor status to generic wms state. 

1611 

1612 Parameters 

1613 ---------- 

1614 job : `dict` [`str`, `Any`] 

1615 HTCondor job information. 

1616 

1617 Returns 

1618 ------- 

1619 wms_state : `lsst.ctrl.bps.WmsStates` 

1620 The equivalent WmsState to given node's status. 

1621 """ 

1622 wms_state = WmsStates.MISFIT 

1623 

1624 status = job["NodeStatus"] 

1625 if status == NodeStatus.NOT_READY: 

1626 wms_state = WmsStates.UNREADY 

1627 elif status == NodeStatus.READY: 

1628 wms_state = WmsStates.READY 

1629 elif status == NodeStatus.PRERUN: 

1630 wms_state = WmsStates.MISFIT 

1631 elif status == NodeStatus.SUBMITTED: 

1632 if job["JobProcsHeld"]: 

1633 wms_state = WmsStates.HELD 

1634 elif job["StatusDetails"] == "not_idle": 

1635 wms_state = WmsStates.RUNNING 

1636 elif job["JobProcsQueued"]: 

1637 wms_state = WmsStates.PENDING 

1638 elif status == NodeStatus.POSTRUN: 

1639 wms_state = WmsStates.MISFIT 

1640 elif status == NodeStatus.DONE: 

1641 wms_state = WmsStates.SUCCEEDED 

1642 elif status == NodeStatus.ERROR: 

1643        # Use the job's exit status instead of the post script's exit status. 

1644 if "DAGMAN error 0" in job["StatusDetails"]: 

1645 wms_state = WmsStates.SUCCEEDED 

1646 else: 

1647 wms_state = WmsStates.FAILED 

1648 

1649 return wms_state 

1650 

1651 
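
# Editor's illustrative sketch (not part of the original module): made-up node
# status records (as read from a DAGMan node status file) and the WMS states
# they map onto.
def _example_htc_node_status_to_wms_state():
    running = {
        "NodeStatus": NodeStatus.SUBMITTED,
        "JobProcsHeld": 0,
        "JobProcsQueued": 1,
        "StatusDetails": "not_idle",
    }
    failed = {"NodeStatus": NodeStatus.ERROR, "StatusDetails": "Job proc failed"}
    assert _htc_node_status_to_wms_state(running) == WmsStates.RUNNING
    assert _htc_node_status_to_wms_state(failed) == WmsStates.FAILED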

1652def _update_jobs(jobs1, jobs2): 

1653 """Update jobs1 with info in jobs2. 

1654 

1655 (Basically an update for nested dictionaries.) 

1656 

1657 Parameters 

1658 ---------- 

1659 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]] 

1660 HTCondor job information to be updated. 

1661 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]] 

1662 Additional HTCondor job information. 

1663 """ 

1664 for jid, jinfo in jobs2.items(): 

1665 if jid in jobs1: 

1666 jobs1[jid].update(jinfo) 

1667 else: 

1668 jobs1[jid] = jinfo 

1669 

1670 
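
# Editor's illustrative sketch (not part of the original module): merging two
# made-up nested job dictionaries in place.
def _example_update_jobs():
    jobs1 = {"1.0": {"JobStatus": 2}}
    jobs2 = {"1.0": {"ExitCode": 0}, "2.0": {"JobStatus": 5}}
    _update_jobs(jobs1, jobs2)
    # jobs1 == {"1.0": {"JobStatus": 2, "ExitCode": 0}, "2.0": {"JobStatus": 5}}
    return jobs1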

1671def _wms_id_type(wms_id): 

1672 """Determine the type of the WMS id. 

1673 

1674 Parameters 

1675 ---------- 

1676 wms_id : `str` 

1677 WMS id identifying a job. 

1678 

1679 Returns 

1680 ------- 

1681 id_type : `lsst.ctrl.bps.htcondor.WmsIdType` 

1682 Type of WMS id. 

1683 """ 

1684 try: 

1685 int(float(wms_id)) 

1686 except ValueError: 

1687 wms_path = Path(wms_id) 

1688 if wms_path.is_dir(): 

1689 id_type = WmsIdType.PATH 

1690 else: 

1691 id_type = WmsIdType.GLOBAL 

1692 except TypeError: 

1693 id_type = WmsIdType.UNKNOWN 

1694 else: 

1695 id_type = WmsIdType.LOCAL 

1696 return id_type 

1697 

1698 
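
# Editor's illustrative sketch (not part of the original module): typical id
# classifications. The global job id below is made up, and PATH is returned
# only for ids naming a directory that actually exists.
def _example_wms_id_type():
    assert _wms_id_type("1234.0") == WmsIdType.LOCAL
    assert _wms_id_type("sdfrome001#1234.0#1696000000") == WmsIdType.GLOBAL
    assert _wms_id_type(None) == WmsIdType.UNKNOWN
    # An id naming an existing submit directory would yield WmsIdType.PATH.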

1699def _wms_id_to_cluster(wms_id): 

1700 """Convert WMS id to cluster id. 

1701 

1702 Parameters 

1703 ---------- 

1704 wms_id : `int` or `float` or `str` 

1705 HTCondor job id or path. 

1706 

1707 Returns 

1708 ------- 

1709 schedd_ad : `classad.ClassAd` 

1710 ClassAd describing the scheduler managing the job with the given id. 

1711 cluster_id : `int` 

1712 HTCondor cluster id. 

1713    id_type : `lsst.ctrl.bps.htcondor.WmsIdType` 

1714 The type of the provided id. 

1715 """ 

1716 coll = htcondor.Collector() 

1717 

1718 schedd_ad = None 

1719 cluster_id = None 

1720 id_type = _wms_id_type(wms_id) 

1721 if id_type == WmsIdType.LOCAL: 

1722 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1723 cluster_id = int(float(wms_id)) 

1724 elif id_type == WmsIdType.GLOBAL: 

1725 constraint = f'GlobalJobId == "{wms_id}"' 

1726 schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)} 

1727 schedds = {name: htcondor.Schedd(ad) for name, ad in schedd_ads.items()} 

1728 job_info = condor_q(constraint=constraint, schedds=schedds) 

1729 if job_info: 

1730 schedd_name, job_rec = job_info.popitem() 

1731 job_id, _ = job_rec.popitem() 

1732 schedd_ad = schedd_ads[schedd_name] 

1733 cluster_id = int(float(job_id)) 

1734 elif id_type == WmsIdType.PATH: 

1735 try: 

1736 job_info = read_dag_info(wms_id) 

1737 except (FileNotFoundError, PermissionError, OSError): 

1738 pass 

1739 else: 

1740 schedd_name, job_rec = job_info.popitem() 

1741 job_id, _ = job_rec.popitem() 

1742 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name) 

1743 cluster_id = int(float(job_id)) 

1744 else: 

1745 pass 

1746 return schedd_ad, cluster_id, id_type 

1747 

1748 

1749def _wms_id_to_dir(wms_id): 

1750 """Convert WMS id to a submit directory candidate. 

1751 

1752 The function does not check if the directory exists or if it is a valid 

1753 BPS submit directory. 

1754 

1755 Parameters 

1756 ---------- 

1757 wms_id : `int` or `float` or `str` 

1758 HTCondor job id or path. 

1759 

1760 Returns 

1761 ------- 

1762 wms_path : `pathlib.Path` or None 

1763 Submit directory candidate for the run with the given job id. If no 

1764 directory can be associated with the provided WMS id, it will be set 

1765 to None. 

1766    id_type : `lsst.ctrl.bps.htcondor.WmsIdType` 

1767 The type of the provided id. 

1768 

1769 Raises 

1770 ------ 

1771 TypeError 

1772        Raised if the provided WMS id has an invalid type. 

1773 """ 

1774 coll = htcondor.Collector() 

1775 schedd_ads = [] 

1776 

1777 constraint = None 

1778 wms_path = None 

1779 id_type = _wms_id_type(wms_id) 

1780 match id_type: 

1781 case WmsIdType.LOCAL: 

1782 constraint = f"ClusterId == {int(float(wms_id))}" 

1783 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

1784 case WmsIdType.GLOBAL: 

1785 constraint = f'GlobalJobId == "{wms_id}"' 

1786 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

1787 case WmsIdType.PATH: 

1788 wms_path = Path(wms_id) 

1789 case WmsIdType.UNKNOWN: 

1790 raise TypeError(f"Invalid job id type: {wms_id}") 

1791 if constraint is not None: 

1792 schedds = {ad["name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

1793 job_info = condor_history(constraint=constraint, schedds=schedds, projection=["Iwd"]) 

1794 if job_info: 

1795 _, job_rec = job_info.popitem() 

1796 _, job_ad = job_rec.popitem() 

1797 wms_path = Path(job_ad["Iwd"]) 

1798 return wms_path, id_type 

1799 

1800 

1801def _create_periodic_release_expr(memory, multiplier, limit): 

1802    """Construct an HTCondor ClassAd expression for releasing held jobs. 

1803 

1804    The expression instructs HTCondor to release any job that was put on hold 

1805    for exceeding its memory requirements back to the job queue, provided it 

1806    satisfies all of the conditions below: 

1807 

1808    * the number of run attempts has not reached the allowable number of retries, 

1809    * the memory requirements in the last failed run attempt did not reach 

1810      the specified memory limit. 

1811 

1812 Parameters 

1813 ---------- 

1814 memory : `int` 

1815 Requested memory in MB. 

1816 multiplier : `float` 

1817 Memory growth rate between retires. 

1818 limit : `int` 

1819 Memory limit. 

1820 

1821 Returns 

1822 ------- 

1823 expr : `str` 

1824 A string representing an HTCondor ClassAd expression for releasing jobs 

1825 which have been held due to exceeding the memory requirements. 

1826 """ 

1827 is_retry_allowed = "NumJobStarts <= JobMaxRetries" 

1828 was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}" 

1829 

1830    # The job ClassAd attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1831 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1832 # The special comparison operators ensure that all comparisons below will 

1833 # evaluate to FALSE in this case. 

1834 # 

1835 # Note: 

1836 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1837 # the entire expression should evaluate to FALSE when the job is not HELD. 

1838 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1839 # but better safe than sorry. 

1840 was_mem_exceeded = ( 

1841 "JobStatus == 5 " 

1842 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " 

1843 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1844 ) 

1845 

1846 expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}" 

1847 return expr 

1848 

1849 
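
# Editor's illustrative sketch (not part of the original module): with made-up
# values (2048 MB requested, growth factor 2.0, 8192 MB cap) the helper yields
# the following single-line expression (wrapped here for readability):
#
#   JobStatus == 5
#   && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#       || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#   && NumJobStarts <= JobMaxRetries
#   && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) < 8192
def _example_periodic_release_expr():
    return _create_periodic_release_expr(memory=2048, multiplier=2.0, limit=8192)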

1850def _create_periodic_remove_expr(memory, multiplier, limit): 

1851    """Construct an HTCondor ClassAd expression for removing jobs from the queue. 

1852 

1853    The expression instructs HTCondor to remove any job that was put on hold 

1854    for exceeding its memory requirements from the job queue, provided it 

1855    satisfies any of the conditions below: 

1856 

1857    * the allowable number of retries has been reached, 

1858    * the memory requirements during the last failed run attempt reached 

1859      the specified memory limit. 

1860 

1861 Parameters 

1862 ---------- 

1863 memory : `int` 

1864 Requested memory in MB. 

1865 multiplier : `float` 

1866        Memory growth rate between retries. 

1867 limit : `int` 

1868 Memory limit. 

1869 

1870 Returns 

1871 ------- 

1872 expr : `str` 

1873 A string representing an HTCondor ClassAd expression for removing jobs 

1874 which were run at the maximal allowable memory and still exceeded 

1875 the memory requirements. 

1876 """ 

1877 is_retry_disallowed = "NumJobStarts > JobMaxRetries" 

1878 was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}" 

1879 

1880    # The job ClassAd attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1881 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1882 # The special comparison operators ensure that all comparisons below will 

1883 # evaluate to FALSE in this case. 

1884 # 

1885 # Note: 

1886 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1887 # the entire expression should evaluate to FALSE when the job is not HELD. 

1888 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1889 # but better safe than sorry. 

1890 was_mem_exceeded = ( 

1891 "JobStatus == 5 " 

1892 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " 

1893 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1894 ) 

1895 

1896 expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})" 

1897 return expr 

1898 

1899 
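
# Editor's illustrative sketch (not part of the original module): for the same
# made-up values as above, the remove expression keeps the hold-for-memory test
# but negates the retry/limit conditions, ending in
#
#   ... && (NumJobStarts > JobMaxRetries
#           || min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) == 8192)
def _example_periodic_remove_expr():
    return _create_periodic_remove_expr(memory=2048, multiplier=2.0, limit=8192)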

1900def _create_request_memory_expr(memory, multiplier, limit): 

1901 """Construct an HTCondor ClassAd expression for safe memory scaling. 

1902 

1903 Parameters 

1904 ---------- 

1905 memory : `int` 

1906 Requested memory in MB. 

1907 multiplier : `float` 

1908        Memory growth rate between retries. 

1909 limit : `int` 

1910 Memory limit. 

1911 

1912 Returns 

1913 ------- 

1914 expr : `str` 

1915 A string representing an HTCondor ClassAd expression enabling safe 

1916 memory scaling between job retries. 

1917 """ 

1918    # The check whether the job was held due to exceeding its memory 

1919    # requirements is made *after* the job has been released back to the job 

1920    # queue (i.e. it is IDLE again), hence the need to use the 'Last*' job 

1921    # ClassAd attributes instead of the ones describing the job's current state. 

1922 # 

1923    # Also, the 'Last*' job ClassAd attributes are UNDEFINED when a job is 

1924 # initially put in the job queue. The special comparison operators ensure 

1925 # that all comparisons below will evaluate to FALSE in this case. 

1926 was_mem_exceeded = ( 

1927 "LastJobStatus =?= 5 " 

1928 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " 

1929 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)" 

1930 ) 

1931 

1932    # If the job is running for the first time or was held for reasons other 

1933    # than exceeding the memory, set the required memory to the requested 

1934    # value or to the memory usage measured by HTCondor (MemoryUsage), 

1935    # whichever is greater. 

1936 expr = ( 

1937 f"({was_mem_exceeded}) " 

1938 f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) " 

1939 f": max({{{memory}, MemoryUsage ?: 0}})" 

1940 ) 

1941 return expr 

1942 

1943 
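
# Editor's illustrative sketch (not part of the original module): with the same
# made-up values the request_memory expression scales memory by the multiplier
# after each memory-related hold, capped at the limit, and otherwise keeps the
# larger of the request and the observed usage:
#
#   (LastJobStatus =?= 5 && (...))
#   ? min({int(2048 * pow(2.0, NumJobStarts)), 8192})
#   : max({2048, MemoryUsage ?: 0})
def _example_request_memory_expr():
    return _create_request_memory_expr(memory=2048, multiplier=2.0, limit=8192)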

1944def _locate_schedds(locate_all=False): 

1945    """Locate Scheduler daemons in an HTCondor pool. 

1946 

1947 Parameters 

1948 ---------- 

1949 locate_all : `bool`, optional 

1950        If True, all available Schedulers in the HTCondor pool will be located. 

1951        False by default, which limits the search to the Scheduler running on 

1952        the local host. 

1953 

1954 Returns 

1955 ------- 

1956 schedds : `dict` [`str`, `htcondor.Schedd`] 

1957 A mapping between Scheduler names and Python objects allowing for 

1958 interacting with them. 

1959 """ 

1960 coll = htcondor.Collector() 

1961 

1962 schedd_ads = [] 

1963 if locate_all: 

1964 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

1965 else: 

1966 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

1967 return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

1968 

1969 
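
# Editor's illustrative sketch (not part of the original module): querying every
# Scheduler in the pool for DAGMan jobs (scheduler universe, JobUniverse == 7).
# It only returns anything useful when an HTCondor pool is actually reachable,
# and the constraint is just an example.
def _example_query_all_schedds():
    schedds = _locate_schedds(locate_all=True)
    return condor_q(constraint="JobUniverse == 7", schedds=schedds)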

1970def _gather_site_values(config, compute_site): 

1971 """Gather values specific to given site. 

1972 

1973 Parameters 

1974 ---------- 

1975 config : `lsst.ctrl.bps.BpsConfig` 

1976 BPS configuration that includes necessary submit/runtime 

1977 information. 

1978 compute_site : `str` 

1979 Compute site name. 

1980 

1981 Returns 

1982 ------- 

1983 site_values : `dict` [`str`, `Any`] 

1984 Values specific to the given site. 

1985 """ 

1986 site_values = {"attrs": {}, "profile": {}} 

1987 search_opts = {} 

1988 if compute_site: 

1989 search_opts["curvals"] = {"curr_site": compute_site} 

1990 

1991 # Determine the hard limit for the memory requirement. 

1992 found, limit = config.search("memoryLimit", opt=search_opts) 

1993 if not found: 

1994 search_opts["default"] = DEFAULT_HTC_EXEC_PATT 

1995 _, patt = config.search("executeMachinesPattern", opt=search_opts) 

1996 del search_opts["default"] 

1997 

1998 # To reduce the amount of data, ignore dynamic slots (if any) as, 

1999 # by definition, they cannot have more memory than 

2000        # the partitionable slot they are part of. 

2001 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)' 

2002 pool_info = condor_status(constraint=constraint) 

2003 try: 

2004 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values()) 

2005 except ValueError: 

2006 _LOG.debug("No execute machine in the pool matches %s", patt) 

2007 if limit: 

2008 config[".bps_defined.memory_limit"] = limit 

2009 

2010 _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False}) 

2011 site_values["memoryLimit"] = limit 

2012 

2013 found, value = config.search("accountingGroup", opt=search_opts) 

2014 if found: 

2015 site_values["accountingGroup"] = value 

2016 found, value = config.search("accountingUser", opt=search_opts) 

2017 if found: 

2018 site_values["accountingUser"] = value 

2019 

2020 key = f".site.{compute_site}.profile.condor" 

2021 if key in config: 

2022 for key, val in config[key].items(): 

2023 if key.startswith("+"): 

2024 site_values["attrs"][key[1:]] = val 

2025 else: 

2026 site_values["profile"][key] = val 

2027 

2028 return site_values
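
# Editor's illustrative sketch (not part of the original module): how a made-up
# `.site.<name>.profile.condor` section would be split by the loop above --
# keys prefixed with '+' become job attributes, the rest become submit-file
# commands.
def _example_split_condor_profile(profile_section):
    values = {"attrs": {}, "profile": {}}
    for key, val in profile_section.items():
        if key.startswith("+"):
            values["attrs"][key[1:]] = val
        else:
            values["profile"][key] = val
    return values
# _example_split_condor_profile({"+JobType": "DAG", "requirements": "HasGPU"})
# would return {"attrs": {"JobType": "DAG"}, "profile": {"requirements": "HasGPU"}}.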