Coverage for python/lsst/ctrl/bps/htcondor/htcondor_service.py: 7%

741 statements  

coverage.py v7.4.4, created at 2024-04-10 03:42 -0700

1# This file is part of ctrl_bps_htcondor. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <https://www.gnu.org/licenses/>. 

27 

28"""Interface between generic workflow to HTCondor workflow system. 

29""" 

30 

31__all__ = ["HTCondorService", "HTCondorWorkflow"] 

32 

33 

34import logging 

35import os 

36import re 

37from collections import defaultdict 

38from enum import IntEnum, auto 

39from pathlib import Path 

40 

41import htcondor 

42from lsst.ctrl.bps import ( 

43 BaseWmsService, 

44 BaseWmsWorkflow, 

45 GenericWorkflow, 

46 GenericWorkflowJob, 

47 WmsJobReport, 

48 WmsRunReport, 

49 WmsStates, 

50) 

51from lsst.ctrl.bps.bps_utils import chdir, create_count_summary 

52from lsst.utils.timer import time_this 

53from packaging import version 

54 

55from .lssthtc import ( 

56 MISSING_ID, 

57 HTCDag, 

58 HTCJob, 

59 JobStatus, 

60 NodeStatus, 

61 condor_history, 

62 condor_q, 

63 condor_search, 

64 condor_status, 

65 htc_backup_files, 

66 htc_check_dagman_output, 

67 htc_create_submit_from_cmd, 

68 htc_create_submit_from_dag, 

69 htc_create_submit_from_file, 

70 htc_escape, 

71 htc_submit_dag, 

72 htc_version, 

73 pegasus_name_to_label, 

74 read_dag_info, 

75 read_dag_log, 

76 read_dag_status, 

77 read_node_status, 

78 summary_from_dag, 

79 write_dag_info, 

80) 

81 

82 

83class WmsIdType(IntEnum): 

84 """Type of valid WMS ids.""" 

85 

86 UNKNOWN = auto() 

87 """The type of id cannot be determined. 

88 """ 

89 

90 LOCAL = auto() 

91 """The id is HTCondor job's ClusterId (with optional '.ProcId'). 

92 """ 

93 

94 GLOBAL = auto() 

95 """Id is a HTCondor's global job id. 

96 """ 

97 

98 PATH = auto() 

99 """Id is a submission path. 

100 """ 

101 

102 

103DEFAULT_HTC_EXEC_PATT = ".*worker.*" 

104"""Default pattern for searching execute machines in an HTCondor pool. 

105""" 

106 

107_LOG = logging.getLogger(__name__) 

108 

109 

110class HTCondorService(BaseWmsService): 

111 """HTCondor version of WMS service.""" 

112 

113 def prepare(self, config, generic_workflow, out_prefix=None): 

114 """Convert generic workflow to an HTCondor DAG ready for submission. 

115 

116 Parameters 

117 ---------- 

118 config : `lsst.ctrl.bps.BpsConfig` 

119 BPS configuration that includes necessary submit/runtime 

120 information. 

121 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

122 The generic workflow (e.g., has executable name and arguments). 

123 out_prefix : `str` 

124 The root directory into which all WMS-specific files are written. 

125 

126 Returns 

127 ------- 

128 workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow` 

129 HTCondor workflow ready to be run. 

130 """ 

131 _LOG.debug("out_prefix = '%s'", out_prefix) 

132 with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed HTCondor workflow creation"): 

133 workflow = HTCondorWorkflow.from_generic_workflow( 

134 config, 

135 generic_workflow, 

136 out_prefix, 

137 f"{self.__class__.__module__}.{self.__class__.__name__}", 

138 ) 

139 

140 with time_this( 

141 log=_LOG, level=logging.INFO, prefix=None, msg="Completed writing out HTCondor workflow" 

142 ): 

143 workflow.write(out_prefix) 

144 return workflow 

145 

146 def submit(self, workflow, **kwargs): 

147 """Submit a single HTCondor workflow. 

148 

149 Parameters 

150 ---------- 

151 workflow : `lsst.ctrl.bps.BaseWorkflow` 

152 A single HTCondor workflow to submit. run_id is updated after 

153 successful submission to WMS. 

154 **kwargs : `~typing.Any` 

155 """ 

156 dag = workflow.dag 

157 

158 ver = version.parse(htc_version()) 

159 if ver >= version.parse("8.9.3"): 

160 sub = htc_create_submit_from_dag(dag.graph["dag_filename"], {}) 

161 else: 

162 sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], {}) 

163 

164 # For workflow portability, internal paths are all relative. Hence 

165 # the DAG needs to be submitted to HTCondor from inside the submit 

166 # directory. 

167 with chdir(workflow.submit_path): 

168 _LOG.info("Submitting from directory: %s", os.getcwd()) 

169 schedd_dag_info = htc_submit_dag(sub) 

170 if schedd_dag_info: 

171 write_dag_info(f"{dag.name}.info.json", schedd_dag_info) 

172 

173 _, dag_info = schedd_dag_info.popitem() 

174 _, dag_ad = dag_info.popitem() 

175 

176 dag.run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}" 

177 workflow.run_id = dag.run_id 

178 else: 

179 raise RuntimeError("Submission failed: unable to retrieve DAGMan job information") 

180 
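# Illustrative sketch, not part of the original module: the version check in
# submit() above just compares dotted HTCondor version strings with
# packaging.version, e.g.:
#
#     from packaging import version
#     version.parse("10.0.3") >= version.parse("8.9.3")   # True  -> htc_create_submit_from_dag
#     version.parse("8.8.10") >= version.parse("8.9.3")   # False -> htc_create_submit_from_cmd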

181 def restart(self, wms_workflow_id): 

182 """Restart a failed DAGMan workflow. 

183 

184 Parameters 

185 ---------- 

186 wms_workflow_id : `str` 

187 The directory with HTCondor files. 

188 

189 Returns 

190 ------- 

191 run_id : `str` 

192 HTCondor id of the restarted DAGMan job. If restart failed, it will 

193 be set to None. 

194 run_name : `str` 

195 Name of the restarted workflow. If restart failed, it will be set 

196 to None. 

197 message : `str` 

198 A message describing any issues encountered during the restart. 

199 If there were no issues, an empty string is returned. 

200 """ 

201 wms_path, id_type = _wms_id_to_dir(wms_workflow_id) 

202 if wms_path is None: 

203 return ( 

204 None, 

205 None, 

206 ( 

207 f"workflow with run id '{wms_workflow_id}' not found. " 

208 f"Hint: use run's submit directory as the id instead" 

209 ), 

210 ) 

211 

212 if id_type in {WmsIdType.GLOBAL, WmsIdType.LOCAL}: 

213 if not wms_path.is_dir(): 

214 return None, None, f"submit directory '{wms_path}' for run id '{wms_workflow_id}' not found." 

215 

216 _LOG.info("Restarting workflow from directory '%s'", wms_path) 

217 rescue_dags = list(wms_path.glob("*.dag.rescue*")) 

218 if not rescue_dags: 

219 return None, None, f"HTCondor rescue DAG(s) not found in '{wms_path}'" 

220 

221 _LOG.info("Verifying that the workflow is not already in the job queue") 

222 schedd_dag_info = condor_q(constraint=f'regexp("dagman$", Cmd) && Iwd == "{wms_path}"') 

223 if schedd_dag_info: 

224 _, dag_info = schedd_dag_info.popitem() 

225 _, dag_ad = dag_info.popitem() 

226 id_ = dag_ad["GlobalJobId"] 

227 return None, None, f"Workflow already in the job queue (global job id: '{id_}')" 

228 

229 _LOG.info("Checking execution status of the workflow") 

230 warn = False 

231 dag_ad = read_dag_status(str(wms_path)) 

232 if dag_ad: 

233 nodes_total = dag_ad.get("NodesTotal", 0) 

234 if nodes_total != 0: 

235 nodes_done = dag_ad.get("NodesDone", 0) 

236 if nodes_total == nodes_done: 

237 return None, None, "All jobs in the workflow finished successfully" 

238 else: 

239 warn = True 

240 else: 

241 warn = True 

242 if warn: 

243 _LOG.warning( 

244 "Cannot determine the execution status of the workflow, continuing with restart regardless" 

245 ) 

246 

247 _LOG.info("Backing up select HTCondor files from previous run attempt") 

248 htc_backup_files(wms_path, subdir="backups") 

249 

250 # For workflow portability, internal paths are all relative. Hence 

251 # the DAG needs to be resubmitted to HTCondor from inside the submit 

252 # directory. 

253 _LOG.info("Adding workflow to the job queue") 

254 run_id, run_name, message = None, None, "" 

255 with chdir(wms_path): 

256 try: 

257 dag_path = next(wms_path.glob("*.dag.condor.sub")) 

258 except StopIteration: 

259 message = f"DAGMan submit description file not found in '{wms_path}'" 

260 else: 

261 sub = htc_create_submit_from_file(dag_path.name) 

262 schedd_dag_info = htc_submit_dag(sub) 

263 

264 # Save select information about the DAGMan job to a file. Use 

265 # the run name (available in the ClassAd) as the filename. 

266 if schedd_dag_info: 

267 dag_info = next(iter(schedd_dag_info.values())) 

268 dag_ad = next(iter(dag_info.values())) 

269 write_dag_info(f"{dag_ad['bps_run']}.info.json", schedd_dag_info) 

270 run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}" 

271 run_name = dag_ad["bps_run"] 

272 else: 

273 message = "DAGMan job information unavailable" 

274 

275 return run_id, run_name, message 

276 

277 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False): 

278 """Query WMS for list of submitted WMS workflows/jobs. 

279 

280 This should be a quick lookup function to create list of jobs for 

281 other functions. 

282 

283 Parameters 

284 ---------- 

285 wms_id : `int` or `str`, optional 

286 Id or path that can be used by WMS service to look up job. 

287 user : `str`, optional 

288 User whose submitted jobs should be listed. 

289 require_bps : `bool`, optional 

290 Whether to require jobs returned in list to be bps-submitted jobs. 

291 pass_thru : `str`, optional 

292 Information to pass through to WMS. 

293 is_global : `bool`, optional 

294 If set, all job queues (and their histories) will be queried for 

295 job information. Defaults to False which means that only the local 

296 job queue will be queried. 

297 

298 Returns 

299 ------- 

300 job_ids : `list` [`Any`] 

301 Only job ids to be used by cancel and other functions. Typically 

302 this means top-level jobs (i.e., not children jobs). 

303 """ 

304 _LOG.debug( 

305 "list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s", 

306 wms_id, 

307 user, 

308 require_bps, 

309 pass_thru, 

310 is_global, 

311 ) 

312 

313 # Determine which Schedds will be queried for job information. 

314 coll = htcondor.Collector() 

315 

316 schedd_ads = [] 

317 if is_global: 

318 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

319 else: 

320 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

321 

322 # Construct appropriate constraint expression using provided arguments. 

323 constraint = "False" 

324 if wms_id is None: 

325 if user is not None: 

326 constraint = f'(Owner == "{user}")' 

327 else: 

328 schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id) 

329 if cluster_id is not None: 

330 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})" 

331 

332 # If provided id is either a submission path or a global id, 

333 # make sure the right Schedd will be queried regardless of 

334 # 'is_global' value. 

335 if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}: 

336 schedd_ads = [schedd_ad] 

337 if require_bps: 

338 constraint += ' && (bps_isjob == "True")' 

339 if pass_thru: 

340 if "-forcex" in pass_thru: 

341 pass_thru_2 = pass_thru.replace("-forcex", "") 

342 if pass_thru_2 and not pass_thru_2.isspace(): 

343 constraint += f" && ({pass_thru_2})" 

344 else: 

345 constraint += f" && ({pass_thru})" 

346 

347 # Create a list of scheduler daemons which need to be queried. 

348 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

349 

350 _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds)) 

351 results = condor_q(constraint=constraint, schedds=schedds) 

352 

353 # Prune child jobs where DAG job is in queue (i.e., aren't orphans). 

354 job_ids = [] 

355 for schedd_name, job_info in results.items(): 

356 for job_id, job_ad in job_info.items(): 

357 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None")) 

358 if "DAGManJobId" not in job_ad: 

359 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

360 else: 

361 _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0") 

362 _LOG.debug("\tin jobs.keys() = %s", job_info.keys()) 

363 if f"{job_ad['DAGManJobId']}.0" not in job_info: # orphaned job 

364 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

365 

366 _LOG.debug("job_ids = %s", job_ids) 

367 return job_ids 

368 
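# Illustrative sketch, not part of the original module: the constraint built in
# list_submitted_jobs() for a purely numeric id (the id 9876 below is made up).
#
#     wms_id = 9876, require_bps = True, no pass_thru
#     constraint = '(DAGManJobId == 9876 || ClusterId == 9876) && (bps_isjob == "True")'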

369 def report( 

370 self, 

371 wms_workflow_id=None, 

372 user=None, 

373 hist=0, 

374 pass_thru=None, 

375 is_global=False, 

376 return_exit_codes=False, 

377 ): 

378 """Return run information based upon given constraints. 

379 

380 Parameters 

381 ---------- 

382 wms_workflow_id : `str`, optional 

383 Limit to specific run based on id. 

384 user : `str`, optional 

385 Limit results to runs for this user. 

386 hist : `float`, optional 

387 Limit history search to this many days. Defaults to 0. 

388 pass_thru : `str`, optional 

389 Constraints to pass through to HTCondor. 

390 is_global : `bool`, optional 

391 If set, all job queues (and their histories) will be queried for 

392 job information. Defaults to False which means that only the local 

393 job queue will be queried. 

394 return_exit_codes : `bool`, optional 

395 If set, return exit codes related to jobs with a 

396 non-success status. Defaults to False, which means that only 

397 the summary state is returned. 

398 

399 Only applicable in the context of a WMS with associated 

400 handlers to return exit codes from jobs. 

401 

402 Returns 

403 ------- 

404 runs : `list` [`lsst.ctrl.bps.WmsRunReport`] 

405 Information about runs from given job information. 

406 message : `str` 

407 Extra message for report command to print. This could be pointers 

408 to documentation or to WMS specific commands. 

409 """ 

410 if wms_workflow_id: 

411 id_type = _wms_id_type(wms_workflow_id) 

412 if id_type == WmsIdType.LOCAL: 

413 schedulers = _locate_schedds(locate_all=is_global) 

414 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

415 elif id_type == WmsIdType.GLOBAL: 

416 schedulers = _locate_schedds(locate_all=True) 

417 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

418 elif id_type == WmsIdType.PATH: 

419 run_reports, message = _report_from_path(wms_workflow_id) 

420 else: 

421 run_reports, message = {}, "Invalid job id" 

422 else: 

423 schedulers = _locate_schedds(locate_all=is_global) 

424 run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers) 

425 _LOG.debug("report: %s, %s", run_reports, message) 

426 

427 return list(run_reports.values()), message 

428 

429 def cancel(self, wms_id, pass_thru=None): 

430 """Cancel submitted workflows/jobs. 

431 

432 Parameters 

433 ---------- 

434 wms_id : `str` 

435 Id or path of job that should be canceled. 

436 pass_thru : `str`, optional 

437 Information to pass through to WMS. 

438 

439 Returns 

440 ------- 

441 deleted : `bool` 

442 Whether the deletion was successful. Currently, if there is any doubt 

443 or any individual job was not deleted, False is returned. 

444 message : `str` 

445 Any message from WMS (e.g., error details). 

446 """ 

447 _LOG.debug("Canceling wms_id = %s", wms_id) 

448 

449 schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id) 

450 

451 if cluster_id is None: 

452 deleted = False 

453 message = "invalid id" 

454 else: 

455 _LOG.debug( 

456 "Canceling job managed by schedd_name = %s with cluster_id = %s", 

457 schedd_ad["Name"], 

458 schedd_ad["Name"], 

459 ) 

460 schedd = htcondor.Schedd(schedd_ad) 

461 

462 constraint = f"ClusterId == {cluster_id}" 

463 if pass_thru is not None and "-forcex" in pass_thru: 

464 pass_thru_2 = pass_thru.replace("-forcex", "") 

465 if pass_thru_2 and not pass_thru_2.isspace(): 

466 constraint += f"&& ({pass_thru_2})" 

467 _LOG.debug("JobAction.RemoveX constraint = %s", constraint) 

468 results = schedd.act(htcondor.JobAction.RemoveX, constraint) 

469 else: 

470 if pass_thru: 

471 constraint += f"&& ({pass_thru})" 

472 _LOG.debug("JobAction.Remove constraint = %s", constraint) 

473 results = schedd.act(htcondor.JobAction.Remove, constraint) 

474 _LOG.debug("Remove results: %s", results) 

475 

476 if results["TotalSuccess"] > 0 and results["TotalError"] == 0: 

477 deleted = True 

478 message = "" 

479 else: 

480 deleted = False 

481 if results["TotalSuccess"] == 0 and results["TotalError"] == 0: 

482 message = "no such bps job in batch queue" 

483 else: 

484 message = f"unknown problems deleting: {results}" 

485 

486 _LOG.debug("deleted: %s; message = %s", deleted, message) 

487 return deleted, message 

488 

489 

490class HTCondorWorkflow(BaseWmsWorkflow): 

491 """Single HTCondor workflow. 

492 

493 Parameters 

494 ---------- 

495 name : `str` 

496 Unique name for Workflow used when naming files. 

497 config : `lsst.ctrl.bps.BpsConfig` 

498 BPS configuration that includes necessary submit/runtime information. 

499 """ 

500 

501 def __init__(self, name, config=None): 

502 super().__init__(name, config) 

503 self.dag = None 

504 

505 @classmethod 

506 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class): 

507 # Docstring inherited 

508 htc_workflow = cls(generic_workflow.name, config) 

509 htc_workflow.dag = HTCDag(name=generic_workflow.name) 

510 

511 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs) 

512 htc_workflow.dag.add_attribs(generic_workflow.run_attrs) 

513 htc_workflow.dag.add_attribs( 

514 { 

515 "bps_wms_service": service_class, 

516 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}", 

517 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts), 

518 "bps_job_summary": create_count_summary(generic_workflow.job_counts), 

519 } 

520 ) 

521 

522 _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""}) 

523 if isinstance(tmp_template, str): 

524 subdir_template = defaultdict(lambda: tmp_template) 

525 else: 

526 subdir_template = tmp_template 

527 

528 # Create all DAG jobs 

529 site_values = {} # cache compute site specific values to reduce config lookups 

530 for job_name in generic_workflow: 

531 gwjob = generic_workflow.get_job(job_name) 

532 if gwjob.compute_site not in site_values: 

533 site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site) 

534 htc_job = _create_job( 

535 subdir_template[gwjob.label], 

536 site_values[gwjob.compute_site], 

537 generic_workflow, 

538 gwjob, 

539 out_prefix, 

540 ) 

541 htc_workflow.dag.add_job(htc_job) 

542 

543 # Add job dependencies to the DAG 

544 for job_name in generic_workflow: 

545 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name)) 

546 

547 # If final job exists in generic workflow, create DAG final job 

548 final = generic_workflow.get_final() 

549 if final and isinstance(final, GenericWorkflowJob): 

550 if final.compute_site and final.compute_site not in site_values: 

551 site_values[final.compute_site] = _gather_site_values(config, final.compute_site) 

552 final_htjob = _create_job( 

553 subdir_template[final.label], 

554 site_values[final.compute_site], 

555 generic_workflow, 

556 final, 

557 out_prefix, 

558 ) 

559 if "post" not in final_htjob.dagcmds: 

560 final_htjob.dagcmds["post"] = ( 

561 f"{os.path.dirname(__file__)}/final_post.sh {final.name} $DAG_STATUS $RETURN" 

562 ) 

563 htc_workflow.dag.add_final_job(final_htjob) 

564 elif final and isinstance(final, GenericWorkflow): 

565 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job") 

566 elif final: 

567 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})") 

568 

569 return htc_workflow 

570 

571 def write(self, out_prefix): 

572 """Output HTCondor DAGMan files needed for workflow submission. 

573 

574 Parameters 

575 ---------- 

576 out_prefix : `str` 

577 Directory prefix for HTCondor files. 

578 """ 

579 self.submit_path = out_prefix 

580 os.makedirs(out_prefix, exist_ok=True) 

581 

582 # Write down the workflow in HTCondor format. 

583 self.dag.write(out_prefix, "jobs/{self.label}") 

584 

585 

586def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix): 

587 """Convert GenericWorkflow job nodes to DAG jobs. 

588 

589 Parameters 

590 ---------- 

591 subdir_template : `str` 

592 Template for making subdirs. 

593 site_values : `dict` 

594 Site-specific values. 

595 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

596 Generic workflow that is being converted. 

597 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

598 The generic job to convert to a HTCondor job. 

599 out_prefix : `str` 

600 Directory prefix for HTCondor files. 

601 

602 Returns 

603 ------- 

604 htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob` 

605 The HTCondor job equivalent to the given generic job. 

606 """ 

607 htc_job = HTCJob(gwjob.name, label=gwjob.label) 

608 

609 curvals = defaultdict(str) 

610 curvals["label"] = gwjob.label 

611 if gwjob.tags: 

612 curvals.update(gwjob.tags) 

613 

614 subdir = subdir_template.format_map(curvals) 

615 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub" 

616 

617 htc_job_cmds = { 

618 "universe": "vanilla", 

619 "should_transfer_files": "YES", 

620 "when_to_transfer_output": "ON_EXIT_OR_EVICT", 

621 "transfer_output_files": '""', # Set to empty string to disable 

622 "transfer_executable": "False", 

623 "getenv": "True", 

624 # Exceeding memory sometimes triggers a SIGBUS or SIGSEGV error. Tell 

625 # htcondor to put on hold any jobs which exited by a signal. 

626 "on_exit_hold": "ExitBySignal == true", 

627 "on_exit_hold_reason": 'strcat("Job raised a signal ", string(ExitSignal), ". ", ' 

628 '"Handling signal as if job has gone over memory limit.")', 

629 "on_exit_hold_subcode": "34", 

630 } 

631 

632 htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob)) 

633 

634 # job stdout, stderr, htcondor user log. 

635 for key in ("output", "error", "log"): 

636 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}") 

637 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key]) 

638 

639 htc_job_cmds.update( 

640 _handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], out_prefix) 

641 ) 

642 

643 # Add the job cmds dict to the job object. 

644 htc_job.add_job_cmds(htc_job_cmds) 

645 

646 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob)) 

647 

648 # Add job attributes to job. 

649 _LOG.debug("gwjob.attrs = %s", gwjob.attrs) 

650 htc_job.add_job_attrs(gwjob.attrs) 

651 htc_job.add_job_attrs(site_values["attrs"]) 

652 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)}) 

653 htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label}) 

654 

655 return htc_job 

656 

657 

658def _translate_job_cmds(cached_vals, generic_workflow, gwjob): 

659 """Translate the job data that are one to one mapping 

660 

661 Parameters 

662 ---------- 

663 cached_vals : `dict` [`str`, `Any`] 

664 Config values common to jobs with same label. 

665 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

666 Generic workflow that contains the job being converted. 

667 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

668 Generic workflow job to be converted. 

669 

670 Returns 

671 ------- 

672 htc_job_commands : `dict` [`str`, `Any`] 

673 Contains commands which can appear in the HTCondor submit description 

674 file. 

675 """ 

676 # Values in the job script that are just name mappings. 

677 job_translation = { 

678 "mail_to": "notify_user", 

679 "when_to_mail": "notification", 

680 "request_cpus": "request_cpus", 

681 "priority": "priority", 

682 "category": "category", 

683 "accounting_group": "accounting_group", 

684 "accounting_user": "accounting_group_user", 

685 } 

686 

687 jobcmds = {} 

688 for gwkey, htckey in job_translation.items(): 

689 jobcmds[htckey] = getattr(gwjob, gwkey, None) 

690 

691 # If accounting info was not set explicitly, use site settings if any. 

692 if not gwjob.accounting_group: 

693 jobcmds["accounting_group"] = cached_vals.get("accountingGroup") 

694 if not gwjob.accounting_user: 

695 jobcmds["accounting_group_user"] = cached_vals.get("accountingUser") 

696 

697 # job commands that need modification 

698 if gwjob.number_of_retries: 

699 jobcmds["max_retries"] = f"{gwjob.number_of_retries}" 

700 

701 if gwjob.retry_unless_exit: 

702 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}" 

703 

704 if gwjob.request_disk: 

705 jobcmds["request_disk"] = f"{gwjob.request_disk}MB" 

706 

707 if gwjob.request_memory: 

708 jobcmds["request_memory"] = f"{gwjob.request_memory}" 

709 

710 if gwjob.memory_multiplier: 

711 # Do not use try-except! At the moment, BpsConfig returns an empty 

712 # string if it does not contain the key. 

713 memory_limit = cached_vals["memoryLimit"] 

714 if not memory_limit: 

715 raise RuntimeError( 

716 "Memory autoscaling enabled, but automatic detection of the memory limit " 

717 "failed; setting it explicitly with 'memoryLimit' or changing worker node " 

718 "search pattern 'executeMachinesPattern' might help." 

719 ) 

720 

721 # Set maximal amount of memory job can ask for. 

722 # 

723 # The check below assumes that 'memory_limit' was set to a value which 

724 # realistically reflects actual physical limitations of a given compute 

725 # resource. 

726 memory_max = memory_limit 

727 if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit: 

728 memory_max = gwjob.request_memory_max 

729 

730 # Make the job ask for more memory each time it fails due to insufficient 

731 # memory requirements. 

732 jobcmds["request_memory"] = _create_request_memory_expr( 

733 gwjob.request_memory, gwjob.memory_multiplier, memory_max 

734 ) 

735 

736 # Periodically release jobs which are being held due to exceeding 

737 # memory. Stop doing that (by removing the job from the HTCondor queue) 

738 # after the maximal number of retries has been reached or the job was 

739 # already run at maximal allowed memory. 

740 jobcmds["periodic_release"] = _create_periodic_release_expr( 

741 gwjob.request_memory, gwjob.memory_multiplier, memory_max 

742 ) 

743 jobcmds["periodic_remove"] = _create_periodic_remove_expr( 

744 gwjob.request_memory, gwjob.memory_multiplier, memory_max 

745 ) 

746 

747 # Assume concurrency_limit implemented using HTCondor concurrency limits. 

748 # May need to move to special site-specific implementation if sites use 

749 # other mechanisms. 

750 if gwjob.concurrency_limit: 

751 jobcmds["concurrency_limit"] = gwjob.concurrency_limit 

752 

753 # Handle command line 

754 if gwjob.executable.transfer_executable: 

755 jobcmds["transfer_executable"] = "True" 

756 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri) 

757 else: 

758 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri) 

759 

760 if gwjob.arguments: 

761 arguments = gwjob.arguments 

762 arguments = _replace_cmd_vars(arguments, gwjob) 

763 arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob) 

764 arguments = _fix_env_var_syntax(arguments) 

765 jobcmds["arguments"] = arguments 

766 

767 # Add extra "pass-thru" job commands 

768 if gwjob.profile: 

769 for key, val in gwjob.profile.items(): 

770 jobcmds[key] = htc_escape(val) 

771 for key, val in cached_vals["profile"].items(): 

772 jobcmds[key] = htc_escape(val) 

773 

774 return jobcmds 

775 
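# Illustrative sketch, not part of the original module: how the memory cap in
# _translate_job_cmds() resolves (the numbers below are made up, in MB).
#
#     memoryLimit = 16384, request_memory_max = 8192   -> memory_max = 8192
#     memoryLimit = 16384, request_memory_max = 32768  -> memory_max = 16384 (capped at the limit)
#     memoryLimit = 16384, request_memory_max unset    -> memory_max = 16384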

776 

777def _translate_dag_cmds(gwjob): 

778 """Translate job values into DAGMan commands. 

779 

780 Parameters 

781 ---------- 

782 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

783 Job containing values to be translated. 

784 

785 Returns 

786 ------- 

787 dagcmds : `dict` [`str`, `Any`] 

788 DAGMan commands for the job. 

789 """ 

790 # Values in the dag script that are just name mappings. 

791 dag_translation = {"abort_on_value": "abort_dag_on", "abort_return_value": "abort_exit"} 

792 

793 dagcmds = {} 

794 for gwkey, htckey in dag_translation.items(): 

795 dagcmds[htckey] = getattr(gwjob, gwkey, None) 

796 

797 # Still to be coded: vars "pre_cmdline", "post_cmdline" 

798 return dagcmds 

799 

800 

801def _fix_env_var_syntax(oldstr): 

802 """Change ENV place holders to HTCondor Env var syntax. 

803 

804 Parameters 

805 ---------- 

806 oldstr : `str` 

807 String in which environment variable syntax is to be fixed. 

808 

809 Returns 

810 ------- 

811 newstr : `str` 

812 Given string with environment variable syntax fixed. 

813 """ 

814 newstr = oldstr 

815 for key in re.findall(r"<ENV:([^>]+)>", oldstr): 

816 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})") 

817 return newstr 

818 
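# Illustrative sketch, not part of the original module: what the placeholder
# rewrite in _fix_env_var_syntax() does to a command line.
#
#     _fix_env_var_syntax("<ENV:HOME>/bin/run.sh --scratch <ENV:SCRATCH>")
#     # -> "$ENV(HOME)/bin/run.sh --scratch $ENV(SCRATCH)"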

819 

820def _replace_file_vars(use_shared, arguments, workflow, gwjob): 

821 """Replace file placeholders in command line arguments with correct 

822 physical file names. 

823 

824 Parameters 

825 ---------- 

826 use_shared : `bool` 

827 Whether HTCondor can assume shared filesystem. 

828 arguments : `str` 

829 Arguments string in which to replace file placeholders. 

830 workflow : `lsst.ctrl.bps.GenericWorkflow` 

831 Generic workflow that contains file information. 

832 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

833 The job corresponding to the arguments. 

834 

835 Returns 

836 ------- 

837 arguments : `str` 

838 Given arguments string with file placeholders replaced. 

839 """ 

840 # Replace input file placeholders with paths. 

841 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False): 

842 if not gwfile.wms_transfer: 

843 # Must assume full URI if in command line and told WMS is not 

844 # responsible for transferring file. 

845 uri = gwfile.src_uri 

846 elif use_shared: 

847 if gwfile.job_shared: 

848 # Have shared filesystems and jobs can share file. 

849 uri = gwfile.src_uri 

850 else: 

851 # Taking advantage of inside knowledge. Not future-proof. 

852 # Temporary fix until there is a job wrapper that pulls files 

853 # within the job. 

854 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml": 

855 uri = "butler.yaml" 

856 else: 

857 uri = os.path.basename(gwfile.src_uri) 

858 else: # Using push transfer 

859 uri = os.path.basename(gwfile.src_uri) 

860 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

861 

862 # Replace output file placeholders with paths. 

863 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False): 

864 if not gwfile.wms_transfer: 

865 # Must assume full URI if in command line and told WMS is not 

866 # responsible for transferring file. 

867 uri = gwfile.src_uri 

868 elif use_shared: 

869 if gwfile.job_shared: 

870 # Have shared filesystems and jobs can share file. 

871 uri = gwfile.src_uri 

872 else: 

873 uri = os.path.basename(gwfile.src_uri) 

874 else: # Using push transfer 

875 uri = os.path.basename(gwfile.src_uri) 

876 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

877 return arguments 

878 

879 

880def _replace_cmd_vars(arguments, gwjob): 

881 """Replace format-style placeholders in arguments. 

882 

883 Parameters 

884 ---------- 

885 arguments : `str` 

886 Arguments string in which to replace placeholders. 

887 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

888 Job containing values to be used to replace placeholders 

889 (in particular gwjob.cmdvals). 

890 

891 Returns 

892 ------- 

893 arguments : `str` 

894 Given arguments string with placeholders replaced. 

895 """ 

896 try: 

897 arguments = arguments.format(**gwjob.cmdvals) 

898 except (KeyError, TypeError): # TypeError in case None instead of {} 

899 _LOG.error( 

900 "Could not replace command variables:\narguments: %s\ncmdvals: %s", arguments, gwjob.cmdvals 

901 ) 

902 raise 

903 return arguments 

904 

905 

906def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str): 

907 """Add job input files from generic workflow to job. 

908 

909 Parameters 

910 ---------- 

911 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

912 The generic workflow (e.g., has executable name and arguments). 

913 job_name : `str` 

914 Unique name for the job. 

915 use_shared : `bool` 

916 Whether job has access to files via shared filesystem. 

917 out_prefix : `str` 

918 The root directory into which all WMS-specific files are written. 

919 

920 Returns 

921 ------- 

922 htc_commands : `dict` [`str`, `str`] 

923 HTCondor commands for the job submission script. 

924 """ 

925 htc_commands = {} 

926 inputs = [] 

927 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True): 

928 _LOG.debug("src_uri=%s", gwf_file.src_uri) 

929 

930 uri = Path(gwf_file.src_uri) 

931 

932 # Note if use_shared and job_shared, don't need to transfer file. 

933 

934 if not use_shared: # Copy file using push to job 

935 inputs.append(str(uri.relative_to(out_prefix))) 

936 elif not gwf_file.job_shared: # Jobs require own copy 

937 # If using a shared filesystem but still needing a copy in the job, use 

938 # HTCondor's curl plugin for a local copy. 

939 

940 # Execution butler is represented as a directory which the 

941 # curl plugin does not handle. Taking advantage of inside 

942 # knowledge as a temporary fix until there is a job wrapper that pulls 

943 # files within the job. 

944 if gwf_file.name == "butlerConfig": 

945 # The execution butler directory doesn't normally exist until 

946 # the submit phase so checking for suffix instead of using 

947 # is_dir(). If another non-yaml file existed, it would have a 

948 # different gwf_file.name. 

949 if uri.suffix == ".yaml": # Single file, so just copy. 

950 inputs.append(f"file://{uri}") 

951 else: 

952 inputs.append(f"file://{uri / 'butler.yaml'}") 

953 inputs.append(f"file://{uri / 'gen3.sqlite3'}") 

954 elif uri.is_dir(): 

955 raise RuntimeError( 

956 f"HTCondor plugin cannot transfer directories locally within job {gwf_file.src_uri}" 

957 ) 

958 else: 

959 inputs.append(f"file://{uri}") 

960 

961 if inputs: 

962 htc_commands["transfer_input_files"] = ",".join(inputs) 

963 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"]) 

964 return htc_commands 

965 
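# Illustrative sketch, not part of the original module: with use_shared=False
# every transfer-only input ends up in transfer_input_files as a path relative
# to the submit directory (the paths below are made up).
#
#     out_prefix = "/work/submit/run1"
#     inputs     = ["/work/submit/run1/inputs/a.qgraph", "/work/submit/run1/butler.yaml"]
#     # -> {"transfer_input_files": "inputs/a.qgraph,butler.yaml"}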

966 

967def _report_from_path(wms_path): 

968 """Gather run information from a given run directory. 

969 

970 Parameters 

971 ---------- 

972 wms_path : `str` 

973 The directory containing the submit side files (e.g., HTCondor files). 

974 

975 Returns 

976 ------- 

977 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

978 Run information for the detailed report. The key is the HTCondor id 

979 and the value is a collection of report information for that run. 

980 message : `str` 

981 Message to be printed with the summary report. 

982 """ 

983 wms_workflow_id, jobs, message = _get_info_from_path(wms_path) 

984 if wms_workflow_id == MISSING_ID: 

985 run_reports = {} 

986 else: 

987 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs) 

988 return run_reports, message 

989 

990 

991def _report_from_id(wms_workflow_id, hist, schedds=None): 

992 """Gather run information using workflow id. 

993 

994 Parameters 

995 ---------- 

996 wms_workflow_id : `str` 

997 Limit to specific run based on id. 

998 hist : `float` 

999 Limit history search to this many days. 

1000 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional 

1001 HTCondor schedulers to query for job information. If None 

1002 (default), all queries will be run against the local scheduler only. 

1003 

1004 Returns 

1005 ------- 

1006 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1007 Run information for the detailed report. The key is the HTCondor id 

1008 and the value is a collection of report information for that run. 

1009 message : `str` 

1010 Message to be printed with the summary report. 

1011 """ 

1012 messages = [] 

1013 

1014 # Collect information about the job by querying HTCondor schedd and 

1015 # HTCondor history. 

1016 schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds) 

1017 if len(schedd_dag_info) == 1: 

1018 # Extract the DAG info without altering the results of the query. 

1019 schedd_name = next(iter(schedd_dag_info)) 

1020 dag_id = next(iter(schedd_dag_info[schedd_name])) 

1021 dag_ad = schedd_dag_info[schedd_name][dag_id] 

1022 

1023 # If the provided workflow id does not correspond to the one extracted 

1024 # from the DAGMan log file in the submit directory, rerun the query 

1025 # with the id found in the file. 

1026 # 

1027 # This is to cover the situation in which the user provided the old job 

1028 # id of a restarted run. 

1029 try: 

1030 path_dag_id, path_dag_ad = read_dag_log(dag_ad["Iwd"]) 

1031 except FileNotFoundError as exc: 

1032 # At the moment missing DAGMan log is pretty much a fatal error. 

1033 # So empty the DAG info to finish early (see the if statement 

1034 # below). 

1035 schedd_dag_info.clear() 

1036 messages.append(f"Cannot create the report for '{dag_id}': {exc}") 

1037 else: 

1038 if path_dag_id != dag_id: 

1039 schedd_dag_info = _get_info_from_schedd(path_dag_id, hist, schedds) 

1040 messages.append( 

1041 f"WARNING: Found newer workflow executions in same submit directory as id '{dag_id}'. " 

1042 "This normally occurs when a run is restarted. The report shown is for the most " 

1043 f"recent status with run id '{path_dag_id}'" 

1044 ) 

1045 

1046 if len(schedd_dag_info) == 0: 

1047 run_reports = {} 

1048 elif len(schedd_dag_info) == 1: 

1049 _, dag_info = schedd_dag_info.popitem() 

1050 dag_id, dag_ad = dag_info.popitem() 

1051 

1052 # Create a mapping between jobs and their classads. The keys will 

1053 # be of format 'ClusterId.ProcId'. 

1054 job_info = {dag_id: dag_ad} 

1055 

1056 # Find jobs (nodes) belonging to that DAGMan job. 

1057 job_constraint = f"DAGManJobId == {int(float(dag_id))}" 

1058 schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds) 

1059 if schedd_job_info: 

1060 _, node_info = schedd_job_info.popitem() 

1061 job_info.update(node_info) 

1062 

1063 # Collect additional pieces of information about jobs using HTCondor 

1064 # files in the submission directory. 

1065 _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"]) 

1066 _update_jobs(job_info, path_jobs) 

1067 if message: 

1068 messages.append(message) 

1069 run_reports = _create_detailed_report_from_jobs(dag_id, job_info) 

1070 else: 

1071 ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()] 

1072 message = ( 

1073 f"More than one job matches id '{wms_workflow_id}', " 

1074 f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids" 

1075 ) 

1076 messages.append(message) 

1077 run_reports = {} 

1078 

1079 message = "\n".join(messages) 

1080 return run_reports, message 

1081 

1082 

1083def _get_info_from_schedd(wms_workflow_id, hist, schedds): 

1084 """Gather run information from HTCondor. 

1085 

1086 Parameters 

1087 ---------- 

1088 wms_workflow_id : `str` 

1089 Limit to specific run based on id. 

1090 hist : `int` 

1091 Limit history search to this many days. 

1092 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional 

1093 HTCondor schedulers to query for job information. If None 

1094 (default), all queries will be run against the local scheduler only. 

1095 

1096 Returns 

1097 ------- 

1098 schedd_dag_info : `dict` [`str`, `dict` [`str`, `dict` [`str` Any]]] 

1099 Information about jobs satisfying the search criteria where for each 

1100 Scheduler, local HTCondor job ids are mapped to their respective 

1101 classads. 

1102 """ 

1103 dag_constraint = 'regexp("dagman$", Cmd)' 

1104 try: 

1105 cluster_id = int(float(wms_workflow_id)) 

1106 except ValueError: 

1107 dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"' 

1108 else: 

1109 dag_constraint += f" && ClusterId == {cluster_id}" 

1110 

1111 # With the current implementation of the condor_* functions the query 

1112 # will always return only one match per Scheduler. 

1113 # 

1114 # Even in the highly unlikely situation where HTCondor history (which 

1115 # condor_search queries too) is long enough to have jobs from before 

1116 # the cluster ids were rolled over (and as a result there is more then 

1117 # one job with the same cluster id) they will not show up in 

1118 # the results. 

1119 schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds) 

1120 return schedd_dag_info 

1121 

1122 

1123def _get_info_from_path(wms_path): 

1124 """Gather run information from a given run directory. 

1125 

1126 Parameters 

1127 ---------- 

1128 wms_path : `str` 

1129 Directory containing HTCondor files. 

1130 

1131 Returns 

1132 ------- 

1133 wms_workflow_id : `str` 

1134 The run id which is a DAGman job id. 

1135 jobs : `dict` [`str`, `dict` [`str`, `Any`]] 

1136 Information about jobs read from files in the given directory. 

1137 The key is the HTCondor id and the value is a dictionary of HTCondor 

1138 keys and values. 

1139 message : `str` 

1140 Message to be printed with the summary report. 

1141 """ 

1142 messages = [] 

1143 try: 

1144 wms_workflow_id, jobs = read_dag_log(wms_path) 

1145 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs) 

1146 _update_jobs(jobs, read_node_status(wms_path)) 

1147 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs) 

1148 

1149 # Add more info for DAGman job 

1150 job = jobs[wms_workflow_id] 

1151 job.update(read_dag_status(wms_path)) 

1152 

1153 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs) 

1154 if "bps_run" not in job: 

1155 _add_run_info(wms_path, job) 

1156 

1157 message = htc_check_dagman_output(wms_path) 

1158 if message: 

1159 messages.append(message) 

1160 _LOG.debug( 

1161 "_get_info: id = %s, total_jobs = %s", wms_workflow_id, jobs[wms_workflow_id]["total_jobs"] 

1162 ) 

1163 

1164 # Add extra pieces of information which cannot be found in HTCondor 

1165 # generated files like 'GlobalJobId'. 

1166 # 

1167 # Do not treat absence of this file as a serious error. Neither runs 

1168 # submitted with earlier versions of the plugin nor the runs submitted 

1169 # with Pegasus plugin will have it at the moment. However, once enough 

1170 # time passes and Pegasus plugin will have its own report() method 

1171 # (instead of sneakily using HTCondor's one), the lack of that file 

1172 # should be treated as seriously as lack of any other file. 

1173 try: 

1174 job_info = read_dag_info(wms_path) 

1175 except FileNotFoundError as exc: 

1176 message = f"Warn: Some information may not be available: {exc}" 

1177 messages.append(message) 

1178 else: 

1179 schedd_name = next(iter(job_info)) 

1180 job_ad = next(iter(job_info[schedd_name].values())) 

1181 job.update(job_ad) 

1182 except FileNotFoundError: 

1183 message = f"Could not find HTCondor files in '{wms_path}'" 

1184 _LOG.warning(message) 

1185 messages.append(message) 

1186 wms_workflow_id = MISSING_ID 

1187 jobs = {} 

1188 

1189 message = "\n".join([msg for msg in messages if msg]) 

1190 return wms_workflow_id, jobs, message 

1191 

1192 

1193def _create_detailed_report_from_jobs(wms_workflow_id, jobs): 

1194 """Gather run information to be used in generating summary reports. 

1195 

1196 Parameters 

1197 ---------- 

1198 wms_workflow_id : `str` 

1199 The run id to create the report for. 

1200 jobs : `dict` [`str`, `dict` [`str`, Any]] 

1201 Mapping HTCondor job id to job information. 

1202 

1203 Returns 

1204 ------- 

1205 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1206 Run information for the detailed report. The key is the given HTCondor 

1207 id and the value is a collection of report information for that run. 

1208 """ 

1209 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id]) 

1210 dag_job = jobs.pop(wms_workflow_id) 

1211 report = WmsRunReport( 

1212 wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}", 

1213 global_wms_id=dag_job.get("GlobalJobId", "MISS"), 

1214 path=dag_job["Iwd"], 

1215 label=dag_job.get("bps_job_label", "MISS"), 

1216 run=dag_job.get("bps_run", "MISS"), 

1217 project=dag_job.get("bps_project", "MISS"), 

1218 campaign=dag_job.get("bps_campaign", "MISS"), 

1219 payload=dag_job.get("bps_payload", "MISS"), 

1220 operator=_get_owner(dag_job), 

1221 run_summary=_get_run_summary(dag_job), 

1222 state=_htc_status_to_wms_state(dag_job), 

1223 jobs=[], 

1224 total_number_jobs=dag_job["total_jobs"], 

1225 job_state_counts=dag_job["state_counts"], 

1226 exit_code_summary=_get_exit_code_summary(jobs), 

1227 ) 

1228 

1229 for job_id, job_info in jobs.items(): 

1230 try: 

1231 job_report = WmsJobReport( 

1232 wms_id=job_id, 

1233 name=job_info.get("DAGNodeName", job_id), 

1234 label=job_info.get("bps_job_label", pegasus_name_to_label(job_info["DAGNodeName"])), 

1235 state=_htc_status_to_wms_state(job_info), 

1236 ) 

1237 if job_report.label == "init": 

1238 job_report.label = "pipetaskInit" 

1239 report.jobs.append(job_report) 

1240 except KeyError as ex: 

1241 _LOG.error("Job missing key '%s': %s", str(ex), job_info) 

1242 raise 

1243 

1244 # Add the removed entry to restore the original content of the dictionary. 

1245 # The ordering of keys will be changed permanently though. 

1246 jobs.update({wms_workflow_id: dag_job}) 

1247 

1248 run_reports = {report.wms_id: report} 

1249 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports) 

1250 return run_reports 

1251 

1252 

1253def _summary_report(user, hist, pass_thru, schedds=None): 

1254 """Gather run information to be used in generating summary reports. 

1255 

1256 Parameters 

1257 ---------- 

1258 user : `str` 

1259 Run lookup restricted to given user. 

1260 hist : `float` 

1261 How many previous days to search for run information. 

1262 pass_thru : `str` 

1263 Advanced users can define the HTCondor constraint to be used 

1264 when searching queue and history. 

1265 

1266 Returns 

1267 ------- 

1268 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1269 Run information for the summary report. The keys are HTCondor ids and 

1270 the values are collections of report information for each run. 

1271 message : `str` 

1272 Message to be printed with the summary report. 

1273 """ 

1274 # Only doing a summary report, so only look for DAGMan jobs. 

1275 if pass_thru: 

1276 constraint = pass_thru 

1277 else: 

1278 # Notes: 

1279 # * bps_isjob == 'True' isn't getting set for DAG jobs that are 

1280 # manually restarted. 

1281 # * Any job with DAGManJobID isn't a DAG job 

1282 constraint = 'bps_isjob == "True" && JobUniverse == 7' 

1283 if user: 

1284 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")' 

1285 

1286 job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds) 

1287 

1288 # Have list of DAGMan jobs, need to get run_report info. 

1289 run_reports = {} 

1290 for jobs in job_info.values(): 

1291 for job_id, job in jobs.items(): 

1292 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1293 # If we didn't get it from the queue information (e.g., Kerberos bug), 

1294 # try reading from file. 

1295 if total_jobs == 0: 

1296 try: 

1297 job.update(read_dag_status(job["Iwd"])) 

1298 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1299 except StopIteration: 

1300 pass # don't kill the report if we can't find HTCondor files 

1301 

1302 if "bps_run" not in job: 

1303 _add_run_info(job["Iwd"], job) 

1304 report = WmsRunReport( 

1305 wms_id=job_id, 

1306 global_wms_id=job["GlobalJobId"], 

1307 path=job["Iwd"], 

1308 label=job.get("bps_job_label", "MISS"), 

1309 run=job.get("bps_run", "MISS"), 

1310 project=job.get("bps_project", "MISS"), 

1311 campaign=job.get("bps_campaign", "MISS"), 

1312 payload=job.get("bps_payload", "MISS"), 

1313 operator=_get_owner(job), 

1314 run_summary=_get_run_summary(job), 

1315 state=_htc_status_to_wms_state(job), 

1316 jobs=[], 

1317 total_number_jobs=total_jobs, 

1318 job_state_counts=state_counts, 

1319 ) 

1320 run_reports[report.global_wms_id] = report 

1321 

1322 return run_reports, "" 

1323 

1324 

1325def _add_run_info(wms_path, job): 

1326 """Find BPS run information elsewhere for runs without bps attributes. 

1327 

1328 Parameters 

1329 ---------- 

1330 wms_path : `str` 

1331 Path to submit files for the run. 

1332 job : `dict` [`str`, `Any`] 

1333 HTCondor dag job information. 

1334 

1335 Raises 

1336 ------ 

1337 StopIteration 

1338 If the file it is looking for cannot be found. Permission errors are 

1339 caught and the job's run is marked with an error. 

1340 """ 

1341 path = Path(wms_path) / "jobs" 

1342 try: 

1343 subfile = next(path.glob("**/*.sub")) 

1344 except (StopIteration, PermissionError): 

1345 job["bps_run"] = "Unavailable" 

1346 else: 

1347 _LOG.debug("_add_run_info: subfile = %s", subfile) 

1348 try: 

1349 with open(subfile, encoding="utf-8") as fh: 

1350 for line in fh: 

1351 if line.startswith("+bps_"): 

1352 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line) 

1353 if m: 

1354 _LOG.debug("Matching line: %s", line) 

1355 job[m.group(1)] = m.group(2).replace('"', "") 

1356 else: 

1357 _LOG.debug("Could not parse attribute: %s", line) 

1358 except PermissionError: 

1359 job["bps_run"] = "PermissionError" 

1360 _LOG.debug("After adding job = %s", job) 

1361 

1362 

1363def _get_owner(job): 

1364 """Get the owner of a dag job. 

1365 

1366 Parameters 

1367 ---------- 

1368 job : `dict` [`str`, `Any`] 

1369 HTCondor dag job information. 

1370 

1371 Returns 

1372 ------- 

1373 owner : `str` 

1374 Owner of the dag job. 

1375 """ 

1376 owner = job.get("bps_operator", None) 

1377 if not owner: 

1378 owner = job.get("Owner", None) 

1379 if not owner: 

1380 _LOG.warning("Could not get Owner from htcondor job: %s", job) 

1381 owner = "MISS" 

1382 return owner 

1383 

1384 

1385def _get_run_summary(job): 

1386 """Get the run summary for a job. 

1387 

1388 Parameters 

1389 ---------- 

1390 job : `dict` [`str`, `Any`] 

1391 HTCondor dag job information. 

1392 

1393 Returns 

1394 ------- 

1395 summary : `str` 

1396 Number of jobs per PipelineTask label in approximate pipeline order. 

1397 Format: <label>:<count>[;<label>:<count>]+ 

1398 """ 

1399 summary = job.get("bps_job_summary", job.get("bps_run_summary", None)) 

1400 if not summary: 

1401 summary, _ = summary_from_dag(job["Iwd"]) 

1402 if not summary: 

1403 _LOG.warning("Could not get run summary for htcondor job: %s", job) 

1404 _LOG.debug("_get_run_summary: summary=%s", summary) 

1405 

1406 # Work around the summary sometimes using init vs pipetaskInit. 

1407 summary = summary.replace("init:", "pipetaskInit:") 

1408 

1409 if "pegasus_version" in job and "pegasus" not in summary: 

1410 summary += ";pegasus:0" 

1411 

1412 return summary 

1413 

1414 

1415def _get_exit_code_summary(jobs): 

1416 """Get the exit code summary for a run. 

1417 

1418 Parameters 

1419 ---------- 

1420 jobs : `dict` [`str`, `dict` [`str`, Any]] 

1421 Mapping HTCondor job id to job information. 

1422 

1423 Returns 

1424 ------- 

1425 summary : `dict` [`str`, `list` [`int`]] 

1426 Jobs' exit codes per job label. 

1427 """ 

1428 summary = {} 

1429 for job_id, job_ad in jobs.items(): 

1430 job_label = job_ad["bps_job_label"] 

1431 summary.setdefault(job_label, []) 

1432 try: 

1433 exit_code = 0 

1434 job_status = job_ad["JobStatus"] 

1435 match job_status: 

1436 case JobStatus.COMPLETED | JobStatus.HELD: 

1437 exit_code = job_ad["ExitSignal"] if job_ad["ExitBySignal"] else job_ad["ExitCode"] 

1438 case ( 

1439 JobStatus.IDLE 

1440 | JobStatus.RUNNING 

1441 | JobStatus.REMOVED 

1442 | JobStatus.TRANSFERRING_OUTPUT 

1443 | JobStatus.SUSPENDED 

1444 ): 

1445 pass 

1446 case _: 

1447 _LOG.debug("Unknown 'JobStatus' value ('%d') in classad for job '%s'", job_status, job_id) 

1448 if exit_code != 0: 

1449 summary[job_label].append(exit_code) 

1450 except KeyError as ex: 

1451 _LOG.debug("Attribute '%s' not found in the classad for job '%s'", ex, job_id) 

1452 return summary 

1453 

1454 

1455def _get_state_counts_from_jobs(wms_workflow_id, jobs): 

1456 """Count number of jobs per WMS state. 

1457 

1458 Parameters 

1459 ---------- 

1460 wms_workflow_id : `str` 

1461 HTCondor job id. 

1462 jobs : `dict` [`str`, `Any`] 

1463 HTCondor dag job information. 

1464 

1465 Returns 

1466 ------- 

1467 total_count : `int` 

1468 Total number of dag nodes. 

1469 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1470 Keys are the different WMS states and values are counts of jobs 

1471 that are in that WMS state. 

1472 """ 

1473 state_counts = dict.fromkeys(WmsStates, 0) 

1474 

1475 for jid, jinfo in jobs.items(): 

1476 if jid != wms_workflow_id: 

1477 state_counts[_htc_status_to_wms_state(jinfo)] += 1 

1478 

1479 total_counted = sum(state_counts.values()) 

1480 if "NodesTotal" in jobs[wms_workflow_id]: 

1481 total_count = jobs[wms_workflow_id]["NodesTotal"] 

1482 else: 

1483 total_count = total_counted 

1484 

1485 state_counts[WmsStates.UNREADY] += total_count - total_counted 

1486 

1487 return total_count, state_counts 

1488 

1489 

1490def _get_state_counts_from_dag_job(job): 

1491 """Count number of jobs per WMS state. 

1492 

1493 Parameters 

1494 ---------- 

1495 job : `dict` [`str`, `Any`] 

1496 HTCondor dag job information. 

1497 

1498 Returns 

1499 ------- 

1500 total_count : `int` 

1501 Total number of dag nodes. 

1502 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1503 Keys are the different WMS states and values are counts of jobs 

1504 that are in that WMS state. 

1505 """ 

1506 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job)) 

1507 state_counts = dict.fromkeys(WmsStates, 0) 

1508 if "DAG_NodesReady" in job: 

1509 state_counts = { 

1510 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0), 

1511 WmsStates.READY: job.get("DAG_NodesReady", 0), 

1512 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1513 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0), 

1514 WmsStates.FAILED: job.get("DAG_NodesFailed", 0), 

1515 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0), 

1516 } 

1517 total_jobs = job.get("DAG_NodesTotal") 

1518 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs) 

1519 elif "NodesFailed" in job: 

1520 state_counts = { 

1521 WmsStates.UNREADY: job.get("NodesUnready", 0), 

1522 WmsStates.READY: job.get("NodesReady", 0), 

1523 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1524 WmsStates.SUCCEEDED: job.get("NodesDone", 0), 

1525 WmsStates.FAILED: job.get("NodesFailed", 0), 

1526 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0), 

1527 } 

1528 try: 

1529 total_jobs = job.get("NodesTotal") 

1530 except KeyError as ex: 

1531 _LOG.error("Job missing %s. job = %s", str(ex), job) 

1532 raise 

1533 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs) 

1534 else: 

1535 # With Kerberos job auth and the Kerberos bug, a warning would be printed 

1536 # for every DAG. 

1537 _LOG.debug("Can't get job state counts %s", job["Iwd"]) 

1538 total_jobs = 0 

1539 

1540 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts) 

1541 return total_jobs, state_counts 

1542 

1543 

1544def _htc_status_to_wms_state(job): 

1545 """Convert HTCondor job or node status to generic wms state. 

1546 

1547 Parameters 

1548 ---------- 

1549 job : `dict` [`str`, `Any`] 

1550 HTCondor job information. 

1551 

1552 Returns 

1553 ------- 

1554 wms_state : `lsst.ctrl.bps.WmsStates` 

1555 The equivalent WmsState to given job's status. 

1556 """ 

1557 wms_state = WmsStates.MISFIT 

1558 if "JobStatus" in job: 

1559 wms_state = _htc_job_status_to_wms_state(job) 

1560 elif "NodeStatus" in job: 

1561 wms_state = _htc_node_status_to_wms_state(job) 

1562 return wms_state 

1563 

1564 

1565def _htc_job_status_to_wms_state(job): 

1566 """Convert HTCondor job status to generic wms state. 

1567 

1568 Parameters 

1569 ---------- 

1570 job : `dict` [`str`, `Any`] 

1571 HTCondor job information. 

1572 

1573 Returns 

1574 ------- 

1575 wms_state : `lsst.ctrl.bps.WmsStates` 

1576 The equivalent WmsState to given job's status. 

1577 """ 

1578 _LOG.debug( 

1579 "htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], type(job["JobStatus"]) 

1580 ) 

1581 job_status = int(job["JobStatus"]) 

1582 wms_state = WmsStates.MISFIT 

1583 

1584 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status) 

1585 if job_status == JobStatus.IDLE: 

1586 wms_state = WmsStates.PENDING 

1587 elif job_status == JobStatus.RUNNING: 

1588 wms_state = WmsStates.RUNNING 

1589 elif job_status == JobStatus.REMOVED: 

1590 wms_state = WmsStates.DELETED 

1591 elif job_status == JobStatus.COMPLETED: 

1592 if ( 

1593 job.get("ExitBySignal", False) 

1594 or job.get("ExitCode", 0) 

1595 or job.get("ExitSignal", 0) 

1596 or job.get("DAG_Status", 0) 

1597 ): 

1598 wms_state = WmsStates.FAILED 

1599 else: 

1600 wms_state = WmsStates.SUCCEEDED 

1601 elif job_status == JobStatus.HELD: 

1602 wms_state = WmsStates.HELD 

1603 

1604 return wms_state 
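# Illustrative example (editor's sketch, not part of the original module):
# a COMPLETED job maps to SUCCEEDED only if it exited cleanly; a non-zero
# exit code, exit signal, or DAG status marks it as FAILED.
#
#     job = {"ClusterId": 9876, "JobStatus": JobStatus.COMPLETED, "ExitCode": 1}
#     _htc_job_status_to_wms_state(job)   # -> WmsStates.FAILED
#     job["ExitCode"] = 0
#     _htc_job_status_to_wms_state(job)   # -> WmsStates.SUCCEEDED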

1605 

1606 

1607def _htc_node_status_to_wms_state(job): 

1608 """Convert HTCondor node status to generic wms state. 

1609 

1610 Parameters 

1611 ---------- 

1612 job : `dict` [`str`, `Any`] 

1613 HTCondor job information. 

1614 

1615 Returns 

1616 ------- 

1617 wms_state : `lsst.ctrl.bps.WmsStates` 

1618 The equivalent WmsState to given node's status. 

1619 """ 

1620 wms_state = WmsStates.MISFIT 

1621 

1622 status = job["NodeStatus"] 

1623 if status == NodeStatus.NOT_READY: 

1624 wms_state = WmsStates.UNREADY 

1625 elif status == NodeStatus.READY: 

1626 wms_state = WmsStates.READY 

1627 elif status == NodeStatus.PRERUN: 

1628 wms_state = WmsStates.MISFIT 

1629 elif status == NodeStatus.SUBMITTED: 

1630 if job["JobProcsHeld"]: 

1631 wms_state = WmsStates.HELD 

1632 elif job["StatusDetails"] == "not_idle": 

1633 wms_state = WmsStates.RUNNING 

1634 elif job["JobProcsQueued"]: 

1635 wms_state = WmsStates.PENDING 

1636 elif status == NodeStatus.POSTRUN: 

1637 wms_state = WmsStates.MISFIT 

1638 elif status == NodeStatus.DONE: 

1639 wms_state = WmsStates.SUCCEEDED 

1640 elif status == NodeStatus.ERROR: 

1641 # Use the job's exit status instead of the POST script's exit status. 

1642 if "DAGMAN error 0" in job["StatusDetails"]: 

1643 wms_state = WmsStates.SUCCEEDED 

1644 else: 

1645 wms_state = WmsStates.FAILED 

1646 

1647 return wms_state 
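# Illustrative example (editor's sketch, not part of the original module):
# a SUBMITTED node is classified further using the held/queued counters and
# the status details, while an ERROR node whose status details report
# "DAGMAN error 0" still counts as SUCCEEDED.
#
#     node = {
#         "NodeStatus": NodeStatus.SUBMITTED,
#         "JobProcsHeld": 0,
#         "JobProcsQueued": 0,
#         "StatusDetails": "not_idle",
#     }
#     _htc_node_status_to_wms_state(node)  # -> WmsStates.RUNNING
#
#     node = {"NodeStatus": NodeStatus.ERROR, "StatusDetails": "DAGMAN error 0"}
#     _htc_node_status_to_wms_state(node)  # -> WmsStates.SUCCEEDED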

1648 

1649 

1650def _update_jobs(jobs1, jobs2): 

1651 """Update jobs1 with info in jobs2. 

1652 

1653 (Basically an update for nested dictionaries.) 

1654 

1655 Parameters 

1656 ---------- 

1657 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]] 

1658 HTCondor job information to be updated. 

1659 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]] 

1660 Additional HTCondor job information. 

1661 """ 

1662 for jid, jinfo in jobs2.items(): 

1663 if jid in jobs1: 

1664 jobs1[jid].update(jinfo) 

1665 else: 

1666 jobs1[jid] = jinfo 
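# Illustrative example (editor's sketch, not part of the original module):
# entries present in both mappings are merged, new entries are added.
#
#     jobs1 = {"1.0": {"JobStatus": 2}}
#     jobs2 = {"1.0": {"ExitCode": 0}, "2.0": {"JobStatus": 1}}
#     _update_jobs(jobs1, jobs2)
#     # jobs1 == {"1.0": {"JobStatus": 2, "ExitCode": 0}, "2.0": {"JobStatus": 1}}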

1667 

1668 

1669def _wms_id_type(wms_id): 

1670 """Determine the type of the WMS id. 

1671 

1672 Parameters 

1673 ---------- 

1674 wms_id : `str` 

1675 WMS id identifying a job. 

1676 

1677 Returns 

1678 ------- 

1679 id_type : `lsst.ctrl.bps.htcondor.WmsIdType` 

1680 Type of WMS id. 

1681 """ 

1682 try: 

1683 int(float(wms_id)) 

1684 except ValueError: 

1685 wms_path = Path(wms_id) 

1686 if wms_path.is_dir(): 

1687 id_type = WmsIdType.PATH 

1688 else: 

1689 id_type = WmsIdType.GLOBAL 

1690 except TypeError: 

1691 id_type = WmsIdType.UNKNOWN 

1692 else: 

1693 id_type = WmsIdType.LOCAL 

1694 return id_type 
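# Illustrative example (editor's sketch, not part of the original module;
# the directory is assumed to exist and the global job id is a made-up value
# in HTCondor's "<schedd>#<cluster>.<proc>#<timestamp>" format):
#
#     _wms_id_type("1234")      # -> WmsIdType.LOCAL (ClusterId)
#     _wms_id_type("1234.0")    # -> WmsIdType.LOCAL (ClusterId.ProcId)
#     _wms_id_type("/tmp")      # -> WmsIdType.PATH (an existing directory)
#     _wms_id_type("schedd01#1234.0#1700000000")  # -> WmsIdType.GLOBAL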

1695 

1696 

1697def _wms_id_to_cluster(wms_id): 

1698 """Convert WMS id to cluster id. 

1699 

1700 Parameters 

1701 ---------- 

1702 wms_id : `int` or `float` or `str` 

1703 HTCondor job id or path. 

1704 

1705 Returns 

1706 ------- 

1707 schedd_ad : `classad.ClassAd` 

1708 ClassAd describing the scheduler managing the job with the given id. 

1709 cluster_id : `int` 

1710 HTCondor cluster id. 

1711 id_type : `lsst.ctrl.bps.htcondor.WmsIdType` 

1712 The type of the provided id. 

1713 """ 

1714 coll = htcondor.Collector() 

1715 

1716 schedd_ad = None 

1717 cluster_id = None 

1718 id_type = _wms_id_type(wms_id) 

1719 if id_type == WmsIdType.LOCAL: 

1720 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1721 cluster_id = int(float(wms_id)) 

1722 elif id_type == WmsIdType.GLOBAL: 

1723 constraint = f'GlobalJobId == "{wms_id}"' 

1724 schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)} 

1725 schedds = {name: htcondor.Schedd(ad) for name, ad in schedd_ads.items()} 

1726 job_info = condor_q(constraint=constraint, schedds=schedds) 

1727 if job_info: 

1728 schedd_name, job_rec = job_info.popitem() 

1729 job_id, _ = job_rec.popitem() 

1730 schedd_ad = schedd_ads[schedd_name] 

1731 cluster_id = int(float(job_id)) 

1732 elif id_type == WmsIdType.PATH: 

1733 try: 

1734 job_info = read_dag_info(wms_id) 

1735 except (FileNotFoundError, PermissionError, OSError): 

1736 pass 

1737 else: 

1738 schedd_name, job_rec = job_info.popitem() 

1739 job_id, _ = job_rec.popitem() 

1740 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name) 

1741 cluster_id = int(float(job_id)) 

1742 else: 

1743 pass 

1744 return schedd_ad, cluster_id, id_type 

1745 

1746 

1747def _wms_id_to_dir(wms_id): 

1748 """Convert WMS id to a submit directory candidate. 

1749 

1750 The function does not check if the directory exists or if it is a valid 

1751 BPS submit directory. 

1752 

1753 Parameters 

1754 ---------- 

1755 wms_id : `int` or `float` or `str` 

1756 HTCondor job id or path. 

1757 

1758 Returns 

1759 ------- 

1760 wms_path : `pathlib.Path` or None 

1761 Submit directory candidate for the run with the given job id. If no 

1762 directory can be associated with the provided WMS id, it will be set 

1763 to None. 

1764 id_type : `lsst.ctrl.bps.htcondor.WmsIdType` 

1765 The type of the provided id. 

1766 

1767 Raises 

1768 ------ 

1769 TypeError 

1770 Raised if the provided WMS id is of an invalid type. 

1771 """ 

1772 coll = htcondor.Collector() 

1773 schedd_ads = [] 

1774 

1775 constraint = None 

1776 wms_path = None 

1777 id_type = _wms_id_type(wms_id) 

1778 match id_type: 

1779 case WmsIdType.LOCAL: 

1780 constraint = f"ClusterId == {int(float(wms_id))}" 

1781 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

1782 case WmsIdType.GLOBAL: 

1783 constraint = f'GlobalJobId == "{wms_id}"' 

1784 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

1785 case WmsIdType.PATH: 

1786 wms_path = Path(wms_id) 

1787 case WmsIdType.UNKNOWN: 

1788 raise TypeError(f"Invalid job id type: {wms_id}") 

1789 if constraint is not None: 

1790 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

1791 job_info = condor_history(constraint=constraint, schedds=schedds, projection=["Iwd"]) 

1792 if job_info: 

1793 _, job_rec = job_info.popitem() 

1794 _, job_ad = job_rec.popitem() 

1795 wms_path = Path(job_ad["Iwd"]) 

1796 return wms_path, id_type 

1797 

1798 

1799def _create_periodic_release_expr(memory, multiplier, limit): 

1800 """Construct an HTCondor ClassAd expression for releasing held jobs. 

1801 

1802 The expression instructs HTCondor to release back to the job queue any job 

1803 which was put on hold due to exceeding memory requirements, provided it 

1804 satisfies all of the conditions below: 

1805 

1806 * the number of run attempts did not reach the allowable number of retries, 

1807 * the memory requirements in the last failed run attempt did not reach 

1808 the specified memory limit. 

1809 

1810 Parameters 

1811 ---------- 

1812 memory : `int` 

1813 Requested memory in MB. 

1814 multiplier : `float` 

1815 Memory growth rate between retries. 

1816 limit : `int` 

1817 Memory limit in MB. 

1818 

1819 Returns 

1820 ------- 

1821 expr : `str` 

1822 A string representing an HTCondor ClassAd expression for releasing jobs 

1823 which have been held due to exceeding the memory requirements. 

1824 """ 

1825 is_retry_allowed = "NumJobStarts <= JobMaxRetries" 

1826 was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}" 

1827 

1828 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1829 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1830 # The special comparison operators ensure that all comparisons below will 

1831 # evaluate to FALSE in this case. 

1832 # 

1833 # Note: 

1834 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1835 # the entire expression should evaluate to FALSE when the job is not HELD. 

1836 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1837 # but better safe than sorry. 

1838 was_mem_exceeded = ( 

1839 "JobStatus == 5 " 

1840 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " 

1841 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1842 ) 

1843 

1844 expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}" 

1845 return expr 
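# Illustrative example (editor's sketch, not part of the original module):
# for a job requesting 2048 MB with a 2.0 growth rate and an 8192 MB limit,
# the resulting release expression is (wrapped here for readability):
#
#     _create_periodic_release_expr(2048, 2.0, 8192)
#     # JobStatus == 5 && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#     #   || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#     # && NumJobStarts <= JobMaxRetries
#     # && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) < 8192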

1846 

1847 

1848def _create_periodic_remove_expr(memory, multiplier, limit): 

1849 """Construct an HTCondor ClassAd expression for removing jobs from the queue. 

1850 

1851 The expression instructs HTCondor to remove from the job queue any job 

1852 which was put on hold due to exceeding memory requirements, provided it 

1853 satisfies any of the conditions below: 

1854 

1855 * the allowable number of retries was reached, 

1856 * the memory requirements during the last failed run attempt reached 

1857 the specified memory limit. 

1858 

1859 Parameters 

1860 ---------- 

1861 memory : `int` 

1862 Requested memory in MB. 

1863 multiplier : `float` 

1864 Memory growth rate between retries. 

1865 limit : `int` 

1866 Memory limit in MB. 

1867 

1868 Returns 

1869 ------- 

1870 expr : `str` 

1871 A string representing an HTCondor ClassAd expression for removing jobs 

1872 which were run at the maximal allowable memory and still exceeded 

1873 the memory requirements. 

1874 """ 

1875 is_retry_disallowed = "NumJobStarts > JobMaxRetries" 

1876 was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}" 

1877 

1878 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1879 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1880 # The special comparison operators ensure that all comparisons below will 

1881 # evaluate to FALSE in this case. 

1882 # 

1883 # Note: 

1884 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1885 # the entire expression should evaluate to FALSE when the job is not HELD. 

1886 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1887 # but better safe than sorry. 

1888 was_mem_exceeded = ( 

1889 "JobStatus == 5 " 

1890 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " 

1891 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1892 ) 

1893 

1894 expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})" 

1895 return expr 
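# Illustrative example (editor's sketch, not part of the original module):
# with the same memory=2048, multiplier=2.0, limit=8192 values, the removal
# expression fires once retries are exhausted or the cap was already used
# (wrapped here for readability):
#
#     _create_periodic_remove_expr(2048, 2.0, 8192)
#     # JobStatus == 5 && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#     #   || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#     # && (NumJobStarts > JobMaxRetries
#     #   || min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) == 8192)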

1896 

1897 

1898def _create_request_memory_expr(memory, multiplier, limit): 

1899 """Construct an HTCondor ClassAd expression for safe memory scaling. 

1900 

1901 Parameters 

1902 ---------- 

1903 memory : `int` 

1904 Requested memory in MB. 

1905 multiplier : `float` 

1906 Memory growth rate between retries. 

1907 limit : `int` 

1908 Memory limit in MB. 

1909 

1910 Returns 

1911 ------- 

1912 expr : `str` 

1913 A string representing an HTCondor ClassAd expression enabling safe 

1914 memory scaling between job retries. 

1915 """ 

1916 # The check whether the job was held due to exceeding memory requirements 

1917 # is made *after* the job has been released back to the job queue (i.e. it 

1918 # is in the IDLE state), hence the need to use `Last*` job ClassAds instead 

1919 # of the ones describing the job's current state. 

1920 # 

1921 # Also, 'Last*' job ClassAds attributes are UNDEFINED when a job is 

1922 # initially put in the job queue. The special comparison operators ensure 

1923 # that all comparisons below will evaluate to FALSE in this case. 

1924 was_mem_exceeded = ( 

1925 "LastJobStatus =?= 5 " 

1926 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " 

1927 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)" 

1928 ) 

1929 

1930 # If the job runs for the first time or was held for reasons other than 

1931 # exceeding the memory, set the required memory to the requested value or 

1932 # to the memory usage measured by HTCondor (MemoryUsage), whichever is 

1933 # greater. 

1934 expr = ( 

1935 f"({was_mem_exceeded}) " 

1936 f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) " 

1937 f": max({{{memory}, MemoryUsage ?: 0}})" 

1938 ) 

1939 return expr 
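# Illustrative example (editor's sketch, not part of the original module):
# with memory=2048, multiplier=2.0, limit=8192 the expression doubles the
# request after each memory-related hold and caps it at the limit
# (2048 -> 4096 -> 8192 -> 8192 ...); wrapped here for readability:
#
#     _create_request_memory_expr(2048, 2.0, 8192)
#     # (LastJobStatus =?= 5
#     #   && (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0
#     #     || LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34))
#     # ? min({int(2048 * pow(2.0, NumJobStarts)), 8192})
#     # : max({2048, MemoryUsage ?: 0})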

1940 

1941 

1942def _locate_schedds(locate_all=False): 

1943 """Locate Scheduler daemons in an HTCondor pool. 

1944 

1945 Parameters 

1946 ---------- 

1947 locate_all : `bool`, optional 

1948 If True, all available Schedulers in the HTCondor pool will be located. 

1949 False by default, which means that the search is limited to the 

1950 Scheduler running on the local host. 

1951 

1952 Returns 

1953 ------- 

1954 schedds : `dict` [`str`, `htcondor.Schedd`] 

1955 A mapping between Scheduler names and Python objects allowing for 

1956 interacting with them. 

1957 """ 

1958 coll = htcondor.Collector() 

1959 

1960 schedd_ads = [] 

1961 if locate_all: 

1962 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

1963 else: 

1964 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

1965 return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 
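# Illustrative usage (editor's sketch, not part of the original module;
# requires access to a running HTCondor pool, and the hostname below is
# hypothetical):
#
#     schedds = _locate_schedds()                    # local Schedd only
#     all_schedds = _locate_schedds(locate_all=True)
#     # e.g. {"schedd01.example.com": <htcondor.Schedd>, ...}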

1966 

1967 

1968def _gather_site_values(config, compute_site): 

1969 """Gather values specific to the given site. 

1970 

1971 Parameters 

1972 ---------- 

1973 config : `lsst.ctrl.bps.BpsConfig` 

1974 BPS configuration that includes necessary submit/runtime 

1975 information. 

1976 compute_site : `str` 

1977 Compute site name. 

1978 

1979 Returns 

1980 ------- 

1981 site_values : `dict` [`str`, `Any`] 

1982 Values specific to the given site. 

1983 """ 

1984 site_values = {"attrs": {}, "profile": {}} 

1985 search_opts = {} 

1986 if compute_site: 

1987 search_opts["curvals"] = {"curr_site": compute_site} 

1988 

1989 # Determine the hard limit for the memory requirement. 

1990 found, limit = config.search("memoryLimit", opt=search_opts) 

1991 if not found: 

1992 search_opts["default"] = DEFAULT_HTC_EXEC_PATT 

1993 _, patt = config.search("executeMachinesPattern", opt=search_opts) 

1994 del search_opts["default"] 

1995 

1996 # To reduce the amount of data, ignore dynamic slots (if any) as, 

1997 # by definition, they cannot have more memory than 

1998 # the partitionable slot they are the part of. 

1999 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)' 

2000 pool_info = condor_status(constraint=constraint) 

2001 try: 

2002 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values()) 

2003 except ValueError: 

2004 _LOG.debug("No execute machine in the pool matches %s", patt) 

2005 if limit: 

2006 config[".bps_defined.memory_limit"] = limit 

2007 

2008 _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False}) 

2009 site_values["memoryLimit"] = limit 

2010 

2011 found, value = config.search("accountingGroup", opt=search_opts) 

2012 if found: 

2013 site_values["accountingGroup"] = value 

2014 found, value = config.search("accountingUser", opt=search_opts) 

2015 if found: 

2016 site_values["accountingUser"] = value 

2017 

2018 key = f".site.{compute_site}.profile.condor" 

2019 if key in config: 

2020 for key, val in config[key].items(): 

2021 if key.startswith("+"): 

2022 site_values["attrs"][key[1:]] = val 

2023 else: 

2024 site_values["profile"][key] = val 

2025 

2026 return site_values
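# Illustrative usage (editor's sketch, not part of the original module; the
# config file name and site name are hypothetical):
#
#     config = BpsConfig("bps_submit.yaml")     # from lsst.ctrl.bps
#     site_values = _gather_site_values(config, "example_site")
#     # site_values always contains "attrs", "profile", "bpsUseShared", and
#     # "memoryLimit"; "accountingGroup"/"accountingUser" are present only
#     # when set in the config.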