
# This file is part of ctrl_bps_htcondor.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.


28"""Interface between generic workflow to HTCondor workflow system. 

29""" 


__all__ = ["HTCondorService", "HTCondorWorkflow"]


import logging
import os
import re
from collections import defaultdict
from enum import IntEnum, auto
from pathlib import Path

import htcondor
from lsst.ctrl.bps import (
    BaseWmsService,
    BaseWmsWorkflow,
    GenericWorkflow,
    GenericWorkflowJob,
    WmsJobReport,
    WmsRunReport,
    WmsStates,
)
from lsst.ctrl.bps.bps_utils import chdir, create_count_summary
from lsst.utils.timer import time_this
from packaging import version

from .lssthtc import (
    MISSING_ID,
    HTCDag,
    HTCJob,
    JobStatus,
    NodeStatus,
    condor_history,
    condor_q,
    condor_search,
    condor_status,
    htc_backup_files,
    htc_check_dagman_output,
    htc_create_submit_from_cmd,
    htc_create_submit_from_dag,
    htc_create_submit_from_file,
    htc_escape,
    htc_submit_dag,
    htc_version,
    pegasus_name_to_label,
    read_dag_info,
    read_dag_log,
    read_dag_status,
    read_node_status,
    summary_from_dag,
    write_dag_info,
)


class WmsIdType(IntEnum):
    """Type of valid WMS ids."""

    UNKNOWN = auto()
    """The type of the id cannot be determined."""

    LOCAL = auto()
    """The id is an HTCondor job's ClusterId (with an optional '.ProcId')."""

    GLOBAL = auto()
    """The id is an HTCondor global job id."""

    PATH = auto()
    """The id is a submission path."""


DEFAULT_HTC_EXEC_PATT = ".*worker.*"
"""Default pattern for searching execute machines in an HTCondor pool."""

_LOG = logging.getLogger(__name__)


class HTCondorService(BaseWmsService):
    """HTCondor version of WMS service."""


    def prepare(self, config, generic_workflow, out_prefix=None):
        """Convert generic workflow to an HTCondor DAG ready for submission.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The generic workflow (e.g., has executable name and arguments).
        out_prefix : `str`
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow`
            HTCondor workflow ready to be run.
        """
        _LOG.debug("out_prefix = '%s'", out_prefix)
        with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed HTCondor workflow creation"):
            workflow = HTCondorWorkflow.from_generic_workflow(
                config,
                generic_workflow,
                out_prefix,
                f"{self.__class__.__module__}.{self.__class__.__name__}",
            )

        with time_this(
            log=_LOG, level=logging.INFO, prefix=None, msg="Completed writing out HTCondor workflow"
        ):
            workflow.write(out_prefix)
        return workflow

    def submit(self, workflow):
        """Submit a single HTCondor workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWorkflow`
            A single HTCondor workflow to submit. run_id is updated after
            successful submission to WMS.
        """
        dag = workflow.dag

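        # HTCondor 8.9.3 introduced a native Python way to create a submit
        # object directly from a DAG file; for older versions the submit
        # object has to be built from the output of condor_submit_dag.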

        ver = version.parse(htc_version())
        if ver >= version.parse("8.9.3"):
            sub = htc_create_submit_from_dag(dag.graph["dag_filename"], {})
        else:
            sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], {})

        # For workflow portability, internal paths are all relative. Hence
        # the DAG needs to be submitted to HTCondor from inside the submit
        # directory.
        with chdir(workflow.submit_path):
            _LOG.info("Submitting from directory: %s", os.getcwd())
            schedd_dag_info = htc_submit_dag(sub)
            if schedd_dag_info:
                write_dag_info(f"{dag.name}.info.json", schedd_dag_info)

                _, dag_info = schedd_dag_info.popitem()
                _, dag_ad = dag_info.popitem()

                dag.run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}"
                workflow.run_id = dag.run_id
            else:
                raise RuntimeError("Submission failed: unable to retrieve DAGMan job information")

    def restart(self, wms_workflow_id):
        """Restart a failed DAGMan workflow.

        Parameters
        ----------
        wms_workflow_id : `str`
            The directory with HTCondor files.

        Returns
        -------
        run_id : `str`
            HTCondor id of the restarted DAGMan job. If restart failed, it
            will be set to None.
        run_name : `str`
            Name of the restarted workflow. If restart failed, it will be set
            to None.
        message : `str`
            A message describing any issues encountered during the restart.
            If there were no issues, an empty string is returned.
        """
        wms_path, id_type = _wms_id_to_dir(wms_workflow_id)
        if wms_path is None:
            return (
                None,
                None,
                (
                    f"workflow with run id '{wms_workflow_id}' not found. "
                    f"Hint: use run's submit directory as the id instead"
                ),
            )

        if id_type in {WmsIdType.GLOBAL, WmsIdType.LOCAL}:
            if not wms_path.is_dir():
                return None, None, f"submit directory '{wms_path}' for run id '{wms_workflow_id}' not found."

        _LOG.info("Restarting workflow from directory '%s'", wms_path)
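        # DAGMan writes rescue files next to the DAG file as, e.g.,
        # '<name>.dag.rescue001', incrementing the counter on each failure.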

        rescue_dags = list(wms_path.glob("*.dag.rescue*"))
        if not rescue_dags:
            return None, None, f"HTCondor rescue DAG(s) not found in '{wms_path}'"

        _LOG.info("Verifying that the workflow is not already in the job queue")
        schedd_dag_info = condor_q(constraint=f'regexp("dagman$", Cmd) && Iwd == "{wms_path}"')
        if schedd_dag_info:
            _, dag_info = schedd_dag_info.popitem()
            _, dag_ad = dag_info.popitem()
            id_ = dag_ad["GlobalJobId"]
            return None, None, f"Workflow already in the job queue (global job id: '{id_}')"

        _LOG.info("Checking execution status of the workflow")
        warn = False
        dag_ad = read_dag_status(str(wms_path))
        if dag_ad:
            nodes_total = dag_ad.get("NodesTotal", 0)
            if nodes_total != 0:
                nodes_done = dag_ad.get("NodesDone", 0)
                if nodes_total == nodes_done:
                    return None, None, "All jobs in the workflow finished successfully"
            else:
                warn = True
        else:
            warn = True
        if warn:
            _LOG.warning(
                "Cannot determine the execution status of the workflow, continuing with restart regardless"
            )

        _LOG.info("Backing up select HTCondor files from previous run attempt")
        htc_backup_files(wms_path, subdir="backups")

        # For workflow portability, internal paths are all relative. Hence
        # the DAG needs to be resubmitted to HTCondor from inside the submit
        # directory.
        _LOG.info("Adding workflow to the job queue")
        run_id, run_name, message = None, None, ""
        with chdir(wms_path):
            try:
                dag_path = next(wms_path.glob("*.dag.condor.sub"))
            except StopIteration:
                message = f"DAGMan submit description file not found in '{wms_path}'"
            else:
                sub = htc_create_submit_from_file(dag_path.name)
                schedd_dag_info = htc_submit_dag(sub)

                # Save select information about the DAGMan job to a file. Use
                # the run name (available in the ClassAd) as the filename.
                if schedd_dag_info:
                    dag_info = next(iter(schedd_dag_info.values()))
                    dag_ad = next(iter(dag_info.values()))
                    write_dag_info(f"{dag_ad['bps_run']}.info.json", schedd_dag_info)
                    run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}"
                    run_name = dag_ad["bps_run"]
                else:
                    message = "DAGMan job information unavailable"

        return run_id, run_name, message

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create a list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.
        is_global : `bool`, optional
            If set, all job queues (and their histories) will be queried for
            job information. Defaults to False which means that only the local
            job queue will be queried.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions. Typically
            this means top-level jobs (i.e., not children jobs).
        """
        _LOG.debug(
            "list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s",
            wms_id,
            user,
            require_bps,
            pass_thru,
            is_global,
        )

        # Determine which Schedds will be queried for job information.
        coll = htcondor.Collector()

        schedd_ads = []
        if is_global:
            schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
        else:
            schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))

        # Construct appropriate constraint expression using provided arguments.
        constraint = "False"
        if wms_id is None:
            if user is not None:
                constraint = f'(Owner == "{user}")'
        else:
            schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id)
            if cluster_id is not None:
                constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"

                # If provided id is either a submission path or a global id,
                # make sure the right Schedd will be queried regardless of
                # 'is_global' value.
                if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}:
                    schedd_ads = [schedd_ad]
        if require_bps:
            constraint += ' && (bps_isjob == "True")'
        if pass_thru:
            if "-forcex" in pass_thru:
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f" && ({pass_thru_2})"
            else:
                constraint += f" && ({pass_thru})"

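        # For illustration, with a local id and require_bps=True the final
        # constraint looks like this (the cluster id is hypothetical):
        #   (DAGManJobId == 1234 || ClusterId == 1234) && (bps_isjob == "True")
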

        # Create a list of scheduler daemons which need to be queried.
        schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}

        _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds))
        results = condor_q(constraint=constraint, schedds=schedds)

        # Prune child jobs where DAG job is in queue (i.e., aren't orphans).
        job_ids = []
        for schedd_name, job_info in results.items():
            for job_id, job_ad in job_info.items():
                _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None"))
                if "DAGManJobId" not in job_ad:
                    job_ids.append(job_ad.get("GlobalJobId", job_id))
                else:
                    _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0")
                    _LOG.debug("\tin jobs.keys() = %s", job_info.keys())
                    if f"{job_ad['DAGManJobId']}.0" not in job_info:  # orphaned job
                        job_ids.append(job_ad.get("GlobalJobId", job_id))

        _LOG.debug("job_ids = %s", job_ids)
        return job_ids

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False):
        """Return run information based upon given constraints.

        Parameters
        ----------
        wms_workflow_id : `str`, optional
            Limit to specific run based on id.
        user : `str`, optional
            Limit results to runs for this user.
        hist : `float`, optional
            Limit history search to this many days. Defaults to 0.
        pass_thru : `str`, optional
            Constraints to pass through to HTCondor.
        is_global : `bool`, optional
            If set, all job queues (and their histories) will be queried for
            job information. Defaults to False which means that only the local
            job queue will be queried.

        Returns
        -------
        runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
            Information about runs from given job information.
        message : `str`
            Extra message for report command to print. This could be pointers
            to documentation or to WMS specific commands.
        """
        if wms_workflow_id:
            id_type = _wms_id_type(wms_workflow_id)
            if id_type == WmsIdType.LOCAL:
                schedulers = _locate_schedds(locate_all=is_global)
                run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers)
            elif id_type == WmsIdType.GLOBAL:
                schedulers = _locate_schedds(locate_all=True)
                run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers)
            elif id_type == WmsIdType.PATH:
                run_reports, message = _report_from_path(wms_workflow_id)
            else:
                run_reports, message = {}, "Invalid job id"
        else:
            schedulers = _locate_schedds(locate_all=is_global)
            run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers)
        _LOG.debug("report: %s, %s", run_reports, message)

        return list(run_reports.values()), message

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            Id or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether successful deletion or not. Currently, if any doubt or any
            individual jobs not deleted, return False.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        _LOG.debug("Canceling wms_id = %s", wms_id)

        schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id)

        if cluster_id is None:
            deleted = False
            message = "invalid id"
        else:

            _LOG.debug(
                "Canceling job managed by schedd_name = %s with cluster_id = %s",
                schedd_ad["Name"],
                cluster_id,
            )

            schedd = htcondor.Schedd(schedd_ad)

            constraint = f"ClusterId == {cluster_id}"
            if pass_thru is not None and "-forcex" in pass_thru:
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f"&& ({pass_thru_2})"
                _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.RemoveX, constraint)
            else:
                if pass_thru:
                    constraint += f"&& ({pass_thru})"
                _LOG.debug("JobAction.Remove constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.Remove, constraint)
            _LOG.debug("Remove results: %s", results)

            if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
                deleted = True
                message = ""
            else:
                deleted = False
                if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
                    message = "no such bps job in batch queue"
                else:
                    message = f"unknown problems deleting: {results}"

        _LOG.debug("deleted: %s; message = %s", deleted, message)
        return deleted, message


class HTCondorWorkflow(BaseWmsWorkflow):
    """Single HTCondor workflow.

    Parameters
    ----------
    name : `str`
        Unique name for Workflow used when naming files.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """

    def __init__(self, name, config=None):
        super().__init__(name, config)
        self.dag = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited
        htc_workflow = cls(generic_workflow.name, config)
        htc_workflow.dag = HTCDag(name=generic_workflow.name)

        _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs(
            {
                "bps_wms_service": service_class,
                "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
                "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
                "bps_job_summary": create_count_summary(generic_workflow.job_counts),
            }
        )

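        # 'subDirTemplate' may be a plain string applied to every job label or
        # a per-label mapping; normalize it to a mapping with a default here.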

        _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""})
        if isinstance(tmp_template, str):
            subdir_template = defaultdict(lambda: tmp_template)
        else:
            subdir_template = tmp_template

        # Create all DAG jobs
        site_values = {}  # cache compute site specific values to reduce config lookups
        for job_name in generic_workflow:
            gwjob = generic_workflow.get_job(job_name)
            if gwjob.compute_site not in site_values:
                site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
            htc_job = _create_job(
                subdir_template[gwjob.label],
                site_values[gwjob.compute_site],
                generic_workflow,
                gwjob,
                out_prefix,
            )
            htc_workflow.dag.add_job(htc_job)

        # Add job dependencies to the DAG
        for job_name in generic_workflow:
            htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))

        # If final job exists in generic workflow, create DAG final job
        final = generic_workflow.get_final()
        if final and isinstance(final, GenericWorkflowJob):
            if final.compute_site and final.compute_site not in site_values:
                site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
            final_htjob = _create_job(
                subdir_template[final.label],
                site_values[final.compute_site],
                generic_workflow,
                final,
                out_prefix,
            )
            if "post" not in final_htjob.dagcmds:
                final_htjob.dagcmds[
                    "post"
                ] = f"{os.path.dirname(__file__)}/final_post.sh {final.name} $DAG_STATUS $RETURN"
            htc_workflow.dag.add_final_job(final_htjob)
        elif final and isinstance(final, GenericWorkflow):
            raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")

        elif final:
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")


        return htc_workflow

    def write(self, out_prefix):
        """Output HTCondor DAGMan files needed for workflow submission.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for HTCondor files.
        """
        self.submit_path = out_prefix
        os.makedirs(out_prefix, exist_ok=True)

        # Write down the workflow in HTCondor format.
        self.dag.write(out_prefix, "jobs/{self.label}")


def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix):
    """Convert GenericWorkflow job nodes to DAG jobs.

    Parameters
    ----------
    subdir_template : `str`
        Template for making subdirs.
    site_values : `dict`
        Site specific values.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that is being converted.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        The generic job to convert to a HTCondor job.
    out_prefix : `str`
        Directory prefix for HTCondor files.

    Returns
    -------
    htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
        The HTCondor job equivalent to the given generic job.
    """
    htc_job = HTCJob(gwjob.name, label=gwjob.label)

    curvals = defaultdict(str)
    curvals["label"] = gwjob.label
    if gwjob.tags:
        curvals.update(gwjob.tags)

    subdir = subdir_template.format_map(curvals)
    htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"

    htc_job_cmds = {
        "universe": "vanilla",
        "should_transfer_files": "YES",
        "when_to_transfer_output": "ON_EXIT_OR_EVICT",
        "transfer_output_files": '""',  # Set to empty string to disable
        "transfer_executable": "False",
        "getenv": "True",

        # Exceeding memory sometimes triggers a SIGBUS or SIGSEGV error. Tell
        # HTCondor to put on hold any jobs which exited by a signal.

610 "on_exit_hold": "ExitBySignal == true", 

611 "on_exit_hold_reason": 'strcat("Job raised a signal ", string(ExitSignal), ". ", ' 

612 '"Handling signal as if job has gone over memory limit.")', 

613 "on_exit_hold_subcode": "34", 

614 } 

615 

616 htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob)) 

617 

618 # job stdout, stderr, htcondor user log. 

619 for key in ("output", "error", "log"): 

620 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}") 

621 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key]) 

622 

623 htc_job_cmds.update( 

624 _handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], out_prefix) 

625 ) 

626 

627 # Add the job cmds dict to the job object. 

628 htc_job.add_job_cmds(htc_job_cmds) 

629 

630 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob)) 

631 

632 # Add job attributes to job. 

633 _LOG.debug("gwjob.attrs = %s", gwjob.attrs) 

634 htc_job.add_job_attrs(gwjob.attrs) 

635 htc_job.add_job_attrs(site_values["attrs"]) 

636 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)}) 

637 htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label}) 

638 

639 return htc_job 

640 

641 

642def _translate_job_cmds(cached_vals, generic_workflow, gwjob): 

643 """Translate the job data that are one to one mapping 

644 

645 Parameters 

646 ---------- 

647 cached_vals : `dict` [`str`, `Any`] 

648 Config values common to jobs with same label. 

649 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

650 Generic workflow that contains job to being converted. 

    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Generic workflow job to be converted.

    Returns
    -------
    htc_job_commands : `dict` [`str`, `Any`]
        Contains commands which can appear in the HTCondor submit description
        file.
    """
    # Values in the job script that just are name mappings.
    job_translation = {
        "mail_to": "notify_user",
        "when_to_mail": "notification",
        "request_cpus": "request_cpus",
        "priority": "priority",
        "category": "category",
        "accounting_group": "accounting_group",
        "accounting_user": "accounting_group_user",
    }

    jobcmds = {}
    for gwkey, htckey in job_translation.items():
        jobcmds[htckey] = getattr(gwjob, gwkey, None)

    # If accounting info was not set explicitly, use site settings if any.
    if not gwjob.accounting_group:
        jobcmds["accounting_group"] = cached_vals.get("accountingGroup")
    if not gwjob.accounting_user:
        jobcmds["accounting_group_user"] = cached_vals.get("accountingUser")

    # job commands that need modification
    if gwjob.number_of_retries:
        jobcmds["max_retries"] = f"{gwjob.number_of_retries}"

    if gwjob.retry_unless_exit:
        jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"

    if gwjob.request_disk:
        jobcmds["request_disk"] = f"{gwjob.request_disk}MB"

    if gwjob.request_memory:
        jobcmds["request_memory"] = f"{gwjob.request_memory}"

    if gwjob.memory_multiplier:
        # Do not use try-except! At the moment, BpsConfig returns an empty
        # string if it does not contain the key.
        memory_limit = cached_vals["memoryLimit"]
        if not memory_limit:
            raise RuntimeError(
                "Memory autoscaling enabled, but automatic detection of the memory limit "
                "failed; setting it explicitly with 'memoryLimit' or changing worker node "
                "search pattern 'executeMachinesPattern' might help."
            )

        # Set maximal amount of memory job can ask for.
        #
        # The check below assumes that 'memory_limit' was set to a value which
        # realistically reflects actual physical limitations of a given
        # compute resource.
        memory_max = memory_limit
        if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit:
            memory_max = gwjob.request_memory_max

        # Make job ask for more memory each time it failed due to insufficient
        # memory requirements.
        jobcmds["request_memory"] = _create_request_memory_expr(
            gwjob.request_memory, gwjob.memory_multiplier, memory_max
        )

        # Periodically release jobs which are being held due to exceeding
        # memory. Stop doing that (by removing the job from the HTCondor
        # queue) after the maximal number of retries has been reached or the
        # job was already run at maximal allowed memory.
        jobcmds["periodic_release"] = _create_periodic_release_expr(
            gwjob.request_memory, gwjob.memory_multiplier, memory_max
        )
        jobcmds["periodic_remove"] = _create_periodic_remove_expr(
            gwjob.request_memory, gwjob.memory_multiplier, memory_max
        )

    # Assume concurrency_limit implemented using HTCondor concurrency limits.
    # May need to move to special site-specific implementation if sites use
    # other mechanisms.
    if gwjob.concurrency_limit:
        jobcmds["concurrency_limit"] = gwjob.concurrency_limit

    # Handle command line
    if gwjob.executable.transfer_executable:
        jobcmds["transfer_executable"] = "True"
        jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
    else:
        jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)

    if gwjob.arguments:
        arguments = gwjob.arguments
        arguments = _replace_cmd_vars(arguments, gwjob)
        arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob)
        arguments = _fix_env_var_syntax(arguments)
        jobcmds["arguments"] = arguments

    # Add extra "pass-thru" job commands
    if gwjob.profile:
        for key, val in gwjob.profile.items():
            jobcmds[key] = htc_escape(val)
    for key, val in cached_vals["profile"].items():
        jobcmds[key] = htc_escape(val)

    return jobcmds


def _translate_dag_cmds(gwjob):
    """Translate job values into DAGMan commands.

    Parameters
    ----------
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job containing values to be translated.

    Returns
    -------
    dagcmds : `dict` [`str`, `Any`]
        DAGMan commands for the job.
    """
    # Values in the dag script that just are name mappings.
    dag_translation = {"abort_on_value": "abort_dag_on", "abort_return_value": "abort_exit"}

    dagcmds = {}
    for gwkey, htckey in dag_translation.items():
        dagcmds[htckey] = getattr(gwjob, gwkey, None)

    # Still to be coded: vars "pre_cmdline", "post_cmdline"
    return dagcmds


def _fix_env_var_syntax(oldstr):

786 """Change ENV place holders to HTCondor Env var syntax. 


    Parameters
    ----------
    oldstr : `str`
        String in which environment variable syntax is to be fixed.

    Returns
    -------
    newstr : `str`
        Given string with environment variable syntax fixed.
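
    Examples
    --------
    The placeholder name below is purely illustrative:

    >>> _fix_env_var_syntax("<ENV:HOME>/repo/butler.yaml")
    '$ENV(HOME)/repo/butler.yaml'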

797 """ 

798 newstr = oldstr 

799 for key in re.findall(r"<ENV:([^>]+)>", oldstr): 

800 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})") 

801 return newstr 

802 

803 

804def _replace_file_vars(use_shared, arguments, workflow, gwjob): 

805 """Replace file placeholders in command line arguments with correct 

806 physical file names. 

807 

808 Parameters 

809 ---------- 

810 use_shared : `bool` 

811 Whether HTCondor can assume shared filesystem. 

812 arguments : `str` 

813 Arguments string in which to replace file placeholders. 

814 workflow : `lsst.ctrl.bps.GenericWorkflow` 

815 Generic workflow that contains file information. 

816 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

817 The job corresponding to the arguments. 

818 

819 Returns 

820 ------- 

821 arguments : `str` 

822 Given arguments string with file placeholders replaced. 

823 """ 

824 # Replace input file placeholders with paths. 

825 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False): 

826 if not gwfile.wms_transfer: 

827 # Must assume full URI if in command line and told WMS is not 

828 # responsible for transferring file. 

829 uri = gwfile.src_uri 

830 elif use_shared: 

831 if gwfile.job_shared: 

832 # Have shared filesystems and jobs can share file. 

833 uri = gwfile.src_uri 

834 else: 

835 # Taking advantage of inside knowledge. Not future-proof. 

836 # Temporary fix until have job wrapper that pulls files 

837 # within job. 

838 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml": 

839 uri = "butler.yaml" 

840 else: 

841 uri = os.path.basename(gwfile.src_uri) 

842 else: # Using push transfer 

843 uri = os.path.basename(gwfile.src_uri) 

844 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

845 

846 # Replace output file placeholders with paths. 

847 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False): 

848 if not gwfile.wms_transfer: 

849 # Must assume full URI if in command line and told WMS is not 

850 # responsible for transferring file. 

851 uri = gwfile.src_uri 

852 elif use_shared: 

853 if gwfile.job_shared: 

854 # Have shared filesystems and jobs can share file. 

855 uri = gwfile.src_uri 

856 else: 

857 uri = os.path.basename(gwfile.src_uri) 

858 else: # Using push transfer 

859 uri = os.path.basename(gwfile.src_uri) 

860 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

861 return arguments 

862 

863 

864def _replace_cmd_vars(arguments, gwjob): 

865 """Replace format-style placeholders in arguments. 

866 

867 Parameters 

868 ---------- 

869 arguments : `str` 

870 Arguments string in which to replace placeholders. 

871 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

872 Job containing values to be used to replace placeholders 

873 (in particular gwjob.cmdvals). 

874 

875 Returns 

876 ------- 

877 arguments : `str` 

878 Given arguments string with placeholders replaced. 
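
    Examples
    --------
    A minimal sketch; ``SimpleNamespace`` stands in for a real
    `~lsst.ctrl.bps.GenericWorkflowJob` and the placeholder name is
    hypothetical:

    >>> from types import SimpleNamespace
    >>> gwjob = SimpleNamespace(cmdvals={"qgraphFile": "run.qgraph"})
    >>> _replace_cmd_vars("pipetask run -g {qgraphFile}", gwjob)
    'pipetask run -g run.qgraph'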

879 """ 

880 try: 

881 arguments = arguments.format(**gwjob.cmdvals) 

882 except (KeyError, TypeError): # TypeError in case None instead of {} 

883 _LOG.error( 

884 "Could not replace command variables:\narguments: %s\ncmdvals: %s", arguments, gwjob.cmdvals 

885 ) 

886 raise 

887 return arguments 

888 

889 

890def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str): 

891 """Add job input files from generic workflow to job. 

892 

893 Parameters 

894 ---------- 

895 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

896 The generic workflow (e.g., has executable name and arguments). 

897 job_name : `str` 

898 Unique name for the job. 

899 use_shared : `bool` 

900 Whether job has access to files via shared filesystem. 

901 out_prefix : `str` 

902 The root directory into which all WMS-specific files are written. 

903 

904 Returns 

905 ------- 

906 htc_commands : `dict` [`str`, `str`] 

907 HTCondor commands for the job submission script. 
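
    Examples
    --------
    A hypothetical result when two input files must be pushed to the job::

        {"transfer_input_files": "jobs/label/file1.txt,jobs/label/file2.txt"}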

908 """ 

909 htc_commands = {} 

910 inputs = [] 

911 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True): 

912 _LOG.debug("src_uri=%s", gwf_file.src_uri) 

913 

914 uri = Path(gwf_file.src_uri) 

915 

916 # Note if use_shared and job_shared, don't need to transfer file. 

917 

918 if not use_shared: # Copy file using push to job 

919 inputs.append(str(uri.relative_to(out_prefix))) 

920 elif not gwf_file.job_shared: # Jobs require own copy 

921 # if using shared filesystem, but still need copy in job. Use 

922 # HTCondor's curl plugin for a local copy. 

923 

924 # Execution butler is represented as a directory which the 

925 # curl plugin does not handle. Taking advantage of inside 

926 # knowledge for temporary fix until have job wrapper that pulls 

927 # files within job. 

928 if gwf_file.name == "butlerConfig": 

929 # The execution butler directory doesn't normally exist until 

930 # the submit phase so checking for suffix instead of using 

931 # is_dir(). If other non-yaml file exists they would have a 

932 # different gwf_file.name. 

933 if uri.suffix == ".yaml": # Single file, so just copy. 

934 inputs.append(f"file://{uri}") 

935 else: 

936 inputs.append(f"file://{uri / 'butler.yaml'}") 

937 inputs.append(f"file://{uri / 'gen3.sqlite3'}") 

938 elif uri.is_dir(): 

939 raise RuntimeError( 

940 f"HTCondor plugin cannot transfer directories locally within job {gwf_file.src_uri}" 

941 ) 

942 else: 

943 inputs.append(f"file://{uri}") 

944 

945 if inputs: 

946 htc_commands["transfer_input_files"] = ",".join(inputs) 

947 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"]) 

948 return htc_commands 

949 

950 

951def _report_from_path(wms_path): 

952 """Gather run information from a given run directory. 

953 

954 Parameters 

955 ---------- 

956 wms_path : `str` 

957 The directory containing the submit side files (e.g., HTCondor files). 

958 

959 Returns 

960 ------- 

961 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

962 Run information for the detailed report. The key is the HTCondor id 

963 and the value is a collection of report information for that run. 

964 message : `str` 

965 Message to be printed with the summary report. 

966 """ 

967 wms_workflow_id, jobs, message = _get_info_from_path(wms_path) 

968 if wms_workflow_id == MISSING_ID: 

969 run_reports = {} 

970 else: 

971 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs) 

972 return run_reports, message 

973 

974 

975def _report_from_id(wms_workflow_id, hist, schedds=None): 

976 """Gather run information using workflow id. 

977 

978 Parameters 

979 ---------- 

980 wms_workflow_id : `str` 

981 Limit to specific run based on id. 

982 hist : `float` 

983 Limit history search to this many days. 

984 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional 

985 HTCondor schedulers which to query for job information. If None 

986 (default), all queries will be run against the local scheduler only. 

987 

988 Returns 

989 ------- 

990 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

991 Run information for the detailed report. The key is the HTCondor id 

992 and the value is a collection of report information for that run. 

993 message : `str` 

994 Message to be printed with the summary report. 

995 """ 

996 messages = [] 

997 

998 # Collect information about the job by querying HTCondor schedd and 

999 # HTCondor history. 

1000 schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds) 

1001 if len(schedd_dag_info) == 1: 

1002 # Extract the DAG info without altering the results of the query. 

1003 schedd_name = next(iter(schedd_dag_info)) 

1004 dag_id = next(iter(schedd_dag_info[schedd_name])) 

1005 dag_ad = schedd_dag_info[schedd_name][dag_id] 

1006 

1007 # If the provided workflow id does not correspond to the one extracted 

1008 # from the DAGMan log file in the submit directory, rerun the query 

1009 # with the id found in the file. 

1010 # 

1011 # This is to cover the situation in which the user provided the old job 

1012 # id of a restarted run. 

1013 try: 

1014 path_dag_id, path_dag_ad = read_dag_log(dag_ad["Iwd"]) 

1015 except FileNotFoundError as exc: 

1016 # At the moment missing DAGMan log is pretty much a fatal error. 

1017 # So empty the DAG info to finish early (see the if statement 

1018 # below). 

            schedd_dag_info.clear()

            messages.append(f"Cannot create the report for '{dag_id}': {exc}")
        else:
            if path_dag_id != dag_id:
                schedd_dag_info = _get_info_from_schedd(path_dag_id, hist, schedds)
                messages.append(
                    f"WARNING: Found newer workflow executions in same submit directory as id '{dag_id}'. "
                    "This normally occurs when a run is restarted. The report shown is for the most "
                    f"recent status with run id '{path_dag_id}'"
                )

    if len(schedd_dag_info) == 0:
        run_reports = {}
    elif len(schedd_dag_info) == 1:
        _, dag_info = schedd_dag_info.popitem()
        dag_id, dag_ad = dag_info.popitem()

        # Create a mapping between jobs and their classads. The keys will
        # be of format 'ClusterId.ProcId'.
        job_info = {dag_id: dag_ad}

        # Find jobs (nodes) belonging to that DAGMan job.
        job_constraint = f"DAGManJobId == {int(float(dag_id))}"
        schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds)
        if schedd_job_info:
            _, node_info = schedd_job_info.popitem()
            job_info.update(node_info)

        # Collect additional pieces of information about jobs using HTCondor
        # files in the submission directory.
        _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"])
        _update_jobs(job_info, path_jobs)
        if message:
            messages.append(message)
        run_reports = _create_detailed_report_from_jobs(dag_id, job_info)
    else:
        ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()]
        message = (
            f"More than one job matches id '{wms_workflow_id}', "
            f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids"
        )
        messages.append(message)
        run_reports = {}

    message = "\n".join(messages)
    return run_reports, message


def _get_info_from_schedd(wms_workflow_id, hist, schedds):
    """Gather run information from HTCondor.

    Parameters
    ----------
    wms_workflow_id : `str`
        Limit to specific run based on id.
    hist : `int`
        Limit history search to this many days.
    schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
        HTCondor schedulers which to query for job information. If None
        (default), all queries will be run against the local scheduler only.

    Returns
    -------

    schedd_dag_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, `Any`]]]

        Information about jobs satisfying the search criteria where for each
        Scheduler, local HTCondor job ids are mapped to their respective
        classads.
    """
    dag_constraint = 'regexp("dagman$", Cmd)'
    try:
        cluster_id = int(float(wms_workflow_id))
    except ValueError:
        dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"'
    else:
        dag_constraint += f" && ClusterId == {cluster_id}"

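    # At this point the constraint looks like, e.g. (the id is hypothetical):
    #   regexp("dagman$", Cmd) && ClusterId == 1234
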

    # With the current implementation of the condor_* functions the query
    # will always return only one match per Scheduler.
    #
    # Even in the highly unlikely situation where HTCondor history (which
    # condor_search queries too) is long enough to have jobs from before

    # the cluster ids were rolled over (and as a result there is more than

    # one job with the same cluster id) they will not show up in
    # the results.
    schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds)
    return schedd_dag_info


def _get_info_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        Directory containing HTCondor files.

    Returns
    -------
    wms_workflow_id : `str`
        The run id which is a DAGman job id.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Information about jobs read from files in the given directory.
        The key is the HTCondor id and the value is a dictionary of HTCondor
        keys and values.
    message : `str`
        Message to be printed with the summary report.
    """
    messages = []
    try:
        wms_workflow_id, jobs = read_dag_log(wms_path)
        _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
        _update_jobs(jobs, read_node_status(wms_path))
        _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)

        # Add more info for DAGman job
        job = jobs[wms_workflow_id]
        job.update(read_dag_status(wms_path))

        job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
        if "bps_run" not in job:
            _add_run_info(wms_path, job)

        message = htc_check_dagman_output(wms_path)
        if message:
            messages.append(message)
        _LOG.debug(
            "_get_info: id = %s, total_jobs = %s", wms_workflow_id, jobs[wms_workflow_id]["total_jobs"]
        )

        # Add extra pieces of information which cannot be found in HTCondor
        # generated files like 'GlobalJobId'.
        #
        # Do not treat absence of this file as a serious error. Neither runs
        # submitted with earlier versions of the plugin nor the runs submitted
        # with Pegasus plugin will have it at the moment. However, once enough
        # time passes and Pegasus plugin will have its own report() method
        # (instead of sneakily using HTCondor's one), the lack of that file
        # should be treated as seriously as lack of any other file.
        try:
            job_info = read_dag_info(wms_path)
        except FileNotFoundError as exc:
            message = f"Warn: Some information may not be available: {exc}"
            messages.append(message)
        else:
            schedd_name = next(iter(job_info))
            job_ad = next(iter(job_info[schedd_name].values()))
            job.update(job_ad)
    except FileNotFoundError:
        message = f"Could not find HTCondor files in '{wms_path}'"
        _LOG.warning(message)
        messages.append(message)
        wms_workflow_id = MISSING_ID
        jobs = {}

    message = "\n".join([msg for msg in messages if msg])
    return wms_workflow_id, jobs, message


def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
    """Gather run information to be used in generating summary reports.

    Parameters
    ----------
    wms_workflow_id : `str`
        The run id to create the report for.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Mapping HTCondor job id to job information.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the given HTCondor
        id and the value is a collection of report information for that run.
    """
    _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
    dag_job = jobs[wms_workflow_id]
    report = WmsRunReport(
        wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}",
        global_wms_id=dag_job.get("GlobalJobId", "MISS"),
        path=dag_job["Iwd"],
        label=dag_job.get("bps_job_label", "MISS"),
        run=dag_job.get("bps_run", "MISS"),
        project=dag_job.get("bps_project", "MISS"),
        campaign=dag_job.get("bps_campaign", "MISS"),
        payload=dag_job.get("bps_payload", "MISS"),
        operator=_get_owner(dag_job),
        run_summary=_get_run_summary(dag_job),
        state=_htc_status_to_wms_state(dag_job),
        jobs=[],
        total_number_jobs=dag_job["total_jobs"],
        job_state_counts=dag_job["state_counts"],
    )

    for job_id, job_info in jobs.items():
        try:
            if job_info["ClusterId"] != int(float(wms_workflow_id)):
                job_report = WmsJobReport(
                    wms_id=job_id,
                    name=job_info.get("DAGNodeName", job_id),
                    label=job_info.get("bps_job_label", pegasus_name_to_label(job_info["DAGNodeName"])),
                    state=_htc_status_to_wms_state(job_info),
                )
                if job_report.label == "init":
                    job_report.label = "pipetaskInit"
                report.jobs.append(job_report)
        except KeyError as ex:
            _LOG.error("Job missing key '%s': %s", str(ex), job_info)
            raise

    run_reports = {report.wms_id: report}
    _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
    return run_reports


def _summary_report(user, hist, pass_thru, schedds=None):
    """Gather run information to be used in generating summary reports.

    Parameters
    ----------
    user : `str`
        Run lookup restricted to given user.
    hist : `float`
        How many previous days to search for run information.
    pass_thru : `str`
        Advanced users can define the HTCondor constraint to be used
        when searching queue and history.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the summary report. The keys are HTCondor ids and
        the values are collections of report information for each run.
    message : `str`
        Message to be printed with the summary report.
    """
    # only doing summary report so only look for dagman jobs
    if pass_thru:
        constraint = pass_thru
    else:
        # Notes:
        # * bps_isjob == 'True' isn't getting set for DAG jobs that are
        #   manually restarted.
        # * Any job with DAGManJobID isn't a DAG job
        constraint = 'bps_isjob == "True" && JobUniverse == 7'
        if user:
            constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'

    job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds)

    # Have list of DAGMan jobs, need to get run_report info.
    run_reports = {}
    for jobs in job_info.values():
        for job_id, job in jobs.items():
            total_jobs, state_counts = _get_state_counts_from_dag_job(job)
            # If didn't get from queue information (e.g., Kerberos bug),
            # try reading from file.
            if total_jobs == 0:
                try:
                    job.update(read_dag_status(job["Iwd"]))
                    total_jobs, state_counts = _get_state_counts_from_dag_job(job)
                except StopIteration:

                    pass  # Don't kill the report if it can't find HTCondor files.


            if "bps_run" not in job:
                _add_run_info(job["Iwd"], job)
            report = WmsRunReport(
                wms_id=job_id,
                global_wms_id=job["GlobalJobId"],
                path=job["Iwd"],
                label=job.get("bps_job_label", "MISS"),
                run=job.get("bps_run", "MISS"),
                project=job.get("bps_project", "MISS"),
                campaign=job.get("bps_campaign", "MISS"),
                payload=job.get("bps_payload", "MISS"),
                operator=_get_owner(job),
                run_summary=_get_run_summary(job),
                state=_htc_status_to_wms_state(job),
                jobs=[],
                total_number_jobs=total_jobs,
                job_state_counts=state_counts,
            )
            run_reports[report.global_wms_id] = report

    return run_reports, ""


def _add_run_info(wms_path, job):
    """Find BPS run information elsewhere for runs without bps attributes.

    Parameters
    ----------
    wms_path : `str`
        Path to submit files for the run.
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Raises
    ------
    StopIteration
        If cannot find file it is looking for. Permission errors are
        caught and job's run is marked with error.
    """
    path = Path(wms_path) / "jobs"
    try:
        subfile = next(path.glob("**/*.sub"))
    except (StopIteration, PermissionError):
        job["bps_run"] = "Unavailable"
    else:
        _LOG.debug("_add_run_info: subfile = %s", subfile)
        try:
            with open(subfile, encoding="utf-8") as fh:
                for line in fh:
                    if line.startswith("+bps_"):
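                        # Attribute lines in a submit file look like, e.g.
                        # (the value shown is hypothetical):
                        #   +bps_run = "my_run_20240101T000000Z"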

                        m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
                        if m:
                            _LOG.debug("Matching line: %s", line)
                            job[m.group(1)] = m.group(2).replace('"', "")
                        else:
                            _LOG.debug("Could not parse attribute: %s", line)
        except PermissionError:
            job["bps_run"] = "PermissionError"
    _LOG.debug("After adding job = %s", job)


def _get_owner(job):
    """Get the owner of a dag job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    owner : `str`
        Owner of the dag job.
    """
    owner = job.get("bps_operator", None)
    if not owner:
        owner = job.get("Owner", None)
        if not owner:
            _LOG.warning("Could not get Owner from htcondor job: %s", job)
            owner = "MISS"
    return owner


def _get_run_summary(job):
    """Get the run summary for a job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    summary : `str`
        Number of jobs per PipelineTask label in approximate pipeline order.
        Format: <label>:<count>[;<label>:<count>]+
    """
    summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
    if not summary:
        summary, _ = summary_from_dag(job["Iwd"])
        if not summary:
            _LOG.warning("Could not get run summary for htcondor job: %s", job)
    _LOG.debug("_get_run_summary: summary=%s", summary)

    # Workaround sometimes using init vs pipetaskInit
    summary = summary.replace("init:", "pipetaskInit:")

    if "pegasus_version" in job and "pegasus" not in summary:
        summary += ";pegasus:0"

    return summary


def _get_state_counts_from_jobs(wms_workflow_id, jobs):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    wms_workflow_id : `str`
        HTCondor job id.
    jobs : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    state_counts = dict.fromkeys(WmsStates, 0)

    for jid, jinfo in jobs.items():
        if jid != wms_workflow_id:
            state_counts[_htc_status_to_wms_state(jinfo)] += 1

    total_counted = sum(state_counts.values())
    if "NodesTotal" in jobs[wms_workflow_id]:
        total_count = jobs[wms_workflow_id]["NodesTotal"]
    else:
        total_count = total_counted

    state_counts[WmsStates.UNREADY] += total_count - total_counted

    return total_count, state_counts


def _get_state_counts_from_dag_job(job):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
    state_counts = dict.fromkeys(WmsStates, 0)
    if "DAG_NodesReady" in job:
        state_counts = {
            WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
            WmsStates.READY: job.get("DAG_NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
            WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
            WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0),
        }
        total_jobs = job.get("DAG_NodesTotal")
        _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
    elif "NodesFailed" in job:
        state_counts = {
            WmsStates.UNREADY: job.get("NodesUnready", 0),
            WmsStates.READY: job.get("NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("NodesDone", 0),
            WmsStates.FAILED: job.get("NodesFailed", 0),
            WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0),
        }

        try:
            total_jobs = job["NodesTotal"]
        except KeyError as ex:
            _LOG.error("Job missing %s. job = %s", str(ex), job)
            raise

        _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
    else:

        # With Kerberos job auth and the Kerberos bug, a warning here would be
        # printed for every DAG, so log at debug level instead.

1477 _LOG.debug("Can't get job state counts %s", job["Iwd"]) 

1478 total_jobs = 0 

1479 

1480 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts) 

1481 return total_jobs, state_counts 

1482 

1483 

1484def _htc_status_to_wms_state(job): 

1485 """Convert HTCondor job status to generic wms state. 

1486 

1487 Parameters 

1488 ---------- 

1489 job : `dict` [`str`, `Any`] 

1490 HTCondor job information. 

1491 

1492 Returns 

1493 ------- 

1494 wms_state : `WmsStates` 

1495 The equivalent WmsState to given job's status. 

1496 """ 

1497 wms_state = WmsStates.MISFIT 

1498 if "JobStatus" in job: 

1499 wms_state = _htc_job_status_to_wms_state(job) 

1500 elif "NodeStatus" in job: 

1501 wms_state = _htc_node_status_to_wms_state(job) 

1502 return wms_state 

1503 

1504 

1505def _htc_job_status_to_wms_state(job): 

1506 """Convert HTCondor job status to generic wms state. 

1507 

1508 Parameters 

1509 ---------- 

1510 job : `dict` [`str`, `Any`] 

1511 HTCondor job information. 

1512 

1513 Returns 

1514 ------- 

1515 wms_state : `lsst.ctrl.bps.WmsStates` 

1516 The equivalent WmsState to given job's status. 

1517 """ 

1518 _LOG.debug( 

1519 "htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], type(job["JobStatus"]) 

1520 ) 

1521 job_status = int(job["JobStatus"]) 

1522 wms_state = WmsStates.MISFIT 

1523 

1524 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status) 

1525 if job_status == JobStatus.IDLE: 

1526 wms_state = WmsStates.PENDING 

1527 elif job_status == JobStatus.RUNNING: 

1528 wms_state = WmsStates.RUNNING 

1529 elif job_status == JobStatus.REMOVED: 

1530 wms_state = WmsStates.DELETED 

1531 elif job_status == JobStatus.COMPLETED: 

1532 if ( 

1533 job.get("ExitBySignal", False) 

1534 or job.get("ExitCode", 0) 

1535 or job.get("ExitSignal", 0) 

1536 or job.get("DAG_Status", 0) 

1537 or job.get("ReturnValue", 0) 

1538 ): 

1539 wms_state = WmsStates.FAILED 

1540 else: 

1541 wms_state = WmsStates.SUCCEEDED 

1542 elif job_status == JobStatus.HELD: 

1543 wms_state = WmsStates.HELD 

1544 

1545 return wms_state 

1546 

1547 

1548def _htc_node_status_to_wms_state(job): 

1549 """Convert HTCondor status to generic wms state. 

1550 

1551 Parameters 

1552 ---------- 

1553 job : `dict` [`str`, `Any`] 

1554 HTCondor job information. 

1555 

1556 Returns 

1557 ------- 

1558 wms_state : `lsst.ctrl.bps.WmsStates` 

1559 The WmsState equivalent of the given node's status.

1560 """ 

1561 wms_state = WmsStates.MISFIT 

1562 

1563 status = job["NodeStatus"] 

1564 if status == NodeStatus.NOT_READY: 

1565 wms_state = WmsStates.UNREADY 

1566 elif status == NodeStatus.READY: 

1567 wms_state = WmsStates.READY 

1568 elif status == NodeStatus.PRERUN: 

1569 wms_state = WmsStates.MISFIT 

1570 elif status == NodeStatus.SUBMITTED: 

1571 if job["JobProcsHeld"]: 

1572 wms_state = WmsStates.HELD 

1573 elif job["StatusDetails"] == "not_idle": 

1574 wms_state = WmsStates.RUNNING 

1575 elif job["JobProcsQueued"]: 

1576 wms_state = WmsStates.PENDING 

1577 elif status == NodeStatus.POSTRUN: 

1578 wms_state = WmsStates.MISFIT 

1579 elif status == NodeStatus.DONE: 

1580 wms_state = WmsStates.SUCCEEDED 

1581 elif status == NodeStatus.ERROR: 

1582 # Use the job's exit status instead of the post script's exit status.

1583 if "DAGMAN error 0" in job["StatusDetails"]: 

1584 wms_state = WmsStates.SUCCEEDED 

1585 else: 

1586 wms_state = WmsStates.FAILED 

1587 

1588 return wms_state 
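# Hedged sketch of the SUBMITTED branch above (made-up node ad): a node with
# held job procs maps to HELD; with queued procs it would map to PENDING.
#
# >>> job = {"NodeStatus": NodeStatus.SUBMITTED, "JobProcsHeld": 1,
# ...        "StatusDetails": "", "JobProcsQueued": 0}
# >>> _htc_node_status_to_wms_state(job) == WmsStates.HELD
# True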

1589 

1590 

1591def _update_jobs(jobs1, jobs2): 

1592 """Update jobs1 with info in jobs2. 

1593 

1594 (Basically an update for nested dictionaries.) 

1595 

1596 Parameters 

1597 ---------- 

1598 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]] 

1599 HTCondor job information to be updated. 

1600 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]] 

1601 Additional HTCondor job information. 

1602 """ 

1603 for jid, jinfo in jobs2.items(): 

1604 if jid in jobs1: 

1605 jobs1[jid].update(jinfo) 

1606 else: 

1607 jobs1[jid] = jinfo 
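# Minimal sketch of the nested-dict update (job ids are made up):
#
# >>> jobs1 = {"1.0": {"JobStatus": 1}}
# >>> jobs2 = {"1.0": {"JobStatus": 2}, "2.0": {"JobStatus": 5}}
# >>> _update_jobs(jobs1, jobs2)
# >>> jobs1["1.0"]["JobStatus"], "2.0" in jobs1
# (2, True)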

1608 

1609 

1610def _wms_id_type(wms_id): 

1611 """Determine the type of the WMS id. 

1612 

1613 Parameters 

1614 ---------- 

1615 wms_id : `str` 

1616 WMS id identifying a job. 

1617 

1618 Returns 

1619 ------- 

1620 id_type : `lsst.ctrl.bps.htcondor.WmsIdType` 

1621 Type of WMS id. 

1622 """ 

1623 try: 

1624 int(float(wms_id)) 

1625 except ValueError: 

1626 wms_path = Path(wms_id) 

1627 if wms_path.is_dir(): 

1628 id_type = WmsIdType.PATH 

1629 else: 

1630 id_type = WmsIdType.GLOBAL 

1631 except TypeError: 

1632 id_type = WmsIdType.UNKNOWN 

1633 else: 

1634 id_type = WmsIdType.LOCAL 

1635 return id_type 
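# Hedged sketch of the id classification (the global id below is made up;
# the GLOBAL result assumes no directory with that name exists):
#
# >>> _wms_id_type("1234.0") == WmsIdType.LOCAL
# True
# >>> _wms_id_type("sched1.example.com#1234.0#1700000000") == WmsIdType.GLOBAL
# True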

1636 

1637 

1638def _wms_id_to_cluster(wms_id): 

1639 """Convert WMS id to cluster id. 

1640 

1641 Parameters 

1642 ---------- 

1643 wms_id : `int` or `float` or `str` 

1644 HTCondor job id or path. 

1645 

1646 Returns 

1647 ------- 

1648 schedd_ad : `classad.ClassAd` 

1649 ClassAd describing the scheduler managing the job with the given id. 

1650 cluster_id : `int` 

1651 HTCondor cluster id. 

1652 id_type : `lsst.ctrl.bps.htcondor.WmsIdType`

1653 The type of the provided id. 

1654 """ 

1655 coll = htcondor.Collector() 

1656 

1657 schedd_ad = None 

1658 cluster_id = None 

1659 id_type = _wms_id_type(wms_id) 

1660 if id_type == WmsIdType.LOCAL: 

1661 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1662 cluster_id = int(float(wms_id)) 

1663 elif id_type == WmsIdType.GLOBAL: 

1664 constraint = f'GlobalJobId == "{wms_id}"' 

1665 schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)} 

1666 schedds = {name: htcondor.Schedd(ad) for name, ad in schedd_ads.items()} 

1667 job_info = condor_q(constraint=constraint, schedds=schedds) 

1668 if job_info: 

1669 schedd_name, job_rec = job_info.popitem() 

1670 job_id, _ = job_rec.popitem() 

1671 schedd_ad = schedd_ads[schedd_name] 

1672 cluster_id = int(float(job_id)) 

1673 elif id_type == WmsIdType.PATH: 

1674 try: 

1675 job_info = read_dag_info(wms_id) 

1676 except (FileNotFoundError, PermissionError, OSError): 

1677 pass 

1678 else: 

1679 schedd_name, job_rec = job_info.popitem() 

1680 job_id, _ = job_rec.popitem() 

1681 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name) 

1682 cluster_id = int(float(job_id)) 

1683 else: 

1684 pass 

1685 return schedd_ad, cluster_id, id_type 
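# Hedged usage sketch (requires a live HTCondor pool; the id is made up):
#
# >>> schedd_ad, cluster_id, id_type = _wms_id_to_cluster("1234.0")
# >>> cluster_id, id_type == WmsIdType.LOCAL
# (1234, True)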

1686 

1687 

1688def _wms_id_to_dir(wms_id): 

1689 """Convert WMS id to a submit directory candidate. 

1690 

1691 The function does not check if the directory exists or if it is a valid 

1692 BPS submit directory. 

1693 

1694 Parameters 

1695 ---------- 

1696 wms_id : `int` or `float` or `str` 

1697 HTCondor job id or path. 

1698 

1699 Returns 

1700 ------- 

1701 wms_path : `pathlib.Path` or `None`

1702 Submit directory candidate for the run with the given job id. If no 

1703 directory can be associated with the provided WMS id, it will be set 

1704 to None. 

1705 id_type : `lsst.ctrl.bps.htcondor.WmsIdType`

1706 The type of the provided id. 

1707 

1708 Raises 

1709 ------ 

1710 TypeError 

1711 Raised if the provided WMS id has an invalid type.

1712 """ 

1713 coll = htcondor.Collector() 

1714 schedd_ads = [] 

1715 

1716 constraint = None 

1717 wms_path = None 

1718 id_type = _wms_id_type(wms_id) 

1719 match id_type: 

1720 case WmsIdType.LOCAL: 

1721 constraint = f"ClusterId == {int(float(wms_id))}" 

1722 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

1723 case WmsIdType.GLOBAL: 

1724 constraint = f'GlobalJobId == "{wms_id}"' 

1725 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

1726 case WmsIdType.PATH: 

1727 wms_path = Path(wms_id) 

1728 case WmsIdType.UNKNOWN: 

1729 raise TypeError(f"Invalid job id type: {wms_id}") 

1730 if constraint is not None: 

1731 schedds = {ad["name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

1732 job_info = condor_history(constraint=constraint, schedds=schedds, projection=["Iwd"]) 

1733 if job_info: 

1734 _, job_rec = job_info.popitem() 

1735 _, job_ad = job_rec.popitem() 

1736 wms_path = Path(job_ad["Iwd"]) 

1737 return wms_path, id_type 
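# Hedged usage sketch (LOCAL/GLOBAL ids need a live pool; a path id does not,
# assuming the directory below exists):
#
# >>> wms_path, id_type = _wms_id_to_dir("/some/submit/dir")
# >>> id_type == WmsIdType.PATH
# True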

1738 

1739 

1740def _create_periodic_release_expr(memory, multiplier, limit): 

1741 """Construct an HTCondorAd expression for releasing held jobs. 

1742 

1743 The expression instruct HTCondor to release any job which was put on hold 

1744 due to exceeding memory requirements back to the job queue providing it 

1745 satisfies all of the conditions below: 

1746 

1747 * the number of run attempts did not reach the allowable number of retries,

1748 * the memory requirements in the last failed run attempt did not reach 

1749 the specified memory limit. 

1750 

1751 Parameters 

1752 ---------- 

1753 memory : `int` 

1754 Requested memory in MB. 

1755 multiplier : `float` 

1756 Memory growth rate between retries.

1757 limit : `int`

1758 Memory limit in MB.

1759 

1760 Returns 

1761 ------- 

1762 expr : `str` 

1763 A string representing an HTCondor ClassAd expression for releasing jobs 

1764 which have been held due to exceeding the memory requirements. 

1765 """ 

1766 is_retry_allowed = "NumJobStarts <= JobMaxRetries" 

1767 was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}" 

1768 

1769 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1770 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1771 # The special comparison operators ensure that all comparisons below will 

1772 # evaluate to FALSE in this case. 

1773 # 

1774 # Note: 

1775 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1776 # the entire expression should evaluate to FALSE when the job is not HELD. 

1777 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1778 # but better safe than sorry. 

1779 was_mem_exceeded = ( 

1780 "JobStatus == 5 " 

1781 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " 

1782 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1783 ) 

1784 

1785 expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}" 

1786 return expr 
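# Hedged sketch of the generated expression for made-up values (2 GB request,
# doubling per retry, 8 GB cap); only fragments are checked:
#
# >>> expr = _create_periodic_release_expr(2048, 2.0, 8192)
# >>> "NumJobStarts <= JobMaxRetries" in expr
# True
# >>> "min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) < 8192" in expr
# True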

1787 

1788 

1789def _create_periodic_remove_expr(memory, multiplier, limit): 

1790 """Construct an HTCondorAd expression for removing jobs from the queue. 

1791 

1792 The expression instruct HTCondor to remove any job which was put on hold 

1793 due to exceeding memory requirements from the job queue providing it 

1794 satisfies any of the conditions below: 

1795 

1796 * the allowable number of retries was reached,

1797 * the memory requirements during the last failed run attempt reached 

1798 the specified memory limit. 

1799 

1800 Parameters 

1801 ---------- 

1802 memory : `int` 

1803 Requested memory in MB. 

1804 multiplier : `float` 

1805 Memory growth rate between retries.

1806 limit : `int`

1807 Memory limit in MB.

1808 

1809 Returns 

1810 ------- 

1811 expr : `str` 

1812 A string representing an HTCondor ClassAd expression for removing jobs 

1813 which were run at the maximal allowable memory and still exceeded 

1814 the memory requirements. 

1815 """ 

1816 is_retry_disallowed = "NumJobStarts > JobMaxRetries" 

1817 was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}" 

1818 

1819 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1820 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1821 # The special comparison operators ensure that all comparisons below will 

1822 # evaluate to FALSE in this case. 

1823 # 

1824 # Note: 

1825 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1826 # the entire expression should evaluate to FALSE when the job is not HELD. 

1827 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1828 # but better safe than sorry. 

1829 was_mem_exceeded = ( 

1830 "JobStatus == 5 " 

1831 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " 

1832 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1833 ) 

1834 

1835 expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})" 

1836 return expr 
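# Hedged sketch with the same made-up values as above: removal triggers when
# retries are exhausted or the scaled request has already hit the cap.
#
# >>> expr = _create_periodic_remove_expr(2048, 2.0, 8192)
# >>> "NumJobStarts > JobMaxRetries" in expr
# True
# >>> "min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) == 8192" in expr
# True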

1837 

1838 

1839def _create_request_memory_expr(memory, multiplier, limit): 

1840 """Construct an HTCondor ClassAd expression for safe memory scaling. 

1841 

1842 Parameters 

1843 ---------- 

1844 memory : `int` 

1845 Requested memory in MB. 

1846 multiplier : `float` 

1847 Memory growth rate between retries.

1848 limit : `int`

1849 Memory limit in MB.

1850 

1851 Returns 

1852 ------- 

1853 expr : `str` 

1854 A string representing an HTCondor ClassAd expression enabling safe 

1855 memory scaling between job retries. 

1856 """ 

1857 # The check if the job was held due to exceeding memory requirements 

1858 # will be made *after* job was released back to the job queue (is in 

1859 # the IDLE state), hence the need to use `Last*` job ClassAds instead of 

1860 # the ones describing job's current state. 

1861 # 

1862 # Also, 'Last*' job ClassAds attributes are UNDEFINED when a job is 

1863 # initially put in the job queue. The special comparison operators ensure 

1864 # that all comparisons below will evaluate to FALSE in this case. 

1865 was_mem_exceeded = ( 

1866 "LastJobStatus =?= 5 " 

1867 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " 

1868 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)" 

1869 ) 

1870 

1871 # If job runs the first time or was held for reasons other than exceeding 

1872 # the memory, set the required memory to the requested value or use 

1873 # the memory value measured by HTCondor (MemoryUsage) depending on 

1874 # whichever is greater. 

1875 expr = ( 

1876 f"({was_mem_exceeded}) " 

1877 f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) " 

1878 f": max({{{memory}, MemoryUsage ?: 0}})" 

1879 ) 

1880 return expr 
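# Hedged sketch (same made-up values): the ternary scales the request after a
# memory-related hold and otherwise keeps max(request, measured usage).
#
# >>> expr = _create_request_memory_expr(2048, 2.0, 8192)
# >>> expr.startswith("(LastJobStatus =?= 5")
# True
# >>> expr.endswith("max({2048, MemoryUsage ?: 0})")
# True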

1881 

1882 

1883def _locate_schedds(locate_all=False): 

1884 """Find out Scheduler daemons in an HTCondor pool. 

1885 

1886 Parameters 

1887 ---------- 

1888 locate_all : `bool`, optional 

1889 If True, all available Schedulers in the HTCondor pool will be located.

1890 False by default, which limits the search to the Scheduler running

1891 on the local host.

1892 

1893 Returns 

1894 ------- 

1895 schedds : `dict` [`str`, `htcondor.Schedd`] 

1896 A mapping between Scheduler names and Python objects allowing for 

1897 interacting with them. 

1898 """ 

1899 coll = htcondor.Collector() 

1900 

1901 schedd_ads = [] 

1902 if locate_all: 

1903 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

1904 else: 

1905 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

1906 return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 
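# Hedged usage sketch (requires a running collector; names are illustrative):
#
# >>> schedds = _locate_schedds()                  # local Schedd only
# >>> schedds = _locate_schedds(locate_all=True)   # every Schedd in the pool
# >>> list(schedds)                                # e.g. ['sched1.example.com']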

1907 

1908 

1909def _gather_site_values(config, compute_site): 

1910 """Gather values specific to given site. 

1911 

1912 Parameters 

1913 ---------- 

1914 config : `lsst.ctrl.bps.BpsConfig` 

1915 BPS configuration that includes necessary submit/runtime 

1916 information. 

1917 compute_site : `str` 

1918 Compute site name. 

1919 

1920 Returns 

1921 ------- 

1922 site_values : `dict` [`str`, `Any`] 

1923 Values specific to the given site. 

1924 """ 

1925 site_values = {"attrs": {}, "profile": {}} 

1926 search_opts = {} 

1927 if compute_site: 

1928 search_opts["curvals"] = {"curr_site": compute_site} 

1929 

1930 # Determine the hard limit for the memory requirement. 

1931 found, limit = config.search("memoryLimit", opt=search_opts) 

1932 if not found: 

1933 search_opts["default"] = DEFAULT_HTC_EXEC_PATT 

1934 _, patt = config.search("executeMachinesPattern", opt=search_opts) 

1935 del search_opts["default"] 

1936 

1937 # To reduce the amount of data, ignore dynamic slots (if any) as, 

1938 # by definition, they cannot have more memory than 

1939 # the partitionable slot they are part of.

1940 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)' 

1941 pool_info = condor_status(constraint=constraint) 

1942 try: 

1943 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values()) 

1944 except ValueError: 

1945 _LOG.debug("No execute machine in the pool matches %s", patt) 

1946 if limit: 

1947 config[".bps_defined.memory_limit"] = limit 

1948 

1949 _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False}) 

1950 site_values["memoryLimit"] = limit 

1951 

1952 found, value = config.search("accountingGroup", opt=search_opts) 

1953 if found: 

1954 site_values["accountingGroup"] = value 

1955 found, value = config.search("accountingUser", opt=search_opts) 

1956 if found: 

1957 site_values["accountingUser"] = value 

1958 

1959 key = f".site.{compute_site}.profile.condor" 

1960 if key in config: 

1961 for key, val in config[key].items(): 

1962 if key.startswith("+"): 

1963 site_values["attrs"][key[1:]] = val 

1964 else: 

1965 site_values["profile"][key] = val 

1966 

1967 return site_values
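# Hedged sketch of the returned structure, given a BpsConfig `config` (keys
# below are always present; accounting keys appear only when configured):
#
# >>> site_values = _gather_site_values(config, "example_site")
# >>> site_values["memoryLimit"]   # hard cap used by the *_expr helpers above
# >>> site_values["attrs"]         # ClassAd attributes from '+'-prefixed keys
# >>> site_values["profile"]       # remaining per-site condor settings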