Coverage for python/lsst/ctrl/bps/htcondor/htcondor_service.py: 7%

693 statements  

coverage.py v7.3.2, created at 2023-11-10 19:16 +0000

1# This file is part of ctrl_bps_htcondor. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <https://www.gnu.org/licenses/>. 

27 

28"""Interface between generic workflow to HTCondor workflow system. 

29""" 

30 

31__all__ = ["HTCondorService", "HTCondorWorkflow"] 

32 

33 

34import logging 

35import os 

36import re 

37from collections import defaultdict 

38from enum import IntEnum, auto 

39from pathlib import Path 

40 

41import htcondor 

42from lsst.ctrl.bps import ( 

43 BaseWmsService, 

44 BaseWmsWorkflow, 

45 GenericWorkflow, 

46 GenericWorkflowJob, 

47 WmsJobReport, 

48 WmsRunReport, 

49 WmsStates, 

50) 

51from lsst.ctrl.bps.bps_utils import chdir, create_count_summary 

52from lsst.utils.timer import time_this 

53from packaging import version 

54 

55from .lssthtc import ( 

56 MISSING_ID, 

57 HTCDag, 

58 HTCJob, 

59 JobStatus, 

60 NodeStatus, 

61 condor_q, 

62 condor_search, 

63 condor_status, 

64 htc_backup_files, 

65 htc_check_dagman_output, 

66 htc_create_submit_from_cmd, 

67 htc_create_submit_from_dag, 

68 htc_create_submit_from_file, 

69 htc_escape, 

70 htc_submit_dag, 

71 htc_version, 

72 pegasus_name_to_label, 

73 read_dag_info, 

74 read_dag_log, 

75 read_dag_status, 

76 read_node_status, 

77 summary_from_dag, 

78 write_dag_info, 

79) 

80 

81 

82class WmsIdType(IntEnum): 

83 """Type of valid WMS ids.""" 

84 

85 UNKNOWN = auto() 

86 """The type of id cannot be determined. 

87 """ 

88 

89 LOCAL = auto() 

90 """The id is HTCondor job's ClusterId (with optional '.ProcId'). 

91 """ 

92 

93 GLOBAL = auto() 

94 """Id is a HTCondor's global job id. 

95 """ 

96 

97 PATH = auto() 

98 """Id is a submission path. 

99 """ 

100 

101 

102DEFAULT_HTC_EXEC_PATT = ".*worker.*" 

103"""Default pattern for searching execute machines in an HTCondor pool. 

104""" 

105 

106_LOG = logging.getLogger(__name__) 

107 

108 

109class HTCondorService(BaseWmsService): 

110 """HTCondor version of WMS service.""" 

111 

112 def prepare(self, config, generic_workflow, out_prefix=None): 

113 """Convert generic workflow to an HTCondor DAG ready for submission. 

114 

115 Parameters 

116 ---------- 

117 config : `lsst.ctrl.bps.BpsConfig` 

118 BPS configuration that includes necessary submit/runtime 

119 information. 

120 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

121 The generic workflow (e.g., has executable name and arguments). 

122 out_prefix : `str` 

123 The root directory into which all WMS-specific files are written. 

124 

125 Returns 

126 ------- 

127 workflow : `lsst.ctrl.bps.htcondor.HTCondorWorkflow`

128 HTCondor workflow ready to be run. 

129 """ 

130 _LOG.debug("out_prefix = '%s'", out_prefix) 

131 with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed HTCondor workflow creation"): 

132 workflow = HTCondorWorkflow.from_generic_workflow( 

133 config, 

134 generic_workflow, 

135 out_prefix, 

136 f"{self.__class__.__module__}.{self.__class__.__name__}", 

137 ) 

138 

139 with time_this( 

140 log=_LOG, level=logging.INFO, prefix=None, msg="Completed writing out HTCondor workflow" 

141 ): 

142 workflow.write(out_prefix) 

143 return workflow 

144 

145 def submit(self, workflow): 

146 """Submit a single HTCondor workflow. 

147 

148 Parameters 

149 ---------- 

150 workflow : `lsst.ctrl.bps.BaseWmsWorkflow`

151 A single HTCondor workflow to submit. run_id is updated after 

152 successful submission to WMS. 

153 """ 

154 dag = workflow.dag 

155 

156 ver = version.parse(htc_version()) 

157 if ver >= version.parse("8.9.3"): 

158 sub = htc_create_submit_from_dag(dag.graph["dag_filename"], {}) 

159 else: 

160 sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], {}) 

161 

162 # For workflow portability, internal paths are all relative. Hence 

163 # the DAG needs to be submitted to HTCondor from inside the submit 

164 # directory. 

165 with chdir(workflow.submit_path): 

166 _LOG.info("Submitting from directory: %s", os.getcwd()) 

167 schedd_dag_info = htc_submit_dag(sub) 

168 if schedd_dag_info: 

169 write_dag_info(f"{dag.name}.info.json", schedd_dag_info) 

170 

171 _, dag_info = schedd_dag_info.popitem() 

172 _, dag_ad = dag_info.popitem() 

173 

174 dag.run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}" 

175 workflow.run_id = dag.run_id 

176 else: 

177 raise RuntimeError("Submission failed: unable to retrieve DAGMan job information") 

178 

179 def restart(self, wms_workflow_id): 

180 """Restart a failed DAGMan workflow. 

181 

182 Parameters 

183 ---------- 

184 wms_workflow_id : `str` 

185 The directory with HTCondor files. 

186 

187 Returns 

188 ------- 

189 run_id : `str` 

190 HTCondor id of the restarted DAGMan job. If restart failed, it will 

191 be set to None. 

192 run_name : `str` 

193 Name of the restarted workflow. If restart failed, it will be set 

194 to None. 

195 message : `str` 

196 A message describing any issues encountered during the restart. 

197 If there were no issues, an empty string is returned. 

198 """ 

199 wms_path = Path(wms_workflow_id) 

200 if not wms_path.is_dir(): 

201 return None, None, f"Directory '{wms_path}' not found" 

202 

203 _LOG.info("Restarting workflow from directory '%s'", wms_path) 

204 rescue_dags = list(wms_path.glob("*.dag.rescue*")) 

205 if not rescue_dags: 

206 return None, None, f"HTCondor rescue DAG(s) not found in '{wms_path}'" 

207 

208 _LOG.info("Verifying that the workflow is not already in the job queue") 

209 schedd_dag_info = condor_q(constraint=f'regexp("dagman$", Cmd) && Iwd == "{wms_workflow_id}"') 

210 if schedd_dag_info: 

211 _, dag_info = schedd_dag_info.popitem() 

212 _, dag_ad = dag_info.popitem() 

213 id_ = dag_ad["GlobalJobId"] 

214 return None, None, f"Workflow already in the job queue (global job id: '{id_}')" 

215 

216 _LOG.info("Checking execution status of the workflow") 

217 warn = False 

218 dag_ad = read_dag_status(str(wms_path)) 

219 if dag_ad: 

220 nodes_total = dag_ad.get("NodesTotal", 0) 

221 if nodes_total != 0: 

222 nodes_done = dag_ad.get("NodesDone", 0) 

223 if nodes_total == nodes_done: 

224 return None, None, "All jobs in the workflow finished successfully" 

225 else: 

226 warn = True 

227 else: 

228 warn = True 

229 if warn: 

230 _LOG.warning( 

231 "Cannot determine the execution status of the workflow, continuing with restart regardless" 

232 ) 

233 

234 _LOG.info("Backing up select HTCondor files from previous run attempt") 

235 htc_backup_files(wms_path, subdir="backups") 

236 

237 # For workflow portability, internal paths are all relative. Hence 

238 # the DAG needs to be resubmitted to HTCondor from inside the submit 

239 # directory. 

240 _LOG.info("Adding workflow to the job queue") 

241 run_id, run_name, message = None, None, "" 

242 with chdir(wms_path): 

243 try: 

244 dag_path = next(wms_path.glob("*.dag.condor.sub")) 

245 except StopIteration: 

246 message = f"DAGMan submit description file not found in '{wms_path}'" 

247 else: 

248 sub = htc_create_submit_from_file(dag_path.name) 

249 schedd_dag_info = htc_submit_dag(sub) 

250 

251 # Save select information about the DAGMan job to a file. Use 

252 # the run name (available in the ClassAd) as the filename. 

253 if schedd_dag_info: 

254 dag_info = next(iter(schedd_dag_info.values())) 

255 dag_ad = next(iter(dag_info.values())) 

256 write_dag_info(f"{dag_ad['bps_run']}.info.json", schedd_dag_info) 

257 run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}" 

258 run_name = dag_ad["bps_run"] 

259 else: 

260 message = "DAGMan job information unavailable" 

261 

262 return run_id, run_name, message 

263 

264 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False): 

265 """Query WMS for list of submitted WMS workflows/jobs. 

266 

267 This should be a quick lookup function to create list of jobs for 

268 other functions. 

269 

270 Parameters 

271 ---------- 

272 wms_id : `int` or `str`, optional 

273 Id or path that can be used by WMS service to look up job. 

274 user : `str`, optional 

275 User whose submitted jobs should be listed. 

276 require_bps : `bool`, optional 

277 Whether to require jobs returned in list to be bps-submitted jobs. 

278 pass_thru : `str`, optional 

279 Information to pass through to WMS. 

280 is_global : `bool`, optional 

281 If set, all job queues (and their histories) will be queried for 

282 job information. Defaults to False which means that only the local 

283 job queue will be queried. 

284 

285 Returns 

286 ------- 

287 job_ids : `list` [`Any`] 

288 Only job ids to be used by cancel and other functions. Typically 

289 this means top-level jobs (i.e., not child jobs).

290 """ 

291 _LOG.debug( 

292 "list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s", 

293 wms_id, 

294 user, 

295 require_bps, 

296 pass_thru, 

297 is_global, 

298 ) 

299 

300 # Determine which Schedds will be queried for job information. 

301 coll = htcondor.Collector() 

302 

303 schedd_ads = [] 

304 if is_global: 

305 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

306 else: 

307 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

308 

309 # Construct appropriate constraint expression using provided arguments. 

310 constraint = "False" 

311 if wms_id is None: 

312 if user is not None: 

313 constraint = f'(Owner == "{user}")' 

314 else: 

315 schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id) 

316 if cluster_id is not None: 

317 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})" 

318 

319 # If provided id is either a submission path or a global id, 

320 # make sure the right Schedd will be queried regardless of 

321 # 'is_global' value. 

322 if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}: 

323 schedd_ads = [schedd_ad] 

324 if require_bps: 

325 constraint += ' && (bps_isjob == "True")' 

326 if pass_thru: 

327 if "-forcex" in pass_thru: 

328 pass_thru_2 = pass_thru.replace("-forcex", "") 

329 if pass_thru_2 and not pass_thru_2.isspace(): 

330 constraint += f" && ({pass_thru_2})" 

331 else: 

332 constraint += f" && ({pass_thru})" 

333 

334 # Create a list of scheduler daemons which need to be queried. 

335 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

336 

337 _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds)) 

338 results = condor_q(constraint=constraint, schedds=schedds) 

339 

340 # Prune child jobs where DAG job is in queue (i.e., aren't orphans). 

341 job_ids = [] 

342 for schedd_name, job_info in results.items(): 

343 for job_id, job_ad in job_info.items(): 

344 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None")) 

345 if "DAGManJobId" not in job_ad: 

346 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

347 else: 

348 _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0") 

349 _LOG.debug("\tin jobs.keys() = %s", job_info.keys()) 

350 if f"{job_ad['DAGManJobId']}.0" not in job_info: # orphaned job 

351 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

352 

353 _LOG.debug("job_ids = %s", job_ids) 

354 return job_ids 

355 

356 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False): 

357 """Return run information based upon given constraints. 

358 

359 Parameters 

360 ---------- 

361 wms_workflow_id : `str`, optional 

362 Limit to specific run based on id. 

363 user : `str`, optional 

364 Limit results to runs for this user. 

365 hist : `float`, optional 

366 Limit history search to this many days. Defaults to 0. 

367 pass_thru : `str`, optional 

368 Constraints to pass through to HTCondor. 

369 is_global : `bool`, optional 

370 If set, all job queues (and their histories) will be queried for 

371 job information. Defaults to False which means that only the local 

372 job queue will be queried. 

373 

374 Returns 

375 ------- 

376 runs : `list` [`lsst.ctrl.bps.WmsRunReport`] 

377 Information about runs from given job information. 

378 message : `str` 

379 Extra message for report command to print. This could be pointers 

380 to documentation or to WMS specific commands. 

381 """ 

382 if wms_workflow_id: 

383 id_type = _wms_id_type(wms_workflow_id) 

384 if id_type == WmsIdType.LOCAL: 

385 schedulers = _locate_schedds(locate_all=is_global) 

386 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

387 elif id_type == WmsIdType.GLOBAL: 

388 schedulers = _locate_schedds(locate_all=True) 

389 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

390 elif id_type == WmsIdType.PATH: 

391 run_reports, message = _report_from_path(wms_workflow_id) 

392 else: 

393 run_reports, message = {}, "Invalid job id" 

394 else: 

395 schedulers = _locate_schedds(locate_all=is_global) 

396 run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers) 

397 _LOG.debug("report: %s, %s", run_reports, message) 

398 

399 return list(run_reports.values()), message 

400 

401 def cancel(self, wms_id, pass_thru=None): 

402 """Cancel submitted workflows/jobs. 

403 

404 Parameters 

405 ---------- 

406 wms_id : `str` 

407 Id or path of job that should be canceled. 

408 pass_thru : `str`, optional 

409 Information to pass through to WMS. 

410 

411 Returns 

412 ------- 

413 deleted : `bool` 

414 Whether the deletion was successful. Currently, if there is any doubt

415 or if any individual jobs were not deleted, False is returned.

416 message : `str` 

417 Any message from WMS (e.g., error details). 

418 """ 

419 _LOG.debug("Canceling wms_id = %s", wms_id) 

420 

421 schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id) 

422 

423 if cluster_id is None: 

424 deleted = False 

425 message = "invalid id" 

426 else: 

427 _LOG.debug( 

428 "Canceling job managed by schedd_name = %s with cluster_id = %s", 

429 schedd_ad["Name"],

430 cluster_id,

431 ) 

432 schedd = htcondor.Schedd(schedd_ad) 

433 

434 constraint = f"ClusterId == {cluster_id}" 

435 if pass_thru is not None and "-forcex" in pass_thru: 

436 pass_thru_2 = pass_thru.replace("-forcex", "") 

437 if pass_thru_2 and not pass_thru_2.isspace(): 

438 constraint += f"&& ({pass_thru_2})" 

439 _LOG.debug("JobAction.RemoveX constraint = %s", constraint) 

440 results = schedd.act(htcondor.JobAction.RemoveX, constraint) 

441 else: 

442 if pass_thru: 

443 constraint += f"&& ({pass_thru})" 

444 _LOG.debug("JobAction.Remove constraint = %s", constraint) 

445 results = schedd.act(htcondor.JobAction.Remove, constraint) 

446 _LOG.debug("Remove results: %s", results) 

447 

448 if results["TotalSuccess"] > 0 and results["TotalError"] == 0: 

449 deleted = True 

450 message = "" 

451 else: 

452 deleted = False 

453 if results["TotalSuccess"] == 0 and results["TotalError"] == 0: 

454 message = "no such bps job in batch queue" 

455 else: 

456 message = f"unknown problems deleting: {results}" 

457 

458 _LOG.debug("deleted: %s; message = %s", deleted, message) 

459 return deleted, message 

460 

461 

462class HTCondorWorkflow(BaseWmsWorkflow): 

463 """Single HTCondor workflow. 

464 

465 Parameters 

466 ---------- 

467 name : `str` 

468 Unique name for Workflow used when naming files. 

469 config : `lsst.ctrl.bps.BpsConfig` 

470 BPS configuration that includes necessary submit/runtime information. 

471 """ 

472 

473 def __init__(self, name, config=None): 

474 super().__init__(name, config) 

475 self.dag = None 

476 

477 @classmethod 

478 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class): 

479 # Docstring inherited 

480 htc_workflow = cls(generic_workflow.name, config) 

481 htc_workflow.dag = HTCDag(name=generic_workflow.name) 

482 

483 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs) 

484 htc_workflow.dag.add_attribs(generic_workflow.run_attrs) 

485 htc_workflow.dag.add_attribs( 

486 { 

487 "bps_wms_service": service_class, 

488 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}", 

489 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts), 

490 "bps_job_summary": create_count_summary(generic_workflow.job_counts), 

491 } 

492 ) 

493 

494 _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""}) 

495 if isinstance(tmp_template, str): 

496 subdir_template = defaultdict(lambda: tmp_template) 

497 else: 

498 subdir_template = tmp_template 

499 

500 # Create all DAG jobs 

501 site_values = {} # cache compute site specific values to reduce config lookups 

502 for job_name in generic_workflow: 

503 gwjob = generic_workflow.get_job(job_name) 

504 if gwjob.compute_site not in site_values: 

505 site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site) 

506 htc_job = _create_job( 

507 subdir_template[gwjob.label], 

508 site_values[gwjob.compute_site], 

509 generic_workflow, 

510 gwjob, 

511 out_prefix, 

512 ) 

513 htc_workflow.dag.add_job(htc_job) 

514 

515 # Add job dependencies to the DAG 

516 for job_name in generic_workflow: 

517 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name)) 

518 

519 # If final job exists in generic workflow, create DAG final job 

520 final = generic_workflow.get_final() 

521 if final and isinstance(final, GenericWorkflowJob): 

522 if final.compute_site and final.compute_site not in site_values: 

523 site_values[final.compute_site] = _gather_site_values(config, final.compute_site) 

524 final_htjob = _create_job( 

525 subdir_template[final.label], 

526 site_values[final.compute_site], 

527 generic_workflow, 

528 final, 

529 out_prefix, 

530 ) 

531 if "post" not in final_htjob.dagcmds: 

532 final_htjob.dagcmds[ 

533 "post" 

534 ] = f"{os.path.dirname(__file__)}/final_post.sh {final.name} $DAG_STATUS $RETURN" 

535 htc_workflow.dag.add_final_job(final_htjob) 

536 elif final and isinstance(final, GenericWorkflow): 

537 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job") 

538 elif final: 

539 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

540 

541 return htc_workflow 

542 

543 def write(self, out_prefix): 

544 """Output HTCondor DAGMan files needed for workflow submission. 

545 

546 Parameters 

547 ---------- 

548 out_prefix : `str` 

549 Directory prefix for HTCondor files. 

550 """ 

551 self.submit_path = out_prefix 

552 os.makedirs(out_prefix, exist_ok=True) 

553 

554 # Write down the workflow in HTCondor format. 

555 self.dag.write(out_prefix, "jobs/{self.label}") 

556 

557 

558def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix): 

559 """Convert GenericWorkflow job nodes to DAG jobs. 

560 

561 Parameters 

562 ---------- 

563 subdir_template : `str` 

564 Template for making subdirs. 

565 site_values : `dict` 

566 Site-specific values.

567 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

568 Generic workflow that is being converted. 

569 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

570 The generic job to convert to a HTCondor job. 

571 out_prefix : `str` 

572 Directory prefix for HTCondor files. 

573 

574 Returns 

575 ------- 

576 htc_job : `lsst.ctrl.bps.htcondor.HTCJob`

577 The HTCondor job equivalent to the given generic job. 

578 """ 

579 htc_job = HTCJob(gwjob.name, label=gwjob.label) 

580 

581 curvals = defaultdict(str) 

582 curvals["label"] = gwjob.label 

583 if gwjob.tags: 

584 curvals.update(gwjob.tags) 

585 

586 subdir = subdir_template.format_map(curvals) 

587 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub" 

588 

589 htc_job_cmds = { 

590 "universe": "vanilla", 

591 "should_transfer_files": "YES", 

592 "when_to_transfer_output": "ON_EXIT_OR_EVICT", 

593 "transfer_output_files": '""', # Set to empty string to disable 

594 "transfer_executable": "False", 

595 "getenv": "True", 

596 # Exceeding memory sometimes triggers a SIGBUS or SIGSEGV error. Tell

597 # HTCondor to put on hold any jobs which exited via a signal.

598 "on_exit_hold": "ExitBySignal == true", 

599 "on_exit_hold_reason": 'strcat("Job raised a signal ", string(ExitSignal), ". ", ' 

600 '"Handling signal as if job has gone over memory limit.")', 

601 "on_exit_hold_subcode": "34", 

602 } 

603 

604 htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob)) 

605 

606 # job stdout, stderr, htcondor user log. 

607 for key in ("output", "error", "log"): 

608 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}") 

609 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key]) 

610 

611 htc_job_cmds.update( 

612 _handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], out_prefix) 

613 ) 

614 

615 # Add the job cmds dict to the job object. 

616 htc_job.add_job_cmds(htc_job_cmds) 

617 

618 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob)) 

619 

620 # Add job attributes to job. 

621 _LOG.debug("gwjob.attrs = %s", gwjob.attrs) 

622 htc_job.add_job_attrs(gwjob.attrs) 

623 htc_job.add_job_attrs(site_values["attrs"]) 

624 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)}) 

625 htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label}) 

626 

627 return htc_job 

628 

629 

630def _translate_job_cmds(cached_vals, generic_workflow, gwjob): 

631 """Translate the job data that are one to one mapping 

632 

633 Parameters 

634 ---------- 

635 cached_vals : `dict` [`str`, `Any`] 

636 Config values common to jobs with same label. 

637 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

638 Generic workflow that contains the job being converted.

639 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

640 Generic workflow job to be converted. 

641 

642 Returns 

643 ------- 

644 htc_job_commands : `dict` [`str`, `Any`] 

645 Contains commands which can appear in the HTCondor submit description 

646 file. 

647 """ 

648 # Values in the job script that are just name mappings.

649 job_translation = { 

650 "mail_to": "notify_user", 

651 "when_to_mail": "notification", 

652 "request_cpus": "request_cpus", 

653 "priority": "priority", 

654 "category": "category", 

655 "accounting_group": "accounting_group", 

656 "accounting_user": "accounting_group_user", 

657 } 

658 

659 jobcmds = {} 

660 for gwkey, htckey in job_translation.items(): 

661 jobcmds[htckey] = getattr(gwjob, gwkey, None) 

662 

663 # If accounting info was not set explicitly, use site settings if any. 

664 if not gwjob.accounting_group: 

665 jobcmds["accounting_group"] = cached_vals.get("accountingGroup") 

666 if not gwjob.accounting_user: 

667 jobcmds["accounting_group_user"] = cached_vals.get("accountingUser") 

668 

669 # job commands that need modification 

670 if gwjob.number_of_retries: 

671 jobcmds["max_retries"] = f"{gwjob.number_of_retries}" 

672 

673 if gwjob.retry_unless_exit: 

674 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}" 

675 

676 if gwjob.request_disk: 

677 jobcmds["request_disk"] = f"{gwjob.request_disk}MB" 

678 

679 if gwjob.request_memory: 

680 jobcmds["request_memory"] = f"{gwjob.request_memory}" 

681 

682 if gwjob.memory_multiplier: 

683 # Do not use try-except! At the moment, BpsConfig returns an empty 

684 # string if it does not contain the key. 

685 memory_limit = cached_vals["memoryLimit"] 

686 if not memory_limit: 

687 raise RuntimeError( 

688 "Memory autoscaling enabled, but automatic detection of the memory limit " 

689 "failed; setting it explicitly with 'memoryLimit' or changing worker node " 

690 "search pattern 'executeMachinesPattern' might help." 

691 ) 

692 

693 # Set maximal amount of memory job can ask for. 

694 # 

695 # The check below assumes that 'memory_limit' was set to a value which 

696 # realistically reflects actual physical limitations of a given compute 

697 # resource. 

698 memory_max = memory_limit 

699 if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit: 

700 memory_max = gwjob.request_memory_max 

701 

702 # Make the job ask for more memory each time it fails due to insufficient

703 # memory.

704 jobcmds["request_memory"] = _create_request_memory_expr( 

705 gwjob.request_memory, gwjob.memory_multiplier, memory_max 

706 ) 

707 

708 # Periodically release jobs which are being held due to exceeding 

709 # memory. Stop doing that (by removing the job from the HTCondor queue) 

710 # after the maximal number of retries has been reached or the job has

711 # already run at the maximal allowed memory.

712 jobcmds["periodic_release"] = _create_periodic_release_expr( 

713 gwjob.request_memory, gwjob.memory_multiplier, memory_max 

714 ) 

715 jobcmds["periodic_remove"] = _create_periodic_remove_expr( 

716 gwjob.request_memory, gwjob.memory_multiplier, memory_max 

717 ) 

718 

719 # Assume concurrency_limit implemented using HTCondor concurrency limits. 

720 # May need to move to special site-specific implementation if sites use 

721 # other mechanisms. 

722 if gwjob.concurrency_limit: 

723 jobcmds["concurrency_limit"] = gwjob.concurrency_limit 

724 

725 # Handle command line 

726 if gwjob.executable.transfer_executable: 

727 jobcmds["transfer_executable"] = "True" 

728 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri) 

729 else: 

730 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri) 

731 

732 if gwjob.arguments: 

733 arguments = gwjob.arguments 

734 arguments = _replace_cmd_vars(arguments, gwjob) 

735 arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob) 

736 arguments = _fix_env_var_syntax(arguments) 

737 jobcmds["arguments"] = arguments 

738 

739 # Add extra "pass-thru" job commands 

740 if gwjob.profile: 

741 for key, val in gwjob.profile.items(): 

742 jobcmds[key] = htc_escape(val) 

743 for key, val in cached_vals["profile"].items(): 

744 jobcmds[key] = htc_escape(val) 

745 

746 return jobcmds 

747 

748 

749def _translate_dag_cmds(gwjob): 

750 """Translate job values into DAGMan commands. 

751 

752 Parameters 

753 ---------- 

754 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

755 Job containing values to be translated. 

756 

757 Returns 

758 ------- 

759 dagcmds : `dict` [`str`, `Any`] 

760 DAGMan commands for the job. 

761 """ 

762 # Values in the DAG script that are just name mappings.

763 dag_translation = {"abort_on_value": "abort_dag_on", "abort_return_value": "abort_exit"} 

764 

765 dagcmds = {} 

766 for gwkey, htckey in dag_translation.items(): 

767 dagcmds[htckey] = getattr(gwjob, gwkey, None) 

768 

769 # Still to be coded: vars "pre_cmdline", "post_cmdline" 

770 return dagcmds 

771 

772 

773def _fix_env_var_syntax(oldstr): 

774 """Change ENV place holders to HTCondor Env var syntax. 

775 

776 Parameters 

777 ---------- 

778 oldstr : `str` 

779 String in which environment variable syntax is to be fixed. 

780 

781 Returns 

782 ------- 

783 newstr : `str` 

784 Given string with environment variable syntax fixed. 

785 """ 

786 newstr = oldstr 

787 for key in re.findall(r"<ENV:([^>]+)>", oldstr): 

788 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})") 

789 return newstr 
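# Illustrative example (not in the listed source): given the regex above,
#     _fix_env_var_syntax("<ENV:HOME>/repo/butler.yaml")
# returns "$ENV(HOME)/repo/butler.yaml".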

790 

791 

792def _replace_file_vars(use_shared, arguments, workflow, gwjob): 

793 """Replace file placeholders in command line arguments with correct 

794 physical file names. 

795 

796 Parameters 

797 ---------- 

798 use_shared : `bool` 

799 Whether HTCondor can assume shared filesystem. 

800 arguments : `str` 

801 Arguments string in which to replace file placeholders. 

802 workflow : `lsst.ctrl.bps.GenericWorkflow` 

803 Generic workflow that contains file information. 

804 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

805 The job corresponding to the arguments. 

806 

807 Returns 

808 ------- 

809 arguments : `str` 

810 Given arguments string with file placeholders replaced. 

811 """ 

812 # Replace input file placeholders with paths. 

813 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False): 

814 if not gwfile.wms_transfer: 

815 # Must assume full URI if in command line and told WMS is not 

816 # responsible for transferring file. 

817 uri = gwfile.src_uri 

818 elif use_shared: 

819 if gwfile.job_shared: 

820 # Have shared filesystems and jobs can share file. 

821 uri = gwfile.src_uri 

822 else: 

823 # Taking advantage of inside knowledge. Not future-proof. 

824 # Temporary fix until there is a job wrapper that pulls files

825 # within the job.

826 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml": 

827 uri = "butler.yaml" 

828 else: 

829 uri = os.path.basename(gwfile.src_uri) 

830 else: # Using push transfer 

831 uri = os.path.basename(gwfile.src_uri) 

832 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

833 

834 # Replace output file placeholders with paths. 

835 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False): 

836 if not gwfile.wms_transfer: 

837 # Must assume full URI if in command line and told WMS is not 

838 # responsible for transferring file. 

839 uri = gwfile.src_uri 

840 elif use_shared: 

841 if gwfile.job_shared: 

842 # Have shared filesystems and jobs can share file. 

843 uri = gwfile.src_uri 

844 else: 

845 uri = os.path.basename(gwfile.src_uri) 

846 else: # Using push transfer 

847 uri = os.path.basename(gwfile.src_uri) 

848 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

849 return arguments 

850 

851 

852def _replace_cmd_vars(arguments, gwjob): 

853 """Replace format-style placeholders in arguments. 

854 

855 Parameters 

856 ---------- 

857 arguments : `str` 

858 Arguments string in which to replace placeholders. 

859 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

860 Job containing values to be used to replace placeholders 

861 (in particular gwjob.cmdvals). 

862 

863 Returns 

864 ------- 

865 arguments : `str` 

866 Given arguments string with placeholders replaced. 

867 """ 

868 try: 

869 arguments = arguments.format(**gwjob.cmdvals) 

870 except (KeyError, TypeError): # TypeError in case None instead of {} 

871 _LOG.error( 

872 "Could not replace command variables:\narguments: %s\ncmdvals: %s", arguments, gwjob.cmdvals 

873 ) 

874 raise 

875 return arguments 
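# Illustrative example (not in the listed source): with a hypothetical job whose
# cmdvals is {"qgraphFile": "run.qgraph", "jobname": "calibrate_42"},
#     _replace_cmd_vars("-g {qgraphFile} --name {jobname}", gwjob)
# returns "-g run.qgraph --name calibrate_42" via str.format(**gwjob.cmdvals).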

876 

877 

878def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str): 

879 """Add job input files from generic workflow to job. 

880 

881 Parameters 

882 ---------- 

883 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

884 The generic workflow (e.g., has executable name and arguments). 

885 job_name : `str` 

886 Unique name for the job. 

887 use_shared : `bool` 

888 Whether job has access to files via shared filesystem. 

889 out_prefix : `str` 

890 The root directory into which all WMS-specific files are written. 

891 

892 Returns 

893 ------- 

894 htc_commands : `dict` [`str`, `str`] 

895 HTCondor commands for the job submission script. 

896 """ 

897 htc_commands = {} 

898 inputs = [] 

899 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True): 

900 _LOG.debug("src_uri=%s", gwf_file.src_uri) 

901 

902 uri = Path(gwf_file.src_uri) 

903 

904 # Note if use_shared and job_shared, don't need to transfer file. 

905 

906 if not use_shared: # Copy file using push to job 

907 inputs.append(str(uri.relative_to(out_prefix))) 

908 elif not gwf_file.job_shared: # Jobs require own copy 

909 # If using a shared filesystem but the job still needs its own copy, use

910 # HTCondor's curl plugin for a local copy. 

911 

912 # Execution butler is represented as a directory which the 

913 # curl plugin does not handle. Taking advantage of inside 

914 # knowledge as a temporary fix until there is a job wrapper that pulls

915 # files within the job.

916 if gwf_file.name == "butlerConfig": 

917 # The execution butler directory doesn't normally exist until 

918 # the submit phase so checking for suffix instead of using 

919 # is_dir(). If another non-yaml file exists, it would have a

920 # different gwf_file.name. 

921 if uri.suffix == ".yaml": # Single file, so just copy. 

922 inputs.append(f"file://{uri}") 

923 else: 

924 inputs.append(f"file://{uri / 'butler.yaml'}") 

925 inputs.append(f"file://{uri / 'gen3.sqlite3'}") 

926 elif uri.is_dir(): 

927 raise RuntimeError( 

928 f"HTCondor plugin cannot transfer directories locally within job {gwf_file.src_uri}" 

929 ) 

930 else: 

931 inputs.append(f"file://{uri}") 

932 

933 if inputs: 

934 htc_commands["transfer_input_files"] = ",".join(inputs) 

935 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"]) 

936 return htc_commands 
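# Illustrative example (not in the listed source): with use_shared=False and a single
# transfer-only input whose src_uri is "<out_prefix>/inputs/cfg.yaml", the returned
# dict is {"transfer_input_files": "inputs/cfg.yaml"}, i.e. the path made relative
# to out_prefix.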

937 

938 

939def _report_from_path(wms_path): 

940 """Gather run information from a given run directory. 

941 

942 Parameters 

943 ---------- 

944 wms_path : `str` 

945 The directory containing the submit side files (e.g., HTCondor files). 

946 

947 Returns 

948 ------- 

949 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

950 Run information for the detailed report. The key is the HTCondor id 

951 and the value is a collection of report information for that run. 

952 message : `str` 

953 Message to be printed with the summary report. 

954 """ 

955 wms_workflow_id, jobs, message = _get_info_from_path(wms_path) 

956 if wms_workflow_id == MISSING_ID: 

957 run_reports = {} 

958 else: 

959 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs) 

960 return run_reports, message 

961 

962 

963def _report_from_id(wms_workflow_id, hist, schedds=None): 

964 """Gather run information using workflow id. 

965 

966 Parameters 

967 ---------- 

968 wms_workflow_id : `str` 

969 Limit to specific run based on id. 

970 hist : `float` 

971 Limit history search to this many days. 

972 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional 

973 HTCondor schedulers to query for job information. If None

974 (default), all queries will be run against the local scheduler only. 

975 

976 Returns 

977 ------- 

978 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

979 Run information for the detailed report. The key is the HTCondor id 

980 and the value is a collection of report information for that run. 

981 message : `str` 

982 Message to be printed with the summary report. 

983 """ 

984 messages = [] 

985 

986 # Collect information about the job by querying HTCondor schedd and 

987 # HTCondor history. 

988 schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds) 

989 if len(schedd_dag_info) == 1: 

990 # Extract the DAG info without altering the results of the query. 

991 schedd_name = next(iter(schedd_dag_info)) 

992 dag_id = next(iter(schedd_dag_info[schedd_name])) 

993 dag_ad = schedd_dag_info[schedd_name][dag_id] 

994 

995 # If the provided workflow id does not correspond to the one extracted 

996 # from the DAGMan log file in the submit directory, rerun the query 

997 # with the id found in the file. 

998 # 

999 # This is to cover the situation in which the user provided the old job 

1000 # id of a restarted run. 

1001 try: 

1002 path_dag_id, path_dag_ad = read_dag_log(dag_ad["Iwd"]) 

1003 except FileNotFoundError as exc: 

1004 # At the moment missing DAGMan log is pretty much a fatal error. 

1005 # So empty the DAG info to finish early (see the if statement 

1006 # below). 

1007 schedd_dag_info.clear()

1008 messages.append(f"Cannot create the report for '{dag_id}': {exc}") 

1009 else: 

1010 if path_dag_id != dag_id: 

1011 schedd_dag_info = _get_info_from_schedd(path_dag_id, hist, schedds) 

1012 messages.append( 

1013 f"WARNING: Found newer workflow executions in same submit directory as id '{dag_id}'. " 

1014 "This normally occurs when a run is restarted. The report shown is for the most " 

1015 f"recent status with run id '{path_dag_id}'" 

1016 ) 

1017 

1018 if len(schedd_dag_info) == 0: 

1019 run_reports = {} 

1020 elif len(schedd_dag_info) == 1: 

1021 _, dag_info = schedd_dag_info.popitem() 

1022 dag_id, dag_ad = dag_info.popitem() 

1023 

1024 # Create a mapping between jobs and their classads. The keys will 

1025 # be of format 'ClusterId.ProcId'. 

1026 job_info = {dag_id: dag_ad} 

1027 

1028 # Find jobs (nodes) belonging to that DAGMan job. 

1029 job_constraint = f"DAGManJobId == {int(float(dag_id))}" 

1030 schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds) 

1031 if schedd_job_info: 

1032 _, node_info = schedd_job_info.popitem() 

1033 job_info.update(node_info) 

1034 

1035 # Collect additional pieces of information about jobs using HTCondor 

1036 # files in the submission directory. 

1037 _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"]) 

1038 _update_jobs(job_info, path_jobs) 

1039 if message: 

1040 messages.append(message) 

1041 run_reports = _create_detailed_report_from_jobs(dag_id, job_info) 

1042 else: 

1043 ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()] 

1044 message = ( 

1045 f"More than one job matches id '{wms_workflow_id}', " 

1046 f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids" 

1047 ) 

1048 messages.append(message) 

1049 run_reports = {} 

1050 

1051 message = "\n".join(messages) 

1052 return run_reports, message 

1053 

1054 

1055def _get_info_from_schedd(wms_workflow_id, hist, schedds): 

1056 """Gather run information from HTCondor. 

1057 

1058 Parameters 

1059 ---------- 

1060 wms_workflow_id : `str` 

1061 Limit to specific run based on id. 

1062 hist : `float`

1063 Limit history search to this many days. 

1064 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional 

1065 HTCondor schedulers to query for job information. If None

1066 (default), all queries will be run against the local scheduler only. 

1067 

1068 Returns 

1069 ------- 

1070 schedd_dag_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

1071 Information about jobs satisfying the search criteria where for each 

1072 Scheduler, local HTCondor job ids are mapped to their respective 

1073 classads. 

1074 """ 

1075 dag_constraint = 'regexp("dagman$", Cmd)' 

1076 try: 

1077 cluster_id = int(float(wms_workflow_id)) 

1078 except ValueError: 

1079 dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"' 

1080 else: 

1081 dag_constraint += f" && ClusterId == {cluster_id}" 

1082 

1083 # With the current implementation of the condor_* functions the query 

1084 # will always return only one match per Scheduler. 

1085 # 

1086 # Even in the highly unlikely situation where HTCondor history (which 

1087 # condor_search queries too) is long enough to have jobs from before 

1088 # the cluster ids were rolled over (and as a result there is more than

1089 # one job with the same cluster id) they will not show up in 

1090 # the results. 

1091 schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds) 

1092 return schedd_dag_info 

1093 

1094 

1095def _get_info_from_path(wms_path): 

1096 """Gather run information from a given run directory. 

1097 

1098 Parameters 

1099 ---------- 

1100 wms_path : `str` 

1101 Directory containing HTCondor files. 

1102 

1103 Returns 

1104 ------- 

1105 wms_workflow_id : `str` 

1106 The run id, which is a DAGMan job id.

1107 jobs : `dict` [`str`, `dict` [`str`, `Any`]] 

1108 Information about jobs read from files in the given directory. 

1109 The key is the HTCondor id and the value is a dictionary of HTCondor 

1110 keys and values. 

1111 message : `str` 

1112 Message to be printed with the summary report. 

1113 """ 

1114 messages = [] 

1115 try: 

1116 wms_workflow_id, jobs = read_dag_log(wms_path) 

1117 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs) 

1118 _update_jobs(jobs, read_node_status(wms_path)) 

1119 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs) 

1120 

1121 # Add more info for DAGman job 

1122 job = jobs[wms_workflow_id] 

1123 job.update(read_dag_status(wms_path)) 

1124 

1125 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs) 

1126 if "bps_run" not in job: 

1127 _add_run_info(wms_path, job) 

1128 

1129 message = htc_check_dagman_output(wms_path) 

1130 if message: 

1131 messages.append(message) 

1132 _LOG.debug( 

1133 "_get_info: id = %s, total_jobs = %s", wms_workflow_id, jobs[wms_workflow_id]["total_jobs"] 

1134 ) 

1135 

1136 # Add extra pieces of information which cannot be found in HTCondor 

1137 # generated files like 'GlobalJobId'. 

1138 # 

1139 # Do not treat absence of this file as a serious error. Neither runs 

1140 # submitted with earlier versions of the plugin nor the runs submitted 

1141 # with the Pegasus plugin will have it at the moment. However, once enough

1142 # time passes and the Pegasus plugin has its own report() method

1143 # (instead of sneakily using HTCondor's), the lack of that file

1144 # should be treated as seriously as the lack of any other file.

1145 try: 

1146 job_info = read_dag_info(wms_path) 

1147 except FileNotFoundError as exc: 

1148 message = f"Warn: Some information may not be available: {exc}" 

1149 messages.append(message) 

1150 else: 

1151 schedd_name = next(iter(job_info)) 

1152 job_ad = next(iter(job_info[schedd_name].values())) 

1153 job.update(job_ad) 

1154 except FileNotFoundError: 

1155 message = f"Could not find HTCondor files in '{wms_path}'" 

1156 _LOG.warning(message) 

1157 messages.append(message) 

1158 wms_workflow_id = MISSING_ID 

1159 jobs = {} 

1160 

1161 message = "\n".join([msg for msg in messages if msg]) 

1162 return wms_workflow_id, jobs, message 

1163 

1164 

1165def _create_detailed_report_from_jobs(wms_workflow_id, jobs): 

1166 """Gather run information to be used in generating summary reports. 

1167 

1168 Parameters 

1169 ---------- 

1170 wms_workflow_id : `str` 

1171 The run id to create the report for. 

1172 jobs : `dict` [`str`, `dict` [`str`, Any]] 

1173 Mapping HTCondor job id to job information. 

1174 

1175 Returns 

1176 ------- 

1177 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1178 Run information for the detailed report. The key is the given HTCondor 

1179 id and the value is a collection of report information for that run. 

1180 """ 

1181 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id]) 

1182 dag_job = jobs[wms_workflow_id] 

1183 report = WmsRunReport( 

1184 wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}", 

1185 global_wms_id=dag_job.get("GlobalJobId", "MISS"), 

1186 path=dag_job["Iwd"], 

1187 label=dag_job.get("bps_job_label", "MISS"), 

1188 run=dag_job.get("bps_run", "MISS"), 

1189 project=dag_job.get("bps_project", "MISS"), 

1190 campaign=dag_job.get("bps_campaign", "MISS"), 

1191 payload=dag_job.get("bps_payload", "MISS"), 

1192 operator=_get_owner(dag_job), 

1193 run_summary=_get_run_summary(dag_job), 

1194 state=_htc_status_to_wms_state(dag_job), 

1195 jobs=[], 

1196 total_number_jobs=dag_job["total_jobs"], 

1197 job_state_counts=dag_job["state_counts"], 

1198 ) 

1199 

1200 for job_id, job_info in jobs.items(): 

1201 try: 

1202 if job_info["ClusterId"] != int(float(wms_workflow_id)): 

1203 job_report = WmsJobReport( 

1204 wms_id=job_id, 

1205 name=job_info.get("DAGNodeName", job_id), 

1206 label=job_info.get("bps_job_label", pegasus_name_to_label(job_info["DAGNodeName"])), 

1207 state=_htc_status_to_wms_state(job_info), 

1208 ) 

1209 if job_report.label == "init": 

1210 job_report.label = "pipetaskInit" 

1211 report.jobs.append(job_report) 

1212 except KeyError as ex: 

1213 _LOG.error("Job missing key '%s': %s", str(ex), job_info) 

1214 raise 

1215 

1216 run_reports = {report.wms_id: report} 

1217 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports) 

1218 return run_reports 

1219 

1220 

1221def _summary_report(user, hist, pass_thru, schedds=None): 

1222 """Gather run information to be used in generating summary reports. 

1223 

1224 Parameters 

1225 ---------- 

1226 user : `str` 

1227 Run lookup restricted to given user. 

1228 hist : `float` 

1229 How many previous days to search for run information. 

1230 pass_thru : `str` 

1231 Advanced users can define the HTCondor constraint to be used 

1232 when searching queue and history.
schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
HTCondor schedulers to query for job information. If None
(default), all queries will be run against the local scheduler only.

1233 

1234 Returns 

1235 ------- 

1236 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1237 Run information for the summary report. The keys are HTCondor ids and 

1238 the values are collections of report information for each run. 

1239 message : `str` 

1240 Message to be printed with the summary report. 

1241 """ 

1242 # Only doing summary report, so only look for DAGMan jobs.

1243 if pass_thru: 

1244 constraint = pass_thru 

1245 else: 

1246 # Notes: 

1247 # * bps_isjob == 'True' isn't getting set for DAG jobs that are 

1248 # manually restarted. 

1249 # * Any job with DAGManJobID isn't a DAG job 

1250 constraint = 'bps_isjob == "True" && JobUniverse == 7' 

1251 if user: 

1252 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")' 

1253 

1254 job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds) 

1255 

1256 # Have list of DAGMan jobs, need to get run_report info. 

1257 run_reports = {} 

1258 for jobs in job_info.values(): 

1259 for job_id, job in jobs.items(): 

1260 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1261 # If the counts were not available from queue information (e.g., Kerberos bug),

1262 # try reading from file. 

1263 if total_jobs == 0: 

1264 try: 

1265 job.update(read_dag_status(job["Iwd"])) 

1266 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1267 except StopIteration: 

1268 pass  # Don't kill the report if HTCondor files can't be found.

1269 

1270 if "bps_run" not in job: 

1271 _add_run_info(job["Iwd"], job) 

1272 report = WmsRunReport( 

1273 wms_id=job_id, 

1274 global_wms_id=job["GlobalJobId"], 

1275 path=job["Iwd"], 

1276 label=job.get("bps_job_label", "MISS"), 

1277 run=job.get("bps_run", "MISS"), 

1278 project=job.get("bps_project", "MISS"), 

1279 campaign=job.get("bps_campaign", "MISS"), 

1280 payload=job.get("bps_payload", "MISS"), 

1281 operator=_get_owner(job), 

1282 run_summary=_get_run_summary(job), 

1283 state=_htc_status_to_wms_state(job), 

1284 jobs=[], 

1285 total_number_jobs=total_jobs, 

1286 job_state_counts=state_counts, 

1287 ) 

1288 run_reports[report.global_wms_id] = report 

1289 

1290 return run_reports, "" 

1291 

1292 

1293def _add_run_info(wms_path, job): 

1294 """Find BPS run information elsewhere for runs without bps attributes. 

1295 

1296 Parameters 

1297 ---------- 

1298 wms_path : `str` 

1299 Path to submit files for the run. 

1300 job : `dict` [`str`, `Any`] 

1301 HTCondor dag job information. 

1302 

1303 Raises 

1304 ------ 

1305 StopIteration 

1306 If the file it is looking for cannot be found. Permission errors are

1307 caught and the job's run is marked with an error.

1308 """ 

1309 path = Path(wms_path) / "jobs" 

1310 try: 

1311 subfile = next(path.glob("**/*.sub")) 

1312 except (StopIteration, PermissionError): 

1313 job["bps_run"] = "Unavailable" 

1314 else: 

1315 _LOG.debug("_add_run_info: subfile = %s", subfile) 

1316 try: 

1317 with open(subfile, encoding="utf-8") as fh: 

1318 for line in fh: 

1319 if line.startswith("+bps_"): 

1320 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line) 

1321 if m: 

1322 _LOG.debug("Matching line: %s", line) 

1323 job[m.group(1)] = m.group(2).replace('"', "") 

1324 else: 

1325 _LOG.debug("Could not parse attribute: %s", line) 

1326 except PermissionError: 

1327 job["bps_run"] = "PermissionError" 

1328 _LOG.debug("After adding job = %s", job) 

1329 

1330 

1331def _get_owner(job): 

1332 """Get the owner of a dag job. 

1333 

1334 Parameters 

1335 ---------- 

1336 job : `dict` [`str`, `Any`] 

1337 HTCondor dag job information. 

1338 

1339 Returns 

1340 ------- 

1341 owner : `str` 

1342 Owner of the dag job. 

1343 """ 

1344 owner = job.get("bps_operator", None) 

1345 if not owner: 

1346 owner = job.get("Owner", None) 

1347 if not owner: 

1348 _LOG.warning("Could not get Owner from htcondor job: %s", job) 

1349 owner = "MISS" 

1350 return owner 

1351 

1352 

1353def _get_run_summary(job): 

1354 """Get the run summary for a job. 

1355 

1356 Parameters 

1357 ---------- 

1358 job : `dict` [`str`, `Any`] 

1359 HTCondor dag job information. 

1360 

1361 Returns 

1362 ------- 

1363 summary : `str` 

1364 Number of jobs per PipelineTask label in approximate pipeline order. 

1365 Format: <label>:<count>[;<label>:<count>]+ 

1366 """ 

1367 summary = job.get("bps_job_summary", job.get("bps_run_summary", None)) 

1368 if not summary: 

1369 summary, _ = summary_from_dag(job["Iwd"]) 

1370 if not summary: 

1371 _LOG.warning("Could not get run summary for htcondor job: %s", job) 

1372 _LOG.debug("_get_run_summary: summary=%s", summary) 

1373 

1374 # Work around summaries sometimes using init instead of pipetaskInit.

1375 summary = summary.replace("init:", "pipetaskInit:") 

1376 

1377 if "pegasus_version" in job and "pegasus" not in summary: 

1378 summary += ";pegasus:0" 

1379 

1380 return summary 
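# Illustrative example (not in the listed source): a run summary string follows the
# format documented above, e.g. "pipetaskInit:1;isr:30;characterizeImage:30;finalJob:1",
# giving the number of jobs per label in approximate pipeline order.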

1381 

1382 

1383def _get_state_counts_from_jobs(wms_workflow_id, jobs): 

1384 """Count number of jobs per WMS state. 

1385 

1386 Parameters 

1387 ---------- 

1388 wms_workflow_id : `str` 

1389 HTCondor job id. 

1390 jobs : `dict` [`str`, `Any`] 

1391 HTCondor dag job information. 

1392 

1393 Returns 

1394 ------- 

1395 total_count : `int` 

1396 Total number of dag nodes. 

1397 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1398 Keys are the different WMS states and values are counts of jobs 

1399 that are in that WMS state. 

1400 """ 

1401 state_counts = dict.fromkeys(WmsStates, 0) 

1402 

1403 for jid, jinfo in jobs.items(): 

1404 if jid != wms_workflow_id: 

1405 state_counts[_htc_status_to_wms_state(jinfo)] += 1 

1406 

1407 total_counted = sum(state_counts.values()) 

1408 if "NodesTotal" in jobs[wms_workflow_id]: 

1409 total_count = jobs[wms_workflow_id]["NodesTotal"] 

1410 else: 

1411 total_count = total_counted 

1412 

1413 state_counts[WmsStates.UNREADY] += total_count - total_counted 

1414 

1415 return total_count, state_counts 
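# Illustrative example (not in the listed source): if the DAG ad reports
# NodesTotal = 5 while only three node jobs are present in 'jobs' (say two SUCCEEDED
# and one RUNNING), the function returns total_count = 5 and adds the difference,
# 5 - 3 = 2, to the WmsStates.UNREADY count.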

1416 

1417 

1418def _get_state_counts_from_dag_job(job): 

1419 """Count number of jobs per WMS state. 

1420 

1421 Parameters 

1422 ---------- 

1423 job : `dict` [`str`, `Any`] 

1424 HTCondor dag job information. 

1425 

1426 Returns 

1427 ------- 

1428 total_count : `int` 

1429 Total number of dag nodes. 

1430 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1431 Keys are the different WMS states and values are counts of jobs 

1432 that are in that WMS state. 

1433 """ 

1434 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job)) 

1435 state_counts = dict.fromkeys(WmsStates, 0) 

1436 if "DAG_NodesReady" in job: 

1437 state_counts = { 

1438 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0), 

1439 WmsStates.READY: job.get("DAG_NodesReady", 0), 

1440 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1441 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0), 

1442 WmsStates.FAILED: job.get("DAG_NodesFailed", 0), 

1443 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0), 

1444 } 

1445 total_jobs = job.get("DAG_NodesTotal") 

1446 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs) 

1447 elif "NodesFailed" in job: 

1448 state_counts = { 

1449 WmsStates.UNREADY: job.get("NodesUnready", 0), 

1450 WmsStates.READY: job.get("NodesReady", 0), 

1451 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1452 WmsStates.SUCCEEDED: job.get("NodesDone", 0), 

1453 WmsStates.FAILED: job.get("NodesFailed", 0), 

1454 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0), 

1455 } 

1456 try: 

1457 total_jobs = job.get("NodesTotal") 

1458 except KeyError as ex: 

1459 _LOG.error("Job missing %s. job = %s", str(ex), job) 

1460 raise 

1461 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs) 

1462 else: 

1463 # With Kerberos job auth and the Kerberos bug, a warning would be printed

1464 # for every DAG. 

1465 _LOG.debug("Can't get job state counts %s", job["Iwd"]) 

1466 total_jobs = 0 

1467 

1468 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts) 

1469 return total_jobs, state_counts 

1470 

1471 

1472def _htc_status_to_wms_state(job): 

1473 """Convert HTCondor job status to generic wms state. 

1474 

1475 Parameters 

1476 ---------- 

1477 job : `dict` [`str`, `Any`] 

1478 HTCondor job information. 

1479 

1480 Returns 

1481 ------- 

1482 wms_state : `WmsStates` 

1483 The equivalent WmsState to given job's status. 

1484 """ 

1485 wms_state = WmsStates.MISFIT 

1486 if "JobStatus" in job: 

1487 wms_state = _htc_job_status_to_wms_state(job) 

1488 elif "NodeStatus" in job: 

1489 wms_state = _htc_node_status_to_wms_state(job) 

1490 return wms_state 

1491 

1492 

1493def _htc_job_status_to_wms_state(job): 

1494 """Convert HTCondor job status to generic wms state. 

1495 

1496 Parameters 

1497 ---------- 

1498 job : `dict` [`str`, `Any`] 

1499 HTCondor job information. 

1500 

1501 Returns 

1502 ------- 

1503 wms_state : `lsst.ctrl.bps.WmsStates` 

1504 The equivalent WmsState to given job's status. 

1505 """ 

1506 _LOG.debug( 

1507 "htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], type(job["JobStatus"]) 

1508 ) 

1509 job_status = int(job["JobStatus"]) 

1510 wms_state = WmsStates.MISFIT 

1511 

1512 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status) 

1513 if job_status == JobStatus.IDLE: 

1514 wms_state = WmsStates.PENDING 

1515 elif job_status == JobStatus.RUNNING: 

1516 wms_state = WmsStates.RUNNING 

1517 elif job_status == JobStatus.REMOVED: 

1518 wms_state = WmsStates.DELETED 

1519 elif job_status == JobStatus.COMPLETED: 

1520 if ( 

1521 job.get("ExitBySignal", False) 

1522 or job.get("ExitCode", 0) 

1523 or job.get("ExitSignal", 0) 

1524 or job.get("DAG_Status", 0) 

1525 or job.get("ReturnValue", 0) 

1526 ): 

1527 wms_state = WmsStates.FAILED 

1528 else: 

1529 wms_state = WmsStates.SUCCEEDED 

1530 elif job_status == JobStatus.HELD: 

1531 wms_state = WmsStates.HELD 

1532 

1533 return wms_state 
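# Illustrative mapping (not in the listed source): a COMPLETED job with
# ExitBySignal False and ExitCode, ExitSignal, DAG_Status, and ReturnValue all 0
# (or absent) maps to WmsStates.SUCCEEDED; a non-zero value in any of them maps
# to WmsStates.FAILED.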

1534 

1535 

1536def _htc_node_status_to_wms_state(job): 

1537 """Convert HTCondor status to generic wms state. 

1538 

1539 Parameters 

1540 ---------- 

1541 job : `dict` [`str`, `Any`] 

1542 HTCondor job information. 

1543 

1544 Returns 

1545 ------- 

1546 wms_state : `lsst.ctrl.bps.WmsStates` 

1547 The equivalent WmsState to given node's status. 

1548 """ 

1549 wms_state = WmsStates.MISFIT 

1550 

1551 status = job["NodeStatus"] 

1552 if status == NodeStatus.NOT_READY: 

1553 wms_state = WmsStates.UNREADY 

1554 elif status == NodeStatus.READY: 

1555 wms_state = WmsStates.READY 

1556 elif status == NodeStatus.PRERUN: 

1557 wms_state = WmsStates.MISFIT 

1558 elif status == NodeStatus.SUBMITTED: 

1559 if job["JobProcsHeld"]: 

1560 wms_state = WmsStates.HELD 

1561 elif job["StatusDetails"] == "not_idle": 

1562 wms_state = WmsStates.RUNNING 

1563 elif job["JobProcsQueued"]: 

1564 wms_state = WmsStates.PENDING 

1565 elif status == NodeStatus.POSTRUN: 

1566 wms_state = WmsStates.MISFIT 

1567 elif status == NodeStatus.DONE: 

1568 wms_state = WmsStates.SUCCEEDED 

1569 elif status == NodeStatus.ERROR: 

1570 # Use the job exit status instead of the post script exit status. 

1571 if "DAGMAN error 0" in job["StatusDetails"]: 

1572 wms_state = WmsStates.SUCCEEDED 

1573 else: 

1574 wms_state = WmsStates.FAILED 

1575 

1576 return wms_state 
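
# Illustrative sketch (hypothetical node ads, not part of the original
# module): a node ad with {"NodeStatus": NodeStatus.SUBMITTED,
# "JobProcsHeld": 0, "StatusDetails": "not_idle", "JobProcsQueued": 1} maps
# to WmsStates.RUNNING, while {"NodeStatus": NodeStatus.DONE} maps to
# WmsStates.SUCCEEDED.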

1577 

1578 

1579def _update_jobs(jobs1, jobs2): 

1580 """Update jobs1 with info in jobs2. 

1581 

1582 (Basically an update for nested dictionaries.) 

1583 

1584 Parameters 

1585 ---------- 

1586 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]] 

1587 HTCondor job information to be updated. 

1588 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]] 

1589 Additional HTCondor job information. 

1590 """ 

1591 for jid, jinfo in jobs2.items(): 

1592 if jid in jobs1: 

1593 jobs1[jid].update(jinfo) 

1594 else: 

1595 jobs1[jid] = jinfo 
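
# Illustrative sketch (hypothetical ids, not part of the original module):
#
#     jobs1 = {"1.0": {"JobStatus": 1}}
#     jobs2 = {"1.0": {"JobStatus": 2}, "2.0": {"JobStatus": 5}}
#     _update_jobs(jobs1, jobs2)
#     # jobs1 is now {"1.0": {"JobStatus": 2}, "2.0": {"JobStatus": 5}}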

1596 

1597 

1598def _wms_id_type(wms_id): 

1599 """Determine the type of the WMS id. 

1600 

1601 Parameters 

1602 ---------- 

1603 wms_id : `str` 

1604 WMS id identifying a job. 

1605 

1606 Returns 

1607 ------- 

1608 id_type : `lsst.ctrl.bps.htcondor.WmsIdType` 

1609 Type of WMS id. 

1610 """ 

1611 try: 

1612 int(float(wms_id)) 

1613 except ValueError: 

1614 wms_path = Path(wms_id) 

1615 if wms_path.exists(): 

1616 id_type = WmsIdType.PATH 

1617 else: 

1618 id_type = WmsIdType.GLOBAL 

1619 except TypeError: 

1620 id_type = WmsIdType.UNKNOWN 

1621 else: 

1622 id_type = WmsIdType.LOCAL 

1623 return id_type 
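
# Illustrative sketch (hypothetical ids, not part of the original module):
# "1234" and "1234.0" are treated as LOCAL ids; an existing path such as
# "/scratch/submit/run1" is PATH; a non-numeric string that is not an
# existing path (e.g. "sched01#1234.0#1699999999") is GLOBAL; and a value
# that float() cannot convert at all (e.g. None) is UNKNOWN.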

1624 

1625 

1626def _wms_id_to_cluster(wms_id): 

1627 """Convert WMS id to cluster id. 

1628 

1629 Parameters 

1630 ---------- 

1631 wms_id : `int` or `float` or `str` 

1632 HTCondor job id or path. 

1633 

1634 Returns 

1635 ------- 

1636 schedd_ad : `classad.ClassAd` 

1637 ClassAd describing the scheduler managing the job with the given id. 

1638 cluster_id : `int` 

1639 HTCondor cluster id. 

1640 id_type : `lsst.ctrl.bps.htcondor.WmsIdType` 

1641 The type of the provided id. 

1642 """ 

1643 coll = htcondor.Collector() 

1644 

1645 schedd_ad = None 

1646 cluster_id = None 

1647 id_type = _wms_id_type(wms_id) 

1648 if id_type == WmsIdType.LOCAL: 

1649 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1650 cluster_id = int(float(wms_id)) 

1651 elif id_type == WmsIdType.GLOBAL: 

1652 constraint = f'GlobalJobId == "{wms_id}"' 

1653 schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)} 

1654 schedds = {name: htcondor.Schedd(ad) for name, ad in schedd_ads.items()} 

1655 job_info = condor_q(constraint=constraint, schedds=schedds) 

1656 if job_info: 

1657 schedd_name, job_rec = job_info.popitem() 

1658 job_id, _ = job_rec.popitem() 

1659 schedd_ad = schedd_ads[schedd_name] 

1660 cluster_id = int(float(job_id)) 

1661 elif id_type == WmsIdType.PATH: 

1662 try: 

1663 job_info = read_dag_info(wms_id) 

1664 except (FileNotFoundError, PermissionError, OSError): 

1665 pass 

1666 else: 

1667 schedd_name, job_rec = job_info.popitem() 

1668 job_id, _ = job_rec.popitem() 

1669 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name) 

1670 cluster_id = int(float(job_id)) 

1671 else: 

1672 pass 

1673 return schedd_ad, cluster_id, id_type 
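
# Illustrative usage (a sketch; requires a running HTCondor pool and a
# hypothetical job id):
#
#     schedd_ad, cluster_id, id_type = _wms_id_to_cluster("1234.0")
#     # schedd_ad describes the local Schedd, cluster_id == 1234,
#     # and id_type == WmsIdType.LOCAL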

1674 

1675 

1676def _create_periodic_release_expr(memory, multiplier, limit): 

1677 """Construct an HTCondorAd expression for releasing held jobs. 

1678 

1679 The expression instruct HTCondor to release any job which was put on hold 

1680 due to exceeding memory requirements back to the job queue providing it 

1681 satisfies all of the conditions below: 

1682 

1683 * the number of run attempts did not reach the allowable number of retries, 

1684 * the memory requirements in the last failed run attempt did not reach 

1685 the specified memory limit. 

1686 

1687 Parameters 

1688 ---------- 

1689 memory : `int` 

1690 Requested memory in MB. 

1691 multiplier : `float` 

1692 Memory growth rate between retries. 

1693 limit : `int` 

1694 Memory limit. 

1695 

1696 Returns 

1697 ------- 

1698 expr : `str` 

1699 A string representing an HTCondor ClassAd expression for releasing jobs 

1700 which have been held due to exceeding the memory requirements. 

1701 """ 

1702 is_retry_allowed = "NumJobStarts <= JobMaxRetries" 

1703 was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}" 

1704 

1705 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1706 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1707 # The special comparison operators ensure that all comparisons below will 

1708 # evaluate to FALSE in this case. 

1709 # 

1710 # Note: 

1711 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1712 # the entire expression should evaluate to FALSE when the job is not HELD. 

1713 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1714 # but better safe than sorry. 

1715 was_mem_exceeded = ( 

1716 "JobStatus == 5 " 

1717 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " 

1718 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1719 ) 

1720 

1721 expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}" 

1722 return expr 
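
# Illustrative sketch (hypothetical values, not part of the original module):
# _create_periodic_release_expr(2048, 2.0, 4096) produces an expression
# equivalent to
#
#     JobStatus == 5
#     && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#         || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#     && NumJobStarts <= JobMaxRetries
#     && min({int(2048 * pow(2.0, NumJobStarts - 1)), 4096}) < 4096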

1723 

1724 

1725def _create_periodic_remove_expr(memory, multiplier, limit): 

1726 """Construct an HTCondorAd expression for removing jobs from the queue. 

1727 

1728 The expression instruct HTCondor to remove any job which was put on hold 

1729 due to exceeding memory requirements from the job queue providing it 

1730 satisfies any of the conditions below: 

1731 

1732 * the allowable number of retries was reached, 

1733 * the memory requirements during the last failed run attempt reached 

1734 the specified memory limit. 

1735 

1736 Parameters 

1737 ---------- 

1738 memory : `int` 

1739 Requested memory in MB. 

1740 multiplier : `float` 

1741 Memory growth rate between retries. 

1742 limit : `int` 

1743 Memory limit. 

1744 

1745 Returns 

1746 ------- 

1747 expr : `str` 

1748 A string representing an HTCondor ClassAd expression for removing jobs 

1749 which were run at the maximal allowable memory and still exceeded 

1750 the memory requirements. 

1751 """ 

1752 is_retry_disallowed = "NumJobStarts > JobMaxRetries" 

1753 was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}" 

1754 

1755 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1756 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1757 # The special comparison operators ensure that all comparisons below will 

1758 # evaluate to FALSE in this case. 

1759 # 

1760 # Note: 

1761 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1762 # the entire expression should evaluate to FALSE when the job is not HELD. 

1763 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1764 # but better safe than sorry. 

1765 was_mem_exceeded = ( 

1766 "JobStatus == 5 " 

1767 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " 

1768 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1769 ) 

1770 

1771 expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})" 

1772 return expr 
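
# Illustrative sketch (hypothetical values, not part of the original module):
# _create_periodic_remove_expr(2048, 2.0, 4096) produces an expression
# equivalent to
#
#     JobStatus == 5
#     && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#         || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#     && (NumJobStarts > JobMaxRetries
#         || min({int(2048 * pow(2.0, NumJobStarts - 1)), 4096}) == 4096)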

1773 

1774 

1775def _create_request_memory_expr(memory, multiplier, limit): 

1776 """Construct an HTCondor ClassAd expression for safe memory scaling. 

1777 

1778 Parameters 

1779 ---------- 

1780 memory : `int` 

1781 Requested memory in MB. 

1782 multiplier : `float` 

1783 Memory growth rate between retries. 

1784 limit : `int` 

1785 Memory limit. 

1786 

1787 Returns 

1788 ------- 

1789 expr : `str` 

1790 A string representing an HTCondor ClassAd expression enabling safe 

1791 memory scaling between job retries. 

1792 """ 

1793 # The check whether the job was held due to exceeding memory requirements 

1794 # is made *after* the job has been released back to the job queue (i.e., 

1795 # it is in the IDLE state), hence the need to use `Last*` job ClassAds 

1796 # instead of the ones describing the job's current state. 

1797 # 

1798 # Also, 'Last*' job ClassAds attributes are UNDEFINED when a job is 

1799 # initially put in the job queue. The special comparison operators ensure 

1800 # that all comparisons below will evaluate to FALSE in this case. 

1801 was_mem_exceeded = ( 

1802 "LastJobStatus =?= 5 " 

1803 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " 

1804 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)" 

1805 ) 

1806 

1807 # If the job is running for the first time or was held for reasons other 

1808 # than exceeding the memory, set the required memory to the greater of 

1809 # the requested value and the memory usage measured by HTCondor 

1810 # (MemoryUsage). 

1811 expr = ( 

1812 f"({was_mem_exceeded}) " 

1813 f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) " 

1814 f": max({{{memory}, MemoryUsage ?: 0}})" 

1815 ) 

1816 return expr 
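
# Illustrative sketch (hypothetical values, not part of the original module):
# _create_request_memory_expr(2048, 2.0, 4096) produces an expression
# equivalent to
#
#     (LastJobStatus =?= 5
#      && (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0
#          || LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34))
#     ? min({int(2048 * pow(2.0, NumJobStarts)), 4096})
#     : max({2048, MemoryUsage ?: 0})
#
# i.e. the request grows by the multiplier after each memory-related hold
# (capped at the limit); otherwise the larger of the configured memory and
# the measured MemoryUsage is requested.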

1817 

1818 

1819def _locate_schedds(locate_all=False): 

1820 """Find out Scheduler daemons in an HTCondor pool. 

1821 

1822 Parameters 

1823 ---------- 

1824 locate_all : `bool`, optional 

1825 If True, all available Schedulers in the HTCondor pool will be located. 

1826 False by default, which means that the search will be limited to 

1827 the Scheduler running on the local host. 

1828 

1829 Returns 

1830 ------- 

1831 schedds : `dict` [`str`, `htcondor.Schedd`] 

1832 A mapping between Scheduler names and Python objects allowing for 

1833 interacting with them. 

1834 """ 

1835 coll = htcondor.Collector() 

1836 

1837 schedd_ads = [] 

1838 if locate_all: 

1839 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

1840 else: 

1841 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

1842 return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

1843 

1844 

1845def _gather_site_values(config, compute_site): 

1846 """Gather values specific to given site. 

1847 

1848 Parameters 

1849 ---------- 

1850 config : `lsst.ctrl.bps.BpsConfig` 

1851 BPS configuration that includes necessary submit/runtime 

1852 information. 

1853 compute_site : `str` 

1854 Compute site name. 

1855 

1856 Returns 

1857 ------- 

1858 site_values : `dict` [`str`, `Any`] 

1859 Values specific to the given site. 

1860 """ 

1861 site_values = {"attrs": {}, "profile": {}} 

1862 search_opts = {} 

1863 if compute_site: 

1864 search_opts["curvals"] = {"curr_site": compute_site} 

1865 

1866 # Determine the hard limit for the memory requirement. 

1867 found, limit = config.search("memoryLimit", opt=search_opts) 

1868 if not found: 

1869 search_opts["default"] = DEFAULT_HTC_EXEC_PATT 

1870 _, patt = config.search("executeMachinesPattern", opt=search_opts) 

1871 del search_opts["default"] 

1872 

1873 # To reduce the amount of data, ignore dynamic slots (if any) as, 

1874 # by definition, they cannot have more memory than 

1875 # the partitionable slot they are part of. 

1876 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)' 

1877 pool_info = condor_status(constraint=constraint) 

1878 try: 

1879 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values()) 

1880 except ValueError: 

1881 _LOG.debug("No execute machine in the pool matches %s", patt) 

1882 if limit: 

1883 config[".bps_defined.memory_limit"] = limit 

1884 

1885 _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False}) 

1886 site_values["memoryLimit"] = limit 

1887 

1888 found, value = config.search("accountingGroup", opt=search_opts) 

1889 if found: 

1890 site_values["accountingGroup"] = value 

1891 found, value = config.search("accountingUser", opt=search_opts) 

1892 if found: 

1893 site_values["accountingUser"] = value 

1894 

1895 key = f".site.{compute_site}.profile.condor" 

1896 if key in config: 

1897 for key, val in config[key].items(): 

1898 if key.startswith("+"): 

1899 site_values["attrs"][key[1:]] = val 

1900 else: 

1901 site_values["profile"][key] = val 

1902 

1903 return site_values
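
# Illustrative sketch (hypothetical configuration, not part of the original
# module): for a compute site whose ".site.<name>.profile.condor" section
# sets "+JobCategory" and "request_cpus", the returned dictionary would look
# roughly like
#
#     {"attrs": {"JobCategory": "prod"},
#      "profile": {"request_cpus": 4},
#      "bpsUseShared": False,
#      "memoryLimit": 16384,
#      "accountingGroup": "group_lsst"}   # only present if found in config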