Coverage for python/lsst/ctrl/bps/htcondor/htcondor_service.py: 1%

693 statements  

coverage.py v6.5.0, created at 2022-10-22 02:05 -0700

1# This file is part of ctrl_bps_htcondor. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Interface between generic workflow to HTCondor workflow system. 

23""" 

24 

25__all__ = ["HTCondorService", "HTCondorWorkflow"] 

26 

27 

28import logging 

29import os 

30import re 

31from collections import defaultdict 

32from enum import IntEnum, auto 

33from pathlib import Path 

34 

35import htcondor 

36from lsst.ctrl.bps import ( 

37 BaseWmsService, 

38 BaseWmsWorkflow, 

39 GenericWorkflow, 

40 GenericWorkflowJob, 

41 WmsJobReport, 

42 WmsRunReport, 

43 WmsStates, 

44) 

45from lsst.ctrl.bps.bps_utils import chdir, create_count_summary 

46from lsst.utils.timer import time_this 

47from packaging import version 

48 

49from .lssthtc import ( 

50 MISSING_ID, 

51 HTCDag, 

52 HTCJob, 

53 JobStatus, 

54 NodeStatus, 

55 condor_q, 

56 condor_search, 

57 condor_status, 

58 htc_backup_files, 

59 htc_check_dagman_output, 

60 htc_create_submit_from_cmd, 

61 htc_create_submit_from_dag, 

62 htc_create_submit_from_file, 

63 htc_escape, 

64 htc_submit_dag, 

65 htc_version, 

66 pegasus_name_to_label, 

67 read_dag_info, 

68 read_dag_log, 

69 read_dag_status, 

70 read_node_status, 

71 summary_from_dag, 

72 write_dag_info, 

73) 

74 

75 

76class WmsIdType(IntEnum): 

77 """Type of valid WMS ids.""" 

78 

79 UNKNOWN = auto() 

80 """The type of id cannot be determined. 

81 """ 

82 

83 LOCAL = auto() 

84 """The id is HTCondor job's ClusterId (with optional '.ProcId'). 

85 """ 

86 

87 GLOBAL = auto() 

88 """Id is a HTCondor's global job id. 

89 """ 

90 

91 PATH = auto() 

92 """Id is a submission path. 

93 """ 

94 

95 

96DEFAULT_HTC_EXEC_PATT = ".*worker.*" 

97"""Default pattern for searching execute machines in an HTCondor pool. 

98""" 

99 

100_LOG = logging.getLogger(__name__) 

101 

102 

103class HTCondorService(BaseWmsService): 

104 """HTCondor version of WMS service.""" 

105 

106 def prepare(self, config, generic_workflow, out_prefix=None): 

107 """Convert generic workflow to an HTCondor DAG ready for submission. 

108 

109 Parameters 

110 ---------- 

111 config : `lsst.ctrl.bps.BpsConfig` 

112 BPS configuration that includes necessary submit/runtime 

113 information. 

114 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

115 The generic workflow (e.g., has executable name and arguments). 

116 out_prefix : `str` 

117 The root directory into which all WMS-specific files are written. 

118 

119 Returns 

120 ------- 

121 workflow : `lsst.ctrl.bps.htcondor.HTCondorWorkflow` 

122 HTCondor workflow ready to be run. 

123 """ 

124 _LOG.debug("out_prefix = '%s'", out_prefix) 

125 with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed HTCondor workflow creation"): 

126 workflow = HTCondorWorkflow.from_generic_workflow( 

127 config, 

128 generic_workflow, 

129 out_prefix, 

130 f"{self.__class__.__module__}." f"{self.__class__.__name__}", 

131 ) 

132 

133 with time_this( 

134 log=_LOG, level=logging.INFO, prefix=None, msg="Completed writing out HTCondor workflow" 

135 ): 

136 workflow.write(out_prefix) 

137 return workflow 

138 

139 def submit(self, workflow): 

140 """Submit a single HTCondor workflow. 

141 

142 Parameters 

143 ---------- 

144 workflow : `lsst.ctrl.bps.BaseWmsWorkflow` 

145 A single HTCondor workflow to submit. run_id is updated after 

146 successful submission to WMS. 

147 """ 

148 dag = workflow.dag 

149 

150 ver = version.parse(htc_version()) 

151 if ver >= version.parse("8.9.3"): 

152 sub = htc_create_submit_from_dag(dag.graph["dag_filename"], {}) 

153 else: 

154 sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], {}) 

155 

156 # For workflow portability, internal paths are all relative. Hence 

157 # the DAG needs to be submitted to HTCondor from inside the submit 

158 # directory. 

159 with chdir(workflow.submit_path): 

160 _LOG.info("Submitting from directory: %s", os.getcwd()) 

161 schedd_dag_info = htc_submit_dag(sub) 

162 if schedd_dag_info: 

163 write_dag_info(f"{dag.name}.info.json", schedd_dag_info) 

164 

165 _, dag_info = schedd_dag_info.popitem() 

166 _, dag_ad = dag_info.popitem() 

167 

168 dag.run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}" 

169 workflow.run_id = dag.run_id 

170 else: 

171 raise RuntimeError("Submission failed: unable to retrieve DAGMan job information") 

172 

173 def restart(self, wms_workflow_id): 

174 """Restart a failed DAGMan workflow. 

175 

176 Parameters 

177 ---------- 

178 wms_workflow_id : `str` 

179 The directory with HTCondor files. 

180 

181 Returns 

182 ------- 

183 run_id : `str` 

184 HTCondor id of the restarted DAGMan job. If restart failed, it will 

185 be set to None. 

186 run_name : `str` 

187 Name of the restarted workflow. If restart failed, it will be set 

188 to None. 

189 message : `str` 

190 A message describing any issues encountered during the restart. 

191 If there were no issues, an empty string is returned. 

192 """ 

193 wms_path = Path(wms_workflow_id) 

194 if not wms_path.is_dir(): 

195 return None, None, f"Directory '{wms_path}' not found" 

196 

197 _LOG.info("Restarting workflow from directory '%s'", wms_path) 

198 rescue_dags = list(wms_path.glob("*.dag.rescue*")) 

199 if not rescue_dags: 

200 return None, None, f"HTCondor rescue DAG(s) not found in '{wms_path}'" 

201 

202 _LOG.info("Verifying that the workflow is not already in the job queue") 

203 schedd_dag_info = condor_q(constraint=f'regexp("dagman$", Cmd) && Iwd == "{wms_workflow_id}"') 

204 if schedd_dag_info: 

205 _, dag_info = schedd_dag_info.popitem() 

206 _, dag_ad = dag_info.popitem() 

207 id_ = dag_ad["GlobalJobId"] 

208 return None, None, f"Workflow already in the job queue (global job id: '{id_}')" 

209 

210 _LOG.info("Checking execution status of the workflow") 

211 warn = False 

212 dag_ad = read_dag_status(str(wms_path)) 

213 if dag_ad: 

214 nodes_total = dag_ad.get("NodesTotal", 0) 

215 if nodes_total != 0: 

216 nodes_done = dag_ad.get("NodesDone", 0) 

217 if nodes_total == nodes_done: 

218 return None, None, "All jobs in the workflow finished successfully" 

219 else: 

220 warn = True 

221 else: 

222 warn = True 

223 if warn: 

224 _LOG.warning( 

225 "Cannot determine the execution status of the workflow, continuing with restart regardless" 

226 ) 

227 

228 _LOG.info("Backing up select HTCondor files from previous run attempt") 

229 htc_backup_files(wms_path, subdir="backups") 

230 

231 # For workflow portability, internal paths are all relative. Hence 

232 # the DAG needs to be resubmitted to HTCondor from inside the submit 

233 # directory. 

234 _LOG.info("Adding workflow to the job queue") 

235 run_id, run_name, message = None, None, "" 

236 with chdir(wms_path): 

237 try: 

238 dag_path = next(wms_path.glob("*.dag.condor.sub")) 

239 except StopIteration: 

240 message = f"DAGMan submit description file not found in '{wms_path}'" 

241 else: 

242 sub = htc_create_submit_from_file(dag_path.name) 

243 schedd_dag_info = htc_submit_dag(sub) 

244 

245 # Save select information about the DAGMan job to a file. Use 

246 # the run name (available in the ClassAd) as the filename. 

247 if schedd_dag_info: 

248 dag_info = next(iter(schedd_dag_info.values())) 

249 dag_ad = next(iter(dag_info.values())) 

250 write_dag_info(f"{dag_ad['bps_run']}.info.json", schedd_dag_info) 

251 run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}" 

252 run_name = dag_ad["bps_run"] 

253 else: 

254 message = "DAGMan job information unavailable" 

255 

256 return run_id, run_name, message 

257 

258 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False): 

259 """Query WMS for list of submitted WMS workflows/jobs. 

260 

261 This should be a quick lookup function to create list of jobs for 

262 other functions. 

263 

264 Parameters 

265 ---------- 

266 wms_id : `int` or `str`, optional 

267 Id or path that can be used by WMS service to look up job. 

268 user : `str`, optional 

269 User whose submitted jobs should be listed. 

270 require_bps : `bool`, optional 

271 Whether to require jobs returned in list to be bps-submitted jobs. 

272 pass_thru : `str`, optional 

273 Information to pass through to WMS. 

274 is_global : `bool`, optional 

275 If set, all job queues (and their histories) will be queried for 

276 job information. Defaults to False which means that only the local 

277 job queue will be queried. 

278 

279 Returns 

280 ------- 

281 job_ids : `list` [`Any`] 

282 Only job ids to be used by cancel and other functions. Typically 

283 this means top-level jobs (i.e., not child jobs). 

284 """ 

285 _LOG.debug( 

286 "list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s", 

287 wms_id, 

288 user, 

289 require_bps, 

290 pass_thru, 

291 is_global, 

292 ) 

293 

294 # Determine which Schedds will be queried for job information. 

295 coll = htcondor.Collector() 

296 

297 schedd_ads = [] 

298 if is_global: 

299 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

300 else: 

301 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

302 

303 # Construct appropriate constraint expression using provided arguments. 

304 constraint = "False" 

305 if wms_id is None: 

306 if user is not None: 

307 constraint = f'(Owner == "{user}")' 

308 else: 

309 schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id) 

310 if cluster_id is not None: 

311 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})" 

312 

313 # If provided id is either a submission path or a global id, 

314 # make sure the right Schedd will be queried regardless of 

315 # 'is_global' value. 

316 if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}: 

317 schedd_ads = [schedd_ad] 

318 if require_bps: 

319 constraint += ' && (bps_isjob == "True")' 

320 if pass_thru: 

321 if "-forcex" in pass_thru: 

322 pass_thru_2 = pass_thru.replace("-forcex", "") 

323 if pass_thru_2 and not pass_thru_2.isspace(): 

324 constraint += f" && ({pass_thru_2})" 

325 else: 

326 constraint += f" && ({pass_thru})" 

327 

328 # Create a list of scheduler daemons which need to be queried. 

329 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

330 

331 _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds)) 

332 results = condor_q(constraint=constraint, schedds=schedds) 

333 

334 # Prune child jobs where DAG job is in queue (i.e., aren't orphans). 

335 job_ids = [] 

336 for schedd_name, job_info in results.items(): 

337 for job_id, job_ad in job_info.items(): 

338 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None")) 

339 if "DAGManJobId" not in job_ad: 

340 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

341 else: 

342 _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0") 

343 _LOG.debug("\tin jobs.keys() = %s", job_info.keys()) 

344 if f"{job_ad['DAGManJobId']}.0" not in job_info: # orphaned job 

345 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

346 

347 _LOG.debug("job_ids = %s", job_ids) 

348 return job_ids 

349 

350 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False): 

351 """Return run information based upon given constraints. 

352 

353 Parameters 

354 ---------- 

355 wms_workflow_id : `str`, optional 

356 Limit to specific run based on id. 

357 user : `str`, optional 

358 Limit results to runs for this user. 

359 hist : `float`, optional 

360 Limit history search to this many days. Defaults to 0. 

361 pass_thru : `str`, optional 

362 Constraints to pass through to HTCondor. 

363 is_global : `bool`, optional 

364 If set, all job queues (and their histories) will be queried for 

365 job information. Defaults to False which means that only the local 

366 job queue will be queried. 

367 

368 Returns 

369 ------- 

370 runs : `list` [`lsst.ctrl.bps.WmsRunReport`] 

371 Information about runs from given job information. 

372 message : `str` 

373 Extra message for report command to print. This could be pointers 

374 to documentation or to WMS specific commands. 

375 """ 

376 if wms_workflow_id: 

377 id_type = _wms_id_type(wms_workflow_id) 

378 if id_type == WmsIdType.LOCAL: 

379 schedulers = _locate_schedds(locate_all=is_global) 

380 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

381 elif id_type == WmsIdType.GLOBAL: 

382 schedulers = _locate_schedds(locate_all=True) 

383 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

384 elif id_type == WmsIdType.PATH: 

385 run_reports, message = _report_from_path(wms_workflow_id) 

386 else: 

387 run_reports, message = {}, "Invalid job id" 

388 else: 

389 schedulers = _locate_schedds(locate_all=is_global) 

390 run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers) 

391 _LOG.debug("report: %s, %s", run_reports, message) 

392 

393 return list(run_reports.values()), message 

394 

395 def cancel(self, wms_id, pass_thru=None): 

396 """Cancel submitted workflows/jobs. 

397 

398 Parameters 

399 ---------- 

400 wms_id : `str` 

401 Id or path of job that should be canceled. 

402 pass_thru : `str`, optional 

403 Information to pass through to WMS. 

404 

405 Returns 

406 ------- 

407 deleted : `bool` 

408 Whether the deletion was successful. Currently, if there is any 

409 doubt or any individual job was not deleted, False is returned. 

410 message : `str` 

411 Any message from WMS (e.g., error details). 

412 """ 

413 _LOG.debug("Canceling wms_id = %s", wms_id) 

414 

415 schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id) 

416 

417 if cluster_id is None: 

418 deleted = False 

419 message = "invalid id" 

420 else: 

421 _LOG.debug( 

422 "Canceling job managed by schedd_name = %s with cluster_id = %s", 

423 schedd_ad["Name"], 

424 cluster_id, 

425 ) 

426 schedd = htcondor.Schedd(schedd_ad) 

427 

428 constraint = f"ClusterId == {cluster_id}" 

429 if pass_thru is not None and "-forcex" in pass_thru: 

430 pass_thru_2 = pass_thru.replace("-forcex", "") 

431 if pass_thru_2 and not pass_thru_2.isspace(): 

432 constraint += f"&& ({pass_thru_2})" 

433 _LOG.debug("JobAction.RemoveX constraint = %s", constraint) 

434 results = schedd.act(htcondor.JobAction.RemoveX, constraint) 

435 else: 

436 if pass_thru: 

437 constraint += f"&& ({pass_thru})" 

438 _LOG.debug("JobAction.Remove constraint = %s", constraint) 

439 results = schedd.act(htcondor.JobAction.Remove, constraint) 

440 _LOG.debug("Remove results: %s", results) 

441 

442 if results["TotalSuccess"] > 0 and results["TotalError"] == 0: 

443 deleted = True 

444 message = "" 

445 else: 

446 deleted = False 

447 if results["TotalSuccess"] == 0 and results["TotalError"] == 0: 

448 message = "no such bps job in batch queue" 

449 else: 

450 message = f"unknown problems deleting: {results}" 

451 

452 _LOG.debug("deleted: %s; message = %s", deleted, message) 

453 return deleted, message 

454 

455 
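# Illustrative usage sketch (not part of the original module): the typical
# prepare/submit sequence a caller might run against this service. It assumes
# an already-built `lsst.ctrl.bps.BpsConfig` and `lsst.ctrl.bps.GenericWorkflow`
# are passed in; the names below are placeholders.
def _example_prepare_and_submit(config, generic_workflow, out_prefix):
    service = HTCondorService(config)
    workflow = service.prepare(config, generic_workflow, out_prefix=out_prefix)
    service.submit(workflow)
    # After a successful submit, run_id holds the DAGMan job's 'ClusterId.ProcId'.
    return workflow.run_id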

456class HTCondorWorkflow(BaseWmsWorkflow): 

457 """Single HTCondor workflow. 

458 

459 Parameters 

460 ---------- 

461 name : `str` 

462 Unique name for Workflow used when naming files. 

463 config : `lsst.ctrl.bps.BpsConfig` 

464 BPS configuration that includes necessary submit/runtime information. 

465 """ 

466 

467 def __init__(self, name, config=None): 

468 super().__init__(name, config) 

469 self.dag = None 

470 

471 @classmethod 

472 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class): 

473 # Docstring inherited 

474 htc_workflow = cls(generic_workflow.name, config) 

475 htc_workflow.dag = HTCDag(name=generic_workflow.name) 

476 

477 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs) 

478 htc_workflow.dag.add_attribs(generic_workflow.run_attrs) 

479 htc_workflow.dag.add_attribs( 

480 { 

481 "bps_wms_service": service_class, 

482 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}", 

483 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts), 

484 "bps_job_summary": create_count_summary(generic_workflow.job_counts), 

485 } 

486 ) 

487 

488 _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""}) 

489 if isinstance(tmp_template, str): 

490 subdir_template = defaultdict(lambda: tmp_template) 

491 else: 

492 subdir_template = tmp_template 

493 

494 # Create all DAG jobs 

495 site_values = {} # cache compute site specific values to reduce config lookups 

496 for job_name in generic_workflow: 

497 gwjob = generic_workflow.get_job(job_name) 

498 if gwjob.compute_site not in site_values: 

499 site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site) 

500 htc_job = _create_job( 

501 subdir_template[gwjob.label], 

502 site_values[gwjob.compute_site], 

503 generic_workflow, 

504 gwjob, 

505 out_prefix, 

506 ) 

507 htc_workflow.dag.add_job(htc_job) 

508 

509 # Add job dependencies to the DAG 

510 for job_name in generic_workflow: 

511 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name)) 

512 

513 # If final job exists in generic workflow, create DAG final job 

514 final = generic_workflow.get_final() 

515 if final and isinstance(final, GenericWorkflowJob): 

516 if final.compute_site and final.compute_site not in site_values: 

517 site_values[final.compute_site] = _gather_site_values(config, final.compute_site) 

518 final_htjob = _create_job( 

519 subdir_template[final.label], 

520 site_values[final.compute_site], 

521 generic_workflow, 

522 final, 

523 out_prefix, 

524 ) 

525 if "post" not in final_htjob.dagcmds: 

526 final_htjob.dagcmds["post"] = ( 

527 f"{os.path.dirname(__file__)}/final_post.sh" f" {final.name} $DAG_STATUS $RETURN" 

528 ) 

529 htc_workflow.dag.add_final_job(final_htjob) 

530 elif final and isinstance(final, GenericWorkflow): 

531 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job") 

532 elif final: 

533 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})") 

534 

535 return htc_workflow 

536 

537 def write(self, out_prefix): 

538 """Output HTCondor DAGMan files needed for workflow submission. 

539 

540 Parameters 

541 ---------- 

542 out_prefix : `str` 

543 Directory prefix for HTCondor files. 

544 """ 

545 self.submit_path = out_prefix 

546 os.makedirs(out_prefix, exist_ok=True) 

547 

548 # Write down the workflow in HTCondor format. 

549 self.dag.write(out_prefix, "jobs/{self.label}") 

550 

551 

552def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix): 

553 """Convert GenericWorkflow job nodes to DAG jobs. 

554 

555 Parameters 

556 ---------- 

557 subdir_template : `str` 

558 Template for making subdirs. 

559 site_values : `dict` 

560 Site-specific values. 

561 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

562 Generic workflow that is being converted. 

563 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

564 The generic job to convert to a HTCondor job. 

565 out_prefix : `str` 

566 Directory prefix for HTCondor files. 

567 

568 Returns 

569 ------- 

570 htc_job : `lsst.ctrl.bps.htcondor.HTCJob` 

571 The HTCondor job equivalent to the given generic job. 

572 """ 

573 htc_job = HTCJob(gwjob.name, label=gwjob.label) 

574 

575 curvals = defaultdict(str) 

576 curvals["label"] = gwjob.label 

577 if gwjob.tags: 

578 curvals.update(gwjob.tags) 

579 

580 subdir = subdir_template.format_map(curvals) 

581 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub" 

582 

583 htc_job_cmds = { 

584 "universe": "vanilla", 

585 "should_transfer_files": "YES", 

586 "when_to_transfer_output": "ON_EXIT_OR_EVICT", 

587 "transfer_output_files": '""', # Set to empty string to disable 

588 "transfer_executable": "False", 

589 "getenv": "True", 

590 # Exceeding memory sometimes triggers a SIGBUS or SIGSEGV error. Tell 

591 # HTCondor to put on hold any jobs which exited by a signal. 

592 "on_exit_hold": "ExitBySignal == true", 

593 "on_exit_hold_reason": 'strcat("Job raised a signal ", string(ExitSignal), ". ", ' 

594 '"Handling signal as if job has gone over memory limit.")', 

595 "on_exit_hold_subcode": "34", 

596 } 

597 

598 htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob)) 

599 

600 # job stdout, stderr, htcondor user log. 

601 for key in ("output", "error", "log"): 

602 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}") 

603 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key]) 

604 

605 htc_job_cmds.update( 

606 _handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], out_prefix) 

607 ) 

608 

609 # Add the job cmds dict to the job object. 

610 htc_job.add_job_cmds(htc_job_cmds) 

611 

612 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob)) 

613 

614 # Add job attributes to job. 

615 _LOG.debug("gwjob.attrs = %s", gwjob.attrs) 

616 htc_job.add_job_attrs(gwjob.attrs) 

617 htc_job.add_job_attrs(site_values["attrs"]) 

618 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)}) 

619 htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label}) 

620 

621 return htc_job 

622 

623 

624def _translate_job_cmds(cached_vals, generic_workflow, gwjob): 

625 """Translate the job data that are one to one mapping 

626 

627 Parameters 

628 ---------- 

629 cached_vals : `dict` [`str`, `Any`] 

630 Config values common to jobs with same label. 

631 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

632 Generic workflow that contains the job being converted. 

633 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

634 Generic workflow job to be converted. 

635 

636 Returns 

637 ------- 

638 htc_job_commands : `dict` [`str`, `Any`] 

639 Contains commands which can appear in the HTCondor submit description 

640 file. 

641 """ 

642 # Values in the job script that are just name mappings. 

643 job_translation = { 

644 "mail_to": "notify_user", 

645 "when_to_mail": "notification", 

646 "request_cpus": "request_cpus", 

647 "priority": "priority", 

648 "category": "category", 

649 "accounting_group": "accounting_group", 

650 "accounting_user": "accounting_group_user", 

651 } 

652 

653 jobcmds = {} 

654 for gwkey, htckey in job_translation.items(): 

655 jobcmds[htckey] = getattr(gwjob, gwkey, None) 

656 

657 # If accounting info was not set explicitly, use site settings if any. 

658 if not gwjob.accounting_group: 

659 jobcmds["accounting_group"] = cached_vals.get("accountingGroup") 

660 if not gwjob.accounting_user: 

661 jobcmds["accounting_group_user"] = cached_vals.get("accountingUser") 

662 

663 # job commands that need modification 

664 if gwjob.number_of_retries: 

665 jobcmds["max_retries"] = f"{gwjob.number_of_retries}" 

666 

667 if gwjob.retry_unless_exit: 

668 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}" 

669 

670 if gwjob.request_disk: 

671 jobcmds["request_disk"] = f"{gwjob.request_disk}MB" 

672 

673 if gwjob.request_memory: 

674 jobcmds["request_memory"] = f"{gwjob.request_memory}" 

675 

676 if gwjob.memory_multiplier: 

677 # Do not use try-except! At the moment, BpsConfig returns an empty 

678 # string if it does not contain the key. 

679 memory_limit = cached_vals["memoryLimit"] 

680 if not memory_limit: 

681 raise RuntimeError( 

682 "Memory autoscaling enabled, but automatic detection of the memory limit " 

683 "failed; setting it explicitly with 'memoryLimit' or changing worker node " 

684 "search pattern 'executeMachinesPattern' might help." 

685 ) 

686 

687 # Set maximal amount of memory job can ask for. 

688 # 

689 # The check below assumes that 'memory_limit' was set to a value which 

690 # realistically reflects actual physical limitations of a given compute 

691 # resource. 

692 memory_max = memory_limit 

693 if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit: 

694 memory_max = gwjob.request_memory_max 

695 

696 # Make the job ask for more memory each time it fails due to 

697 # insufficient memory. 

698 jobcmds["request_memory"] = _create_request_memory_expr( 

699 gwjob.request_memory, gwjob.memory_multiplier, memory_max 

700 ) 

701 

702 # Periodically release jobs which are being held due to exceeding 

703 # memory. Stop doing that (by removing the job from the HTCondor queue) 

704 # after the maximal number of retries has been reached or the job was 

705 # already run at maximal allowed memory. 

706 jobcmds["periodic_release"] = _create_periodic_release_expr( 

707 gwjob.request_memory, gwjob.memory_multiplier, memory_max 

708 ) 

709 jobcmds["periodic_remove"] = _create_periodic_remove_expr( 

710 gwjob.request_memory, gwjob.memory_multiplier, memory_max 

711 ) 

712 

713 # Assume concurrency_limit implemented using HTCondor concurrency limits. 

714 # May need to move to special site-specific implementation if sites use 

715 # other mechanisms. 

716 if gwjob.concurrency_limit: 

717 jobcmds["concurrency_limit"] = gwjob.concurrency_limit 

718 

719 # Handle command line 

720 if gwjob.executable.transfer_executable: 

721 jobcmds["transfer_executable"] = "True" 

722 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri) 

723 else: 

724 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri) 

725 

726 if gwjob.arguments: 

727 arguments = gwjob.arguments 

728 arguments = _replace_cmd_vars(arguments, gwjob) 

729 arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob) 

730 arguments = _fix_env_var_syntax(arguments) 

731 jobcmds["arguments"] = arguments 

732 

733 # Add extra "pass-thru" job commands 

734 if gwjob.profile: 

735 for key, val in gwjob.profile.items(): 

736 jobcmds[key] = htc_escape(val) 

737 for key, val in cached_vals["profile"].items(): 

738 jobcmds[key] = htc_escape(val) 

739 

740 return jobcmds 

741 

742 
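# Illustrative sketch (not part of the original module): the kind of mapping
# _translate_job_cmds() returns for a simple job without memory autoscaling.
# All values are made up for illustration only.
_EXAMPLE_TRANSLATED_JOB_CMDS = {
    "request_cpus": 1,
    "request_memory": "2048",
    "request_disk": "4096MB",
    "max_retries": "3",
    "priority": None,  # unset generic values pass through as None
    "executable": "$ENV(CTRL_MPEXEC_DIR)/bin/pipetask",
    "arguments": "run -b repo/butler.yaml -i u/me/inputs -o u/me/run1",
}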

743def _translate_dag_cmds(gwjob): 

744 """Translate job values into DAGMan commands. 

745 

746 Parameters 

747 ---------- 

748 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

749 Job containing values to be translated. 

750 

751 Returns 

752 ------- 

753 dagcmds : `dict` [`str`, `Any`] 

754 DAGMan commands for the job. 

755 """ 

756 # Values in the dag script that are just name mappings. 

757 dag_translation = {"abort_on_value": "abort_dag_on", "abort_return_value": "abort_exit"} 

758 

759 dagcmds = {} 

760 for gwkey, htckey in dag_translation.items(): 

761 dagcmds[htckey] = getattr(gwjob, gwkey, None) 

762 

763 # Still to be coded: vars "pre_cmdline", "post_cmdline" 

764 return dagcmds 

765 

766 

767def _fix_env_var_syntax(oldstr): 

768 """Change ENV place holders to HTCondor Env var syntax. 

769 

770 Parameters 

771 ---------- 

772 oldstr : `str` 

773 String in which environment variable syntax is to be fixed. 

774 

775 Returns 

776 ------- 

777 newstr : `str` 

778 Given string with environment variable syntax fixed. 

779 """ 

780 newstr = oldstr 

781 for key in re.findall(r"<ENV:([^>]+)>", oldstr): 

782 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})") 

783 return newstr 

784 

785 
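# Illustrative sketch (not part of the original module): how the regex above
# rewrites <ENV:...> placeholders into HTCondor's $ENV() syntax. The command
# line is made up for illustration.
def _example_fix_env_var_syntax():
    cmd = "<ENV:CTRL_MPEXEC_DIR>/bin/pipetask --long-log <ENV:EXTRA_ARGS>"
    # Returns "$ENV(CTRL_MPEXEC_DIR)/bin/pipetask --long-log $ENV(EXTRA_ARGS)"
    return _fix_env_var_syntax(cmd)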

786def _replace_file_vars(use_shared, arguments, workflow, gwjob): 

787 """Replace file placeholders in command line arguments with correct 

788 physical file names. 

789 

790 Parameters 

791 ---------- 

792 use_shared : `bool` 

793 Whether HTCondor can assume shared filesystem. 

794 arguments : `str` 

795 Arguments string in which to replace file placeholders. 

796 workflow : `lsst.ctrl.bps.GenericWorkflow` 

797 Generic workflow that contains file information. 

798 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

799 The job corresponding to the arguments. 

800 

801 Returns 

802 ------- 

803 arguments : `str` 

804 Given arguments string with file placeholders replaced. 

805 """ 

806 # Replace input file placeholders with paths. 

807 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False): 

808 if not gwfile.wms_transfer: 

809 # Must assume full URI if in command line and told WMS is not 

810 # responsible for transferring file. 

811 uri = gwfile.src_uri 

812 elif use_shared: 

813 if gwfile.job_shared: 

814 # Have shared filesystems and jobs can share file. 

815 uri = gwfile.src_uri 

816 else: 

817 # Taking advantage of inside knowledge. Not future-proof. 

818 # Temporary fix until there is a job wrapper that pulls files 

819 # within the job. 

820 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml": 

821 uri = "butler.yaml" 

822 else: 

823 uri = os.path.basename(gwfile.src_uri) 

824 else: # Using push transfer 

825 uri = os.path.basename(gwfile.src_uri) 

826 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

827 

828 # Replace output file placeholders with paths. 

829 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False): 

830 if not gwfile.wms_transfer: 

831 # Must assume full URI if in command line and told WMS is not 

832 # responsible for transferring file. 

833 uri = gwfile.src_uri 

834 elif use_shared: 

835 if gwfile.job_shared: 

836 # Have shared filesystems and jobs can share file. 

837 uri = gwfile.src_uri 

838 else: 

839 uri = os.path.basename(gwfile.src_uri) 

840 else: # Using push transfer 

841 uri = os.path.basename(gwfile.src_uri) 

842 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

843 return arguments 

844 

845 

846def _replace_cmd_vars(arguments, gwjob): 

847 """Replace format-style placeholders in arguments. 

848 

849 Parameters 

850 ---------- 

851 arguments : `str` 

852 Arguments string in which to replace placeholders. 

853 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

854 Job containing values to be used to replace placeholders 

855 (in particular gwjob.cmdvals). 

856 

857 Returns 

858 ------- 

859 arguments : `str` 

860 Given arguments string with placeholders replaced. 

861 """ 

862 try: 

863 arguments = arguments.format(**gwjob.cmdvals) 

864 except (KeyError, TypeError): # TypeError in case None instead of {} 

865 _LOG.error( 

866 "Could not replace command variables:\narguments: %s\ncmdvals: %s", arguments, gwjob.cmdvals 

867 ) 

868 raise 

869 return arguments 

870 

871 
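# Illustrative sketch (not part of the original module): the substitution done
# by _replace_cmd_vars() is plain str.format() driven by gwjob.cmdvals. The
# template and values below are made up for illustration.
def _example_replace_cmd_vars():
    arguments = "run -b {butlerConfig} -i {inCollection} -o {outputRun}"
    cmdvals = {
        "butlerConfig": "repo/butler.yaml",
        "inCollection": "u/me/inputs",
        "outputRun": "u/me/run1",
    }
    # Returns "run -b repo/butler.yaml -i u/me/inputs -o u/me/run1"
    return arguments.format(**cmdvals)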

872def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str): 

873 """Add job input files from generic workflow to job. 

874 

875 Parameters 

876 ---------- 

877 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

878 The generic workflow (e.g., has executable name and arguments). 

879 job_name : `str` 

880 Unique name for the job. 

881 use_shared : `bool` 

882 Whether job has access to files via shared filesystem. 

883 out_prefix : `str` 

884 The root directory into which all WMS-specific files are written. 

885 

886 Returns 

887 ------- 

888 htc_commands : `dict` [`str`, `str`] 

889 HTCondor commands for the job submission script. 

890 """ 

891 htc_commands = {} 

892 inputs = [] 

893 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True): 

894 _LOG.debug("src_uri=%s", gwf_file.src_uri) 

895 

896 uri = Path(gwf_file.src_uri) 

897 

898 # Note if use_shared and job_shared, don't need to transfer file. 

899 

900 if not use_shared: # Copy file using push to job 

901 inputs.append(str(uri.relative_to(out_prefix))) 

902 elif not gwf_file.job_shared: # Jobs require own copy 

903 

904 # Using a shared filesystem, but the job still needs its own copy. 

905 # Use HTCondor's curl plugin to make a local copy. 

906 

907 # Execution butler is represented as a directory which the 

908 # curl plugin does not handle. Taking advantage of inside 

909 # knowledge for a temporary fix until there is a job wrapper 

910 # that pulls files within the job. 

911 if gwf_file.name == "butlerConfig": 

912 # The execution butler directory doesn't normally exist until 

913 # the submit phase so checking for suffix instead of using 

914 # is_dir(). If other non-YAML files exist, they would have a 

915 # different gwf_file.name. 

916 if uri.suffix == ".yaml": # Single file, so just copy. 

917 inputs.append(f"file://{uri}") 

918 else: 

919 inputs.append(f"file://{uri / 'butler.yaml'}") 

920 inputs.append(f"file://{uri / 'gen3.sqlite3'}") 

921 elif uri.is_dir(): 

922 raise RuntimeError( 

923 "HTCondor plugin cannot transfer directories locally within job " f"{gwf_file.src_uri}" 

924 ) 

925 else: 

926 inputs.append(f"file://{uri}") 

927 

928 if inputs: 

929 htc_commands["transfer_input_files"] = ",".join(inputs) 

930 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"]) 

931 return htc_commands 

932 

933 
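# Illustrative sketch (not part of the original module): the kind of command
# _handle_job_inputs() returns when a shared filesystem is available but the
# execution butler still needs a per-job copy via the curl plugin. Paths are
# made up for illustration.
_EXAMPLE_INPUT_TRANSFER_CMDS = {
    "transfer_input_files": (
        "file:///submit/u/me/run1/EXEC_REPO-run1/butler.yaml,"
        "file:///submit/u/me/run1/EXEC_REPO-run1/gen3.sqlite3"
    ),
}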

934def _report_from_path(wms_path): 

935 """Gather run information from a given run directory. 

936 

937 Parameters 

938 ---------- 

939 wms_path : `str` 

940 The directory containing the submit side files (e.g., HTCondor files). 

941 

942 Returns 

943 ------- 

944 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

945 Run information for the detailed report. The key is the HTCondor id 

946 and the value is a collection of report information for that run. 

947 message : `str` 

948 Message to be printed with the summary report. 

949 """ 

950 wms_workflow_id, jobs, message = _get_info_from_path(wms_path) 

951 if wms_workflow_id == MISSING_ID: 

952 run_reports = {} 

953 else: 

954 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs) 

955 return run_reports, message 

956 

957 

958def _report_from_id(wms_workflow_id, hist, schedds=None): 

959 """Gather run information using workflow id. 

960 

961 Parameters 

962 ---------- 

963 wms_workflow_id : `str` 

964 Limit to specific run based on id. 

965 hist : `float` 

966 Limit history search to this many days. 

967 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional 

968 HTCondor schedulers to query for job information. If None 

969 (default), all queries will be run against the local scheduler only. 

970 

971 Returns 

972 ------- 

973 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

974 Run information for the detailed report. The key is the HTCondor id 

975 and the value is a collection of report information for that run. 

976 message : `str` 

977 Message to be printed with the summary report. 

978 """ 

979 messages = [] 

980 

981 # Collect information about the job by querying HTCondor schedd and 

982 # HTCondor history. 

983 schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds) 

984 if len(schedd_dag_info) == 1: 

985 

986 # Extract the DAG info without altering the results of the query. 

987 schedd_name = next(iter(schedd_dag_info)) 

988 dag_id = next(iter(schedd_dag_info[schedd_name])) 

989 dag_ad = schedd_dag_info[schedd_name][dag_id] 

990 

991 # If the provided workflow id does not correspond to the one extracted 

992 # from the DAGMan log file in the submit directory, rerun the query 

993 # with the id found in the file. 

994 # 

995 # This is to cover the situation in which the user provided the old job 

996 # id of a restarted run. 

997 try: 

998 path_dag_id, path_dag_ad = read_dag_log(dag_ad["Iwd"]) 

999 except FileNotFoundError as exc: 

1000 # At the moment missing DAGMan log is pretty much a fatal error. 

1001 # So empty the DAG info to finish early (see the if statement 

1002 # below). 

1003 schedd_dag_info.clear() 

1004 messages.append(f"Cannot create the report for '{dag_id}': {exc}") 

1005 else: 

1006 if path_dag_id != dag_id: 

1007 schedd_dag_info = _get_info_from_schedd(path_dag_id, hist, schedds) 

1008 messages.append( 

1009 f"WARNING: Found newer workflow executions in same submit directory as id '{dag_id}'. " 

1010 f"This normally occurs when a run is restarted. The report shown is for the most " 

1011 f"recent status with run id '{path_dag_id}'" 

1012 ) 

1013 

1014 if len(schedd_dag_info) == 0: 

1015 run_reports = {} 

1016 elif len(schedd_dag_info) == 1: 

1017 _, dag_info = schedd_dag_info.popitem() 

1018 dag_id, dag_ad = dag_info.popitem() 

1019 

1020 # Create a mapping between jobs and their classads. The keys will 

1021 # be of format 'ClusterId.ProcId'. 

1022 job_info = {dag_id: dag_ad} 

1023 

1024 # Find jobs (nodes) belonging to that DAGMan job. 

1025 job_constraint = f"DAGManJobId == {int(float(dag_id))}" 

1026 schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds) 

1027 if schedd_job_info: 

1028 _, node_info = schedd_job_info.popitem() 

1029 job_info.update(node_info) 

1030 

1031 # Collect additional pieces of information about jobs using HTCondor 

1032 # files in the submission directory. 

1033 _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"]) 

1034 _update_jobs(job_info, path_jobs) 

1035 if message: 

1036 messages.append(message) 

1037 run_reports = _create_detailed_report_from_jobs(dag_id, job_info) 

1038 else: 

1039 ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()] 

1040 message = ( 

1041 f"More than one job matches id '{wms_workflow_id}', " 

1042 f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids" 

1043 ) 

1044 messages.append(message) 

1045 run_reports = {} 

1046 

1047 message = "\n".join(messages) 

1048 return run_reports, message 

1049 

1050 

1051def _get_info_from_schedd(wms_workflow_id, hist, schedds): 

1052 """Gather run information from HTCondor. 

1053 

1054 Parameters 

1055 ---------- 

1056 wms_workflow_id : `str` 

1057 Limit to specific run based on id. 

1058 hist : `float` 

1059 Limit history search to this many days. 

1060 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional 

1061 HTCondor schedulers to query for job information. If None 

1062 (default), all queries will be run against the local scheduler only. 

1063 

1064 Returns 

1065 ------- 

1066 schedd_dag_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, `Any`]]] 

1067 Information about jobs satisfying the search criteria where for each 

1068 Scheduler, local HTCondor job ids are mapped to their respective 

1069 classads. 

1070 """ 

1071 dag_constraint = 'regexp("dagman$", Cmd)' 

1072 try: 

1073 cluster_id = int(float(wms_workflow_id)) 

1074 except ValueError: 

1075 dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"' 

1076 else: 

1077 dag_constraint += f" && ClusterId == {cluster_id}" 

1078 

1079 # With the current implementation of the condor_* functions the query 

1080 # will always return only one match per Scheduler. 

1081 # 

1082 # Even in the highly unlikely situation where HTCondor history (which 

1083 # condor_search queries too) is long enough to have jobs from before 

1084 # the cluster ids were rolled over (and as a result there is more than 

1085 # one job with the same cluster id) they will not show up in 

1086 # the results. 

1087 schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds) 

1088 return schedd_dag_info 

1089 

1090 

1091def _get_info_from_path(wms_path): 

1092 """Gather run information from a given run directory. 

1093 

1094 Parameters 

1095 ---------- 

1096 wms_path : `str` 

1097 Directory containing HTCondor files. 

1098 

1099 Returns 

1100 ------- 

1101 wms_workflow_id : `str` 

1102 The run id, which is a DAGMan job id. 

1103 jobs : `dict` [`str`, `dict` [`str`, `Any`]] 

1104 Information about jobs read from files in the given directory. 

1105 The key is the HTCondor id and the value is a dictionary of HTCondor 

1106 keys and values. 

1107 message : `str` 

1108 Message to be printed with the summary report. 

1109 """ 

1110 messages = [] 

1111 try: 

1112 wms_workflow_id, jobs = read_dag_log(wms_path) 

1113 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs) 

1114 _update_jobs(jobs, read_node_status(wms_path)) 

1115 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs) 

1116 

1117 # Add more info for DAGman job 

1118 job = jobs[wms_workflow_id] 

1119 job.update(read_dag_status(wms_path)) 

1120 

1121 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs) 

1122 if "bps_run" not in job: 

1123 _add_run_info(wms_path, job) 

1124 

1125 message = htc_check_dagman_output(wms_path) 

1126 if message: 

1127 messages.append(message) 

1128 _LOG.debug( 

1129 "_get_info: id = %s, total_jobs = %s", wms_workflow_id, jobs[wms_workflow_id]["total_jobs"] 

1130 ) 

1131 

1132 # Add extra pieces of information which cannot be found in HTCondor 

1133 # generated files like 'GlobalJobId'. 

1134 # 

1135 # Do not treat absence of this file as a serious error. Neither runs 

1136 # submitted with earlier versions of the plugin nor the runs submitted 

1137 # with the Pegasus plugin will have it at the moment. However, once 

1138 # the Pegasus plugin has its own report() method (instead of sneakily 

1139 # using HTCondor's), the lack of that file should be treated as 

1140 # seriously as the lack of any other file. 

1141 try: 

1142 job_info = read_dag_info(wms_path) 

1143 except FileNotFoundError as exc: 

1144 message = f"Warn: Some information may not be available: {exc}" 

1145 messages.append(message) 

1146 else: 

1147 schedd_name = next(iter(job_info)) 

1148 job_ad = next(iter(job_info[schedd_name].values())) 

1149 job.update(job_ad) 

1150 except FileNotFoundError: 

1151 message = f"Could not find HTCondor files in '{wms_path}'" 

1152 _LOG.warning(message) 

1153 messages.append(message) 

1154 wms_workflow_id = MISSING_ID 

1155 jobs = {} 

1156 

1157 message = "\n".join([msg for msg in messages if msg]) 

1158 return wms_workflow_id, jobs, message 

1159 

1160 

1161def _create_detailed_report_from_jobs(wms_workflow_id, jobs): 

1162 """Gather run information to be used in generating summary reports. 

1163 

1164 Parameters 

1165 ---------- 

1166 wms_workflow_id : `str` 

1167 The run id to create the report for. 

1168 jobs : `dict` [`str`, `dict` [`str`, Any]] 

1169 Mapping HTCondor job id to job information. 

1170 

1171 Returns 

1172 ------- 

1173 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1174 Run information for the detailed report. The key is the given HTCondor 

1175 id and the value is a collection of report information for that run. 

1176 """ 

1177 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id]) 

1178 dag_job = jobs[wms_workflow_id] 

1179 report = WmsRunReport( 

1180 wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}", 

1181 global_wms_id=dag_job.get("GlobalJobId", "MISS"), 

1182 path=dag_job["Iwd"], 

1183 label=dag_job.get("bps_job_label", "MISS"), 

1184 run=dag_job.get("bps_run", "MISS"), 

1185 project=dag_job.get("bps_project", "MISS"), 

1186 campaign=dag_job.get("bps_campaign", "MISS"), 

1187 payload=dag_job.get("bps_payload", "MISS"), 

1188 operator=_get_owner(dag_job), 

1189 run_summary=_get_run_summary(dag_job), 

1190 state=_htc_status_to_wms_state(dag_job), 

1191 jobs=[], 

1192 total_number_jobs=dag_job["total_jobs"], 

1193 job_state_counts=dag_job["state_counts"], 

1194 ) 

1195 

1196 for job_id, job_info in jobs.items(): 

1197 try: 

1198 if job_info["ClusterId"] != int(float(wms_workflow_id)): 

1199 job_report = WmsJobReport( 

1200 wms_id=job_id, 

1201 name=job_info.get("DAGNodeName", job_id), 

1202 label=job_info.get("bps_job_label", pegasus_name_to_label(job_info["DAGNodeName"])), 

1203 state=_htc_status_to_wms_state(job_info), 

1204 ) 

1205 if job_report.label == "init": 

1206 job_report.label = "pipetaskInit" 

1207 report.jobs.append(job_report) 

1208 except KeyError as ex: 

1209 _LOG.error("Job missing key '%s': %s", str(ex), job_info) 

1210 raise 

1211 

1212 run_reports = {report.wms_id: report} 

1213 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports) 

1214 return run_reports 

1215 

1216 

1217def _summary_report(user, hist, pass_thru, schedds=None): 

1218 """Gather run information to be used in generating summary reports. 

1219 

1220 Parameters 

1221 ---------- 

1222 user : `str` 

1223 Run lookup restricted to given user. 

1224 hist : `float` 

1225 How many previous days to search for run information. 

1226 pass_thru : `str` 

1227 Advanced users can define the HTCondor constraint to be used 

1228 when searching queue and history. 

1229 

1230 Returns 

1231 ------- 

1232 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1233 Run information for the summary report. The keys are HTCondor ids and 

1234 the values are collections of report information for each run. 

1235 message : `str` 

1236 Message to be printed with the summary report. 

1237 """ 

1238 # only doing summary report so only look for dagman jobs 

1239 if pass_thru: 

1240 constraint = pass_thru 

1241 else: 

1242 # Notes: 

1243 # * bps_isjob == 'True' isn't getting set for DAG jobs that are 

1244 # manually restarted. 

1245 # * Any job with DAGManJobID isn't a DAG job 

1246 constraint = 'bps_isjob == "True" && JobUniverse == 7' 

1247 if user: 

1248 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")' 

1249 

1250 job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds) 

1251 

1252 # Have list of DAGMan jobs, need to get run_report info. 

1253 run_reports = {} 

1254 for jobs in job_info.values(): 

1255 for job_id, job in jobs.items(): 

1256 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1257 # If info wasn't available from the queue (e.g., Kerberos bug), 

1258 # try reading from file. 

1259 if total_jobs == 0: 

1260 try: 

1261 job.update(read_dag_status(job["Iwd"])) 

1262 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1263 except StopIteration: 

1264 pass # Don't kill the report if HTCondor files can't be found. 

1265 

1266 if "bps_run" not in job: 

1267 _add_run_info(job["Iwd"], job) 

1268 report = WmsRunReport( 

1269 wms_id=job_id, 

1270 global_wms_id=job["GlobalJobId"], 

1271 path=job["Iwd"], 

1272 label=job.get("bps_job_label", "MISS"), 

1273 run=job.get("bps_run", "MISS"), 

1274 project=job.get("bps_project", "MISS"), 

1275 campaign=job.get("bps_campaign", "MISS"), 

1276 payload=job.get("bps_payload", "MISS"), 

1277 operator=_get_owner(job), 

1278 run_summary=_get_run_summary(job), 

1279 state=_htc_status_to_wms_state(job), 

1280 jobs=[], 

1281 total_number_jobs=total_jobs, 

1282 job_state_counts=state_counts, 

1283 ) 

1284 run_reports[report.global_wms_id] = report 

1285 

1286 return run_reports, "" 

1287 

1288 

1289def _add_run_info(wms_path, job): 

1290 """Find BPS run information elsewhere for runs without bps attributes. 

1291 

1292 Parameters 

1293 ---------- 

1294 wms_path : `str` 

1295 Path to submit files for the run. 

1296 job : `dict` [`str`, `Any`] 

1297 HTCondor dag job information. 

1298 

1299 Raises 

1300 ------ 

1301 StopIteration 

1302 If cannot find file it is looking for. Permission errors are 

1303 caught and job's run is marked with error. 

1304 """ 

1305 path = Path(wms_path) / "jobs" 

1306 try: 

1307 subfile = next(path.glob("**/*.sub")) 

1308 except (StopIteration, PermissionError): 

1309 job["bps_run"] = "Unavailable" 

1310 else: 

1311 _LOG.debug("_add_run_info: subfile = %s", subfile) 

1312 try: 

1313 with open(subfile, "r", encoding="utf-8") as fh: 

1314 for line in fh: 

1315 if line.startswith("+bps_"): 

1316 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line) 

1317 if m: 

1318 _LOG.debug("Matching line: %s", line) 

1319 job[m.group(1)] = m.group(2).replace('"', "") 

1320 else: 

1321 _LOG.debug("Could not parse attribute: %s", line) 

1322 except PermissionError: 

1323 job["bps_run"] = "PermissionError" 

1324 _LOG.debug("After adding job = %s", job) 

1325 

1326 
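# Illustrative sketch (not part of the original module): the attribute parsing
# done in _add_run_info() above. The submit-file line is made up for
# illustration.
def _example_parse_bps_attribute():
    line = '+bps_run = "u/me/run1"'
    m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
    # m.group(1) == "bps_run"; m.group(2).replace('"', "") == "u/me/run1"
    return m.group(1), m.group(2).replace('"', "")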

1327def _get_owner(job): 

1328 """Get the owner of a dag job. 

1329 

1330 Parameters 

1331 ---------- 

1332 job : `dict` [`str`, `Any`] 

1333 HTCondor dag job information. 

1334 

1335 Returns 

1336 ------- 

1337 owner : `str` 

1338 Owner of the dag job. 

1339 """ 

1340 owner = job.get("bps_operator", None) 

1341 if not owner: 

1342 owner = job.get("Owner", None) 

1343 if not owner: 

1344 _LOG.warning("Could not get Owner from htcondor job: %s", job) 

1345 owner = "MISS" 

1346 return owner 

1347 

1348 

1349def _get_run_summary(job): 

1350 """Get the run summary for a job. 

1351 

1352 Parameters 

1353 ---------- 

1354 job : `dict` [`str`, `Any`] 

1355 HTCondor dag job information. 

1356 

1357 Returns 

1358 ------- 

1359 summary : `str` 

1360 Number of jobs per PipelineTask label in approximate pipeline order. 

1361 Format: <label>:<count>[;<label>:<count>]+ 

1362 """ 

1363 summary = job.get("bps_job_summary", job.get("bps_run_summary", None)) 

1364 if not summary: 

1365 summary, _ = summary_from_dag(job["Iwd"]) 

1366 if not summary: 

1367 _LOG.warning("Could not get run summary for htcondor job: %s", job) 

1368 _LOG.debug("_get_run_summary: summary=%s", summary) 

1369 

1370 # Work around summaries sometimes using init instead of pipetaskInit. 

1371 summary = summary.replace("init:", "pipetaskInit:") 

1372 

1373 if "pegasus_version" in job and "pegasus" not in summary: 

1374 summary += ";pegasus:0" 

1375 

1376 return summary 

1377 

1378 

1379def _get_state_counts_from_jobs(wms_workflow_id, jobs): 

1380 """Count number of jobs per WMS state. 

1381 

1382 Parameters 

1383 ---------- 

1384 wms_workflow_id : `str` 

1385 HTCondor job id. 

1386 jobs : `dict` [`str`, `Any`] 

1387 HTCondor dag job information. 

1388 

1389 Returns 

1390 ------- 

1391 total_count : `int` 

1392 Total number of dag nodes. 

1393 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1394 Keys are the different WMS states and values are counts of jobs 

1395 that are in that WMS state. 

1396 """ 

1397 state_counts = dict.fromkeys(WmsStates, 0) 

1398 

1399 for jid, jinfo in jobs.items(): 

1400 if jid != wms_workflow_id: 

1401 state_counts[_htc_status_to_wms_state(jinfo)] += 1 

1402 

1403 total_counted = sum(state_counts.values()) 

1404 if "NodesTotal" in jobs[wms_workflow_id]: 

1405 total_count = jobs[wms_workflow_id]["NodesTotal"] 

1406 else: 

1407 total_count = total_counted 

1408 

1409 state_counts[WmsStates.UNREADY] += total_count - total_counted 

1410 

1411 return total_count, state_counts 

1412 

1413 
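# Illustrative sketch (not part of the original module): counting WMS states
# for a tiny, made-up jobs mapping. The DAGMan job itself ("9123.0") is left
# out of the per-node counts and its NodesTotal tops up the UNREADY count.
def _example_state_counts():
    jobs = {
        "9123.0": {"ClusterId": 9123, "NodesTotal": 3},
        "9124.0": {"ClusterId": 9124, "JobStatus": JobStatus.COMPLETED},
        "9125.0": {"ClusterId": 9125, "JobStatus": JobStatus.RUNNING},
    }
    # Returns (3, counts) with 1 SUCCEEDED, 1 RUNNING, and 1 UNREADY.
    return _get_state_counts_from_jobs("9123.0", jobs)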

1414def _get_state_counts_from_dag_job(job): 

1415 """Count number of jobs per WMS state. 

1416 

1417 Parameters 

1418 ---------- 

1419 job : `dict` [`str`, `Any`] 

1420 HTCondor dag job information. 

1421 

1422 Returns 

1423 ------- 

1424 total_count : `int` 

1425 Total number of dag nodes. 

1426 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1427 Keys are the different WMS states and values are counts of jobs 

1428 that are in that WMS state. 

1429 """ 

1430 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job)) 

1431 state_counts = dict.fromkeys(WmsStates, 0) 

1432 if "DAG_NodesReady" in job: 

1433 state_counts = { 

1434 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0), 

1435 WmsStates.READY: job.get("DAG_NodesReady", 0), 

1436 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1437 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0), 

1438 WmsStates.FAILED: job.get("DAG_NodesFailed", 0), 

1439 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0), 

1440 } 

1441 total_jobs = job.get("DAG_NodesTotal") 

1442 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs) 

1443 elif "NodesFailed" in job: 

1444 state_counts = { 

1445 WmsStates.UNREADY: job.get("NodesUnready", 0), 

1446 WmsStates.READY: job.get("NodesReady", 0), 

1447 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1448 WmsStates.SUCCEEDED: job.get("NodesDone", 0), 

1449 WmsStates.FAILED: job.get("NodesFailed", 0), 

1450 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0), 

1451 } 

1452 try: 

1453 total_jobs = job["NodesTotal"] 

1454 except KeyError as ex: 

1455 _LOG.error("Job missing %s. job = %s", str(ex), job) 

1456 raise 

1457 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs) 

1458 else: 

1459 # With Kerberos job auth and the Kerberos bug, a warning here would 

1460 # be printed for every DAG, so log at debug level instead. 

1461 _LOG.debug("Can't get job state counts %s", job["Iwd"]) 

1462 total_jobs = 0 

1463 

1464 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts) 

1465 return total_jobs, state_counts 

1466 

1467 

1468def _htc_status_to_wms_state(job): 

1469 """Convert HTCondor job status to generic wms state. 

1470 

1471 Parameters 

1472 ---------- 

1473 job : `dict` [`str`, `Any`] 

1474 HTCondor job information. 

1475 

1476 Returns 

1477 ------- 

1478 wms_state : `WmsStates` 

1479 The equivalent WmsState to given job's status. 

1480 """ 

1481 wms_state = WmsStates.MISFIT 

1482 if "JobStatus" in job: 

1483 wms_state = _htc_job_status_to_wms_state(job) 

1484 elif "NodeStatus" in job: 

1485 wms_state = _htc_node_status_to_wms_state(job) 

1486 return wms_state 

1487 

1488 

1489def _htc_job_status_to_wms_state(job): 

1490 """Convert HTCondor job status to generic wms state. 

1491 

1492 Parameters 

1493 ---------- 

1494 job : `dict` [`str`, `Any`] 

1495 HTCondor job information. 

1496 

1497 Returns 

1498 ------- 

1499 wms_state : `lsst.ctrl.bps.WmsStates` 

1500 The equivalent WmsState to given job's status. 

1501 """ 

1502 _LOG.debug( 

1503 "htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], type(job["JobStatus"]) 

1504 ) 

1505 job_status = int(job["JobStatus"]) 

1506 wms_state = WmsStates.MISFIT 

1507 

1508 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status) 

1509 if job_status == JobStatus.IDLE: 

1510 wms_state = WmsStates.PENDING 

1511 elif job_status == JobStatus.RUNNING: 

1512 wms_state = WmsStates.RUNNING 

1513 elif job_status == JobStatus.REMOVED: 

1514 wms_state = WmsStates.DELETED 

1515 elif job_status == JobStatus.COMPLETED: 

1516 if ( 

1517 job.get("ExitBySignal", False) 

1518 or job.get("ExitCode", 0) 

1519 or job.get("ExitSignal", 0) 

1520 or job.get("DAG_Status", 0) 

1521 or job.get("ReturnValue", 0) 

1522 ): 

1523 wms_state = WmsStates.FAILED 

1524 else: 

1525 wms_state = WmsStates.SUCCEEDED 

1526 elif job_status == JobStatus.HELD: 

1527 wms_state = WmsStates.HELD 

1528 

1529 return wms_state 

1530 

1531 

1532def _htc_node_status_to_wms_state(job): 

1533 """Convert HTCondor status to generic wms state. 

1534 

1535 Parameters 

1536 ---------- 

1537 job : `dict` [`str`, `Any`] 

1538 HTCondor job information. 

1539 

1540 Returns 

1541 ------- 

1542 wms_state : `lsst.ctrl.bps.WmsStates` 

1543 The equivalent WmsState to given node's status. 

1544 """ 

1545 wms_state = WmsStates.MISFIT 

1546 

1547 status = job["NodeStatus"] 

1548 if status == NodeStatus.NOT_READY: 

1549 wms_state = WmsStates.UNREADY 

1550 elif status == NodeStatus.READY: 

1551 wms_state = WmsStates.READY 

1552 elif status == NodeStatus.PRERUN: 

1553 wms_state = WmsStates.MISFIT 

1554 elif status == NodeStatus.SUBMITTED: 

1555 if job["JobProcsHeld"]: 

1556 wms_state = WmsStates.HELD 

1557 elif job["StatusDetails"] == "not_idle": 

1558 wms_state = WmsStates.RUNNING 

1559 elif job["JobProcsQueued"]: 

1560 wms_state = WmsStates.PENDING 

1561 elif status == NodeStatus.POSTRUN: 

1562 wms_state = WmsStates.MISFIT 

1563 elif status == NodeStatus.DONE: 

1564 wms_state = WmsStates.SUCCEEDED 

1565 elif status == NodeStatus.ERROR: 

1566 # Use the job's exit status instead of the post script's exit status. 

1567 if "DAGMAN error 0" in job["StatusDetails"]: 

1568 wms_state = WmsStates.SUCCEEDED 

1569 else: 

1570 wms_state = WmsStates.FAILED 

1571 

1572 return wms_state 

1573 

1574 
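# Editor's note: illustrative sketch, not part of the covered module; the node
# status ad is invented.  For a SUBMITTED node the finer classification comes
# from the queue details: held procs map to HELD, "not_idle" to RUNNING, and
# queued procs to PENDING.
#
# >>> node_ad = {"NodeStatus": NodeStatus.SUBMITTED, "JobProcsHeld": 0,
# ...            "StatusDetails": "not_idle", "JobProcsQueued": 0}
# >>> _htc_node_status_to_wms_state(node_ad) is WmsStates.RUNNING
# True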

1575def _update_jobs(jobs1, jobs2): 

1576 """Update jobs1 with info in jobs2. 

1577 

1578 (Basically an update for nested dictionaries.) 

1579 

1580 Parameters 

1581 ---------- 

1582 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]] 

1583 HTCondor job information to be updated. 

1584 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]] 

1585 Additional HTCondor job information. 

1586 """ 

1587 for jid, jinfo in jobs2.items(): 

1588 if jid in jobs1: 

1589 jobs1[jid].update(jinfo) 

1590 else: 

1591 jobs1[jid] = jinfo 

1592 

1593 
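# Editor's note: illustrative sketch, not part of the covered module; the job
# ids and ads are invented.
#
# >>> jobs = {"1.0": {"JobStatus": 1}}
# >>> _update_jobs(jobs, {"1.0": {"JobStatus": 2}, "2.0": {"JobStatus": 1}})
# >>> jobs["1.0"]["JobStatus"], sorted(jobs)
# (2, ['1.0', '2.0'])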

1594def _wms_id_type(wms_id): 

1595 """Determine the type of the WMS id. 

1596 

1597 Parameters 

1598 ---------- 

1599 wms_id : `str` 

1600 WMS id identifying a job. 

1601 

1602 Returns 

1603 ------- 

1604 id_type : `lsst.ctrl.bps.htcondor.WmsIdType` 

1605 Type of WMS id. 

1606 """ 

1607 try: 

1608 int(float(wms_id)) 

1609 except ValueError: 

1610 wms_path = Path(wms_id) 

1611 if wms_path.exists(): 

1612 id_type = WmsIdType.PATH 

1613 else: 

1614 id_type = WmsIdType.GLOBAL 

1615 except TypeError: 

1616 id_type = WmsIdType.UNKNOWN 

1617 else: 

1618 id_type = WmsIdType.LOCAL 

1619 return id_type 

1620 

1621 
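# Editor's note: illustrative sketch, not part of the covered module; the ids
# below are invented, and the GLOBAL and PATH results assume the strings do
# not (respectively do) name existing paths on disk when the function runs.
#
# >>> _wms_id_type("1234.0") is WmsIdType.LOCAL
# True
# >>> _wms_id_type("schedd.example.com#1234.0#1666469632") is WmsIdType.GLOBAL
# True
# >>> _wms_id_type("/some/existing/submit/dir") is WmsIdType.PATH
# True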

1622def _wms_id_to_cluster(wms_id): 

1623 """Convert WMS id to cluster id. 

1624 

1625 Parameters 

1626 ---------- 

1627 wms_id : `int` or `float` or `str` 

1628 HTCondor job id or path. 

1629 

1630 Returns 

1631 ------- 

1632 schedd_ad : `classad.ClassAd` 

1633 ClassAd describing the scheduler managing the job with the given id. 

1634 cluster_id : `int` 

1635 HTCondor cluster id. 

1636 id_type : `lsst.ctrl.bps.htcondor.WmsIdType` 

1637 The type of the provided id. 

1638 """ 

1639 coll = htcondor.Collector() 

1640 

1641 schedd_ad = None 

1642 cluster_id = None 

1643 id_type = _wms_id_type(wms_id) 

1644 if id_type == WmsIdType.LOCAL: 

1645 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1646 cluster_id = int(float(wms_id)) 

1647 elif id_type == WmsIdType.GLOBAL: 

1648 constraint = f'GlobalJobId == "{wms_id}"' 

1649 schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)} 

1650 schedds = [htcondor.Schedd(ad) for ad in schedd_ads.values()] 

1651 queries = [schedd.xquery(requirements=constraint, projection=["ClusterId"]) for schedd in schedds] 

1652 results = { 

1653 query.tag(): dict(ads[0]) 

1654 for query in htcondor.poll(queries) 

1655 if (ads := query.nextAdsNonBlocking()) 

1656 } 

1657 if results: 

1658 schedd_name = next(iter(results)) 

1659 schedd_ad = schedd_ads[schedd_name] 

1660 cluster_id = results[schedd_name]["ClusterId"] 

1661 elif id_type == WmsIdType.PATH: 

1662 try: 

1663 job_info = read_dag_info(wms_id) 

1664 except (FileNotFoundError, PermissionError, IOError): 

1665 pass 

1666 else: 

1667 schedd_name = next(iter(job_info)) 

1668 job_id = next(iter(job_info[schedd_name])) 

1669 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name) 

1670 cluster_id = int(float(job_id)) 

1671 else: 

1672 pass 

1673 return schedd_ad, cluster_id, id_type 

1674 

1675 
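# Editor's note: illustrative usage sketch, not part of the covered module; it
# needs access to a working HTCondor pool and the id below is invented.
#
#     schedd_ad, cluster_id, id_type = _wms_id_to_cluster("1234")
#     if cluster_id is None:
#         print(f"No job found for the given id (id type: {id_type.name})")
#     else:
#         print(f"Cluster {cluster_id} on {schedd_ad['Name']}")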

1676def _create_periodic_release_expr(memory, multiplier, limit): 

1677 """Construct an HTCondorAd expression for releasing held jobs. 

1678 

1679 The expression instructs HTCondor to release any job that was put on hold 

1680 due to exceeding its memory requirements back to the job queue, provided it 

1681 satisfies all of the conditions below: 

1682 

1683 * the number of run attempts did not reach the allowable number of retries, 

1684 * the memory requirements in the last failed run attempt did not reach 

1685 the specified memory limit. 

1686 

1687 Parameters 

1688 ---------- 

1689 memory : `int` 

1690 Requested memory in MB. 

1691 multiplier : `float` 

1692 Memory growth rate between retries. 

1693 limit : `int` 

1694 Memory limit in MB. 

1695 

1696 Returns 

1697 ------- 

1698 expr : `str` 

1699 A string representing an HTCondor ClassAd expression for releasing jobs 

1700 which have been held due to exceeding the memory requirements. 

1701 """ 

1702 is_retry_allowed = "NumJobStarts <= JobMaxRetries" 

1703 was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}" 

1704 

1705 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1706 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1707 # The special comparison operators ensure that all comparisons below will 

1708 # evaluate to FALSE in this case. 

1709 # 

1710 # Note: 

1711 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1712 # the entire expression should evaluate to FALSE when the job is not HELD. 

1713 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1714 # but better safe than sorry. 

1715 was_mem_exceeded = ( 

1716 "JobStatus == 5 " 

1717 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " 

1718 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1719 ) 

1720 

1721 expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}" 

1722 return expr 

1723 

1724 
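# Editor's note: worked example, not part of the covered module; the numbers
# are invented.  For memory=2048, multiplier=2.0, limit=8192 the release
# expression reads (reformatted for readability):
#
#     JobStatus == 5
#     && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#         || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#     && NumJobStarts <= JobMaxRetries
#     && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) < 8192
#
# i.e. the hold was memory related, retries remain, and the last attempt ran
# below the 8192 MB cap.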

1725def _create_periodic_remove_expr(memory, multiplier, limit): 

1726 """Construct an HTCondorAd expression for removing jobs from the queue. 

1727 

1728 The expression instructs HTCondor to remove any job that was put on hold 

1729 due to exceeding its memory requirements from the job queue, provided it 

1730 satisfies any of the conditions below: 

1731 

1732 * the allowable number of retries was reached, 

1733 * the memory requirements during the last failed run attempt reached 

1734 the specified memory limit. 

1735 

1736 Parameters 

1737 ---------- 

1738 memory : `int` 

1739 Requested memory in MB. 

1740 multiplier : `float` 

1741 Memory growth rate between retries. 

1742 limit : `int` 

1743 Memory limit in MB. 

1744 

1745 Returns 

1746 ------- 

1747 expr : `str` 

1748 A string representing an HTCondor ClassAd expression for removing jobs 

1749 which were run at the maximal allowable memory and still exceeded 

1750 the memory requirements. 

1751 """ 

1752 is_retry_disallowed = "NumJobStarts > JobMaxRetries" 

1753 was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}" 

1754 

1755 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1756 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1757 # The special comparison operators ensure that all comparisons below will 

1758 # evaluate to FALSE in this case. 

1759 # 

1760 # Note: 

1761 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1762 # the entire expression should evaluate to FALSE when the job is not HELD. 

1763 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1764 # but better safe than sorry. 

1765 was_mem_exceeded = ( 

1766 "JobStatus == 5 " 

1767 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " 

1768 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1769 ) 

1770 

1771 expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})" 

1772 return expr 

1773 

1774 
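# Editor's note: worked example, not part of the covered module; with the same
# invented numbers (memory=2048, multiplier=2.0, limit=8192) the remove
# expression keeps the memory-related hold check shown above but requires
#
#     NumJobStarts > JobMaxRetries
#     || min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) == 8192
#
# i.e. retries are exhausted or the last attempt already ran at the 8192 MB cap.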

1775def _create_request_memory_expr(memory, multiplier, limit): 

1776 """Construct an HTCondor ClassAd expression for safe memory scaling. 

1777 

1778 Parameters 

1779 ---------- 

1780 memory : `int` 

1781 Requested memory in MB. 

1782 multiplier : `float` 

1783 Memory growth rate between retries. 

1784 limit : `int` 

1785 Memory limit in MB. 

1786 

1787 Returns 

1788 ------- 

1789 expr : `str` 

1790 A string representing an HTCondor ClassAd expression enabling safe 

1791 memory scaling between job retries. 

1792 """ 

1793 # The check whether the job was held due to exceeding memory requirements 

1794 # is made *after* the job was released back to the job queue (i.e. it is in 

1795 # the IDLE state), hence the need to use `Last*` job ClassAds instead of 

1796 # the ones describing the job's current state. 

1797 # 

1798 # Also, 'Last*' job ClassAds attributes are UNDEFINED when a job is 

1799 # initially put in the job queue. The special comparison operators ensure 

1800 # that all comparisons below will evaluate to FALSE in this case. 

1801 was_mem_exceeded = ( 

1802 "LastJobStatus =?= 5 " 

1803 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " 

1804 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)" 

1805 ) 

1806 

1807 # If the job is running for the first time or was held for reasons other 

1808 # than exceeding the memory limit, set the required memory to the requested 

1809 # value or to the memory usage measured by HTCondor (MemoryUsage), 

1810 # whichever is greater. 

1811 expr = ( 

1812 f"({was_mem_exceeded}) " 

1813 f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) " 

1814 f": max({{{memory}, MemoryUsage ?: 0}})" 

1815 ) 

1816 return expr 

1817 

1818 
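# Editor's note: worked example, not part of the covered module; the numbers
# are invented.  With memory=2048, multiplier=2.0, limit=8192 the expression
# doubles the request after each memory-related hold and caps it at the limit:
#
#     first attempt (never held):       max({2048, MemoryUsage ?: 0})
#     after 1st memory hold (starts=1): min({int(2048 * pow(2.0, 1)), 8192}) -> 4096
#     after 2nd memory hold (starts=2): min({int(2048 * pow(2.0, 2)), 8192}) -> 8192
#     after 3rd memory hold (starts=3): min({int(2048 * pow(2.0, 3)), 8192}) -> 8192 (capped)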

1819def _locate_schedds(locate_all=False): 

1820 """Locate Scheduler daemons in an HTCondor pool. 

1821 

1822 Parameters 

1823 ---------- 

1824 locate_all : `bool`, optional 

1825 If True, all available Schedulers in the HTCondor pool will be located. 

1826 False by default, which means that the search will be limited to 

1827 the Scheduler running on the local host. 

1828 

1829 Returns 

1830 ------- 

1831 schedds : `dict` [`str`, `htcondor.Schedd`] 

1832 A mapping between Scheduler names and Python objects allowing for 

1833 interacting with them. 

1834 """ 

1835 coll = htcondor.Collector() 

1836 

1837 schedd_ads = [] 

1838 if locate_all: 

1839 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

1840 else: 

1841 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

1842 return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

1843 

1844 
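# Editor's note: illustrative usage sketch, not part of the covered module; it
# needs access to an HTCondor pool with a running collector.
#
#     schedds = _locate_schedds(locate_all=True)
#     for name in schedds:
#         print(f"Found Scheduler: {name}")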

1845def _gather_site_values(config, compute_site): 

1846 """Gather values specific to the given site. 

1847 

1848 Parameters 

1849 ---------- 

1850 config : `lsst.ctrl.bps.BpsConfig` 

1851 BPS configuration that includes necessary submit/runtime 

1852 information. 

1853 compute_site : `str` 

1854 Compute site name. 

1855 

1856 Returns 

1857 ------- 

1858 site_values : `dict` [`str`, `Any`] 

1859 Values specific to the given site. 

1860 """ 

1861 site_values = {"attrs": {}, "profile": {}} 

1862 search_opts = {} 

1863 if compute_site: 

1864 search_opts["curvals"] = {"curr_site": compute_site} 

1865 

1866 # Determine the hard limit for the memory requirement. 

1867 found, limit = config.search("memoryLimit", opt=search_opts) 

1868 if not found: 

1869 search_opts["default"] = DEFAULT_HTC_EXEC_PATT 

1870 _, patt = config.search("executeMachinesPattern", opt=search_opts) 

1871 del search_opts["default"] 

1872 

1873 # To reduce the amount of data, ignore dynamic slots (if any) as, 

1874 # by definition, they cannot have more memory than 

1875 # the partitionable slot they are part of. 

1876 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)' 

1877 pool_info = condor_status(constraint=constraint) 

1878 try: 

1879 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values()) 

1880 except ValueError: 

1881 _LOG.debug("No execute machine in the pool matches %s", patt) 

1882 if limit: 

1883 config[".bps_defined.memory_limit"] = limit 

1884 

1885 _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False}) 

1886 site_values["memoryLimit"] = limit 

1887 

1888 found, value = config.search("accountingGroup", opt=search_opts) 

1889 if found: 

1890 site_values["accountingGroup"] = value 

1891 found, value = config.search("accountingUser", opt=search_opts) 

1892 if found: 

1893 site_values["accountingUser"] = value 

1894 

1895 key = f".site.{compute_site}.profile.condor" 

1896 if key in config: 

1897 for key, val in config[key].items(): 

1898 if key.startswith("+"): 

1899 site_values["attrs"][key[1:]] = val 

1900 else: 

1901 site_values["profile"][key] = val 

1902 

1903 return site_values
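# Editor's note: illustrative sketch, not part of the covered module; the site
# name and ClassAd values are invented.  A BPS config fragment such as
#
#     site:
#       example_site:
#         profile:
#           condor:
#             +JobPriority: "10"
#             requirements: '(OpSys == "LINUX")'
#
# would, for compute_site="example_site", end up in
#
#     site_values["attrs"]   == {"JobPriority": "10"}
#     site_values["profile"] == {"requirements": '(OpSys == "LINUX")'}
#
# i.e. keys with a leading '+' become custom job attributes and the rest are
# kept as site-level profile settings.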