Coverage for python/lsst/ctrl/bps/htcondor/htcondor_service.py: 7%

743 statements  


1# This file is part of ctrl_bps_htcondor. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <https://www.gnu.org/licenses/>. 

27 

28"""Interface between generic workflow to HTCondor workflow system. 

29""" 

30 

31__all__ = ["HTCondorService", "HTCondorWorkflow"] 

32 

33 

34import logging 

35import os 

36import re 

37from collections import defaultdict 

38from enum import IntEnum, auto 

39from pathlib import Path 

40 

41import htcondor 

42from lsst.ctrl.bps import ( 

43 BaseWmsService, 

44 BaseWmsWorkflow, 

45 GenericWorkflow, 

46 GenericWorkflowJob, 

47 WmsJobReport, 

48 WmsRunReport, 

49 WmsStates, 

50) 

51from lsst.ctrl.bps.bps_utils import chdir, create_count_summary 

52from lsst.utils.timer import time_this 

53from packaging import version 

54 

55from .lssthtc import ( 

56 MISSING_ID, 

57 HTCDag, 

58 HTCJob, 

59 JobStatus, 

60 NodeStatus, 

61 condor_history, 

62 condor_q, 

63 condor_search, 

64 condor_status, 

65 htc_backup_files, 

66 htc_check_dagman_output, 

67 htc_create_submit_from_cmd, 

68 htc_create_submit_from_dag, 

69 htc_create_submit_from_file, 

70 htc_escape, 

71 htc_submit_dag, 

72 htc_version, 

73 pegasus_name_to_label, 

74 read_dag_info, 

75 read_dag_log, 

76 read_dag_status, 

77 read_node_status, 

78 summary_from_dag, 

79 write_dag_info, 

80) 

81 

82 

83class WmsIdType(IntEnum): 

84 """Type of valid WMS ids.""" 

85 

86 UNKNOWN = auto() 

87 """The type of id cannot be determined. 

88 """ 

89 

90 LOCAL = auto() 

91 """The id is HTCondor job's ClusterId (with optional '.ProcId'). 

92 """ 

93 

94 GLOBAL = auto() 

95 """Id is a HTCondor's global job id. 

96 """ 

97 

98 PATH = auto() 

99 """Id is a submission path. 

100 """ 

101 
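
# Illustrative id forms only (values and the global-id layout are assumptions,

# not taken from this module): a LOCAL id looks like "1234" or "1234.0"

# (ClusterId with optional ProcId), a GLOBAL id is HTCondor's GlobalJobId

# string, typically "<schedd>#<ClusterId>.<ProcId>#<timestamp>", and a PATH id

# is simply the run's submit directory.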

102 

103DEFAULT_HTC_EXEC_PATT = ".*worker.*" 

104"""Default pattern for searching execute machines in an HTCondor pool. 

105""" 

106 

107_LOG = logging.getLogger(__name__) 

108 

109 

110class HTCondorService(BaseWmsService): 

111 """HTCondor version of WMS service.""" 

112 

113 def prepare(self, config, generic_workflow, out_prefix=None): 

114 """Convert generic workflow to an HTCondor DAG ready for submission. 

115 

116 Parameters 

117 ---------- 

118 config : `lsst.ctrl.bps.BpsConfig` 

119 BPS configuration that includes necessary submit/runtime 

120 information. 

121 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

122 The generic workflow (e.g., has executable name and arguments). 

123 out_prefix : `str` 

124 The root directory into which all WMS-specific files are written. 

125 

126 Returns 

127 ------- 

128 workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow` 

129 HTCondor workflow ready to be run. 

130 """ 

131 _LOG.debug("out_prefix = '%s'", out_prefix) 

132 with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed HTCondor workflow creation"): 

133 workflow = HTCondorWorkflow.from_generic_workflow( 

134 config, 

135 generic_workflow, 

136 out_prefix, 

137 f"{self.__class__.__module__}.{self.__class__.__name__}", 

138 ) 

139 

140 with time_this( 

141 log=_LOG, level=logging.INFO, prefix=None, msg="Completed writing out HTCondor workflow" 

142 ): 

143 workflow.write(out_prefix) 

144 return workflow 

145 

146 def submit(self, workflow, **kwargs): 

147 """Submit a single HTCondor workflow. 

148 

149 Parameters 

150 ---------- 

151 workflow : `lsst.ctrl.bps.BaseWorkflow` 

152 A single HTCondor workflow to submit. run_id is updated after 

153 successful submission to WMS. 

154 **kwargs : `~typing.Any` 

155 Keyword arguments for the options. 

156 """ 

157 dag = workflow.dag 

158 

159 ver = version.parse(htc_version()) 

160 if ver >= version.parse("8.9.3"): 

161 sub = htc_create_submit_from_dag(dag.graph["dag_filename"], {}) 

162 else: 

163 sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], {}) 
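
# Assumption, not verified here: 8.9.3 is the HTCondor release where the

# Python bindings can build a DAGMan submit description directly (e.g. via

# htcondor.Submit.from_dag), so older versions fall back to generating it

# with the condor_submit_dag command instead.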

164 

165 # For workflow portability, internal paths are all relative. Hence 

166 # the DAG needs to be submitted to HTCondor from inside the submit 

167 # directory. 

168 with chdir(workflow.submit_path): 

169 _LOG.info("Submitting from directory: %s", os.getcwd()) 

170 schedd_dag_info = htc_submit_dag(sub) 

171 if schedd_dag_info: 

172 write_dag_info(f"{dag.name}.info.json", schedd_dag_info) 

173 

174 _, dag_info = schedd_dag_info.popitem() 

175 _, dag_ad = dag_info.popitem() 

176 

177 dag.run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}" 

178 workflow.run_id = dag.run_id 

179 else: 

180 raise RuntimeError("Submission failed: unable to retrieve DAGMan job information") 

181 

182 def restart(self, wms_workflow_id): 

183 """Restart a failed DAGMan workflow. 

184 

185 Parameters 

186 ---------- 

187 wms_workflow_id : `str` 

188 The directory with HTCondor files. 

189 

190 Returns 

191 ------- 

192 run_id : `str` 

193 HTCondor id of the restarted DAGMan job. If restart failed, it will 

194 be set to None. 

195 run_name : `str` 

196 Name of the restarted workflow. If restart failed, it will be set 

197 to None. 

198 message : `str` 

199 A message describing any issues encountered during the restart. 

200 If there were no issues, an empty string is returned. 

201 """ 

202 wms_path, id_type = _wms_id_to_dir(wms_workflow_id) 

203 if wms_path is None: 

204 return ( 

205 None, 

206 None, 

207 ( 

208 f"workflow with run id '{wms_workflow_id}' not found. " 

209 f"Hint: use run's submit directory as the id instead" 

210 ), 

211 ) 

212 

213 if id_type in {WmsIdType.GLOBAL, WmsIdType.LOCAL}: 

214 if not wms_path.is_dir(): 

215 return None, None, f"submit directory '{wms_path}' for run id '{wms_workflow_id}' not found." 

216 

217 _LOG.info("Restarting workflow from directory '%s'", wms_path) 

218 rescue_dags = list(wms_path.glob("*.dag.rescue*")) 

219 if not rescue_dags: 

220 return None, None, f"HTCondor rescue DAG(s) not found in '{wms_path}'" 

221 

222 _LOG.info("Verifying that the workflow is not already in the job queue") 

223 schedd_dag_info = condor_q(constraint=f'regexp("dagman$", Cmd) && Iwd == "{wms_path}"') 

224 if schedd_dag_info: 

225 _, dag_info = schedd_dag_info.popitem() 

226 _, dag_ad = dag_info.popitem() 

227 id_ = dag_ad["GlobalJobId"] 

228 return None, None, f"Workflow already in the job queue (global job id: '{id_}')" 

229 

230 _LOG.info("Checking execution status of the workflow") 

231 warn = False 

232 dag_ad = read_dag_status(str(wms_path)) 

233 if dag_ad: 

234 nodes_total = dag_ad.get("NodesTotal", 0) 

235 if nodes_total != 0: 

236 nodes_done = dag_ad.get("NodesDone", 0) 

237 if nodes_total == nodes_done: 

238 return None, None, "All jobs in the workflow finished successfully" 

239 else: 

240 warn = True 

241 else: 

242 warn = True 

243 if warn: 

244 _LOG.warning( 

245 "Cannot determine the execution status of the workflow, continuing with restart regardless" 

246 ) 

247 

248 _LOG.info("Backing up select HTCondor files from previous run attempt") 

249 htc_backup_files(wms_path, subdir="backups") 

250 

251 # For workflow portability, internal paths are all relative. Hence 

252 # the DAG needs to be resubmitted to HTCondor from inside the submit 

253 # directory. 

254 _LOG.info("Adding workflow to the job queue") 

255 run_id, run_name, message = None, None, "" 

256 with chdir(wms_path): 

257 try: 

258 dag_path = next(wms_path.glob("*.dag.condor.sub")) 

259 except StopIteration: 

260 message = f"DAGMan submit description file not found in '{wms_path}'" 

261 else: 

262 sub = htc_create_submit_from_file(dag_path.name) 

263 schedd_dag_info = htc_submit_dag(sub) 

264 

265 # Save select information about the DAGMan job to a file. Use 

266 # the run name (available in the ClassAd) as the filename. 

267 if schedd_dag_info: 

268 dag_info = next(iter(schedd_dag_info.values())) 

269 dag_ad = next(iter(dag_info.values())) 

270 write_dag_info(f"{dag_ad['bps_run']}.info.json", schedd_dag_info) 

271 run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}" 

272 run_name = dag_ad["bps_run"] 

273 else: 

274 message = "DAGMan job information unavailable" 

275 

276 return run_id, run_name, message 

277 

278 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False): 

279 """Query WMS for list of submitted WMS workflows/jobs. 

280 

281 This should be a quick lookup function to create list of jobs for 

282 other functions. 

283 

284 Parameters 

285 ---------- 

286 wms_id : `int` or `str`, optional 

287 Id or path that can be used by WMS service to look up job. 

288 user : `str`, optional 

289 User whose submitted jobs should be listed. 

290 require_bps : `bool`, optional 

291 Whether to require jobs returned in list to be bps-submitted jobs. 

292 pass_thru : `str`, optional 

293 Information to pass through to WMS. 

294 is_global : `bool`, optional 

295 If set, all job queues (and their histories) will be queried for 

296 job information. Defaults to False which means that only the local 

297 job queue will be queried. 

298 

299 Returns 

300 ------- 

301 job_ids : `list` [`Any`] 

302 Only job ids to be used by cancel and other functions. Typically 

303 this means top-level jobs (i.e., not children jobs). 

304 """ 

305 _LOG.debug( 

306 "list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s", 

307 wms_id, 

308 user, 

309 require_bps, 

310 pass_thru, 

311 is_global, 

312 ) 

313 

314 # Determine which Schedds will be queried for job information. 

315 coll = htcondor.Collector() 

316 

317 schedd_ads = [] 

318 if is_global: 

319 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

320 else: 

321 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

322 

323 # Construct appropriate constraint expression using provided arguments. 

324 constraint = "False" 

325 if wms_id is None: 

326 if user is not None: 

327 constraint = f'(Owner == "{user}")' 

328 else: 

329 schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id) 

330 if cluster_id is not None: 

331 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})" 

332 

333 # If provided id is either a submission path or a global id, 

334 # make sure the right Schedd will be queried regardless of 

335 # 'is_global' value. 

336 if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}: 

337 schedd_ads = [schedd_ad] 

338 if require_bps: 

339 constraint += ' && (bps_isjob == "True")' 

340 if pass_thru: 

341 if "-forcex" in pass_thru: 

342 pass_thru_2 = pass_thru.replace("-forcex", "") 

343 if pass_thru_2 and not pass_thru_2.isspace(): 

344 constraint += f" && ({pass_thru_2})" 

345 else: 

346 constraint += f" && ({pass_thru})" 
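
# For illustration (id value is made up): querying local id 1234 with

# require_bps=True ends up with a constraint roughly like

#   (DAGManJobId == 1234 || ClusterId == 1234) && (bps_isjob == "True")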

347 

348 # Create a list of scheduler daemons which need to be queried. 

349 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

350 

351 _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds)) 

352 results = condor_q(constraint=constraint, schedds=schedds) 

353 

354 # Prune child jobs where DAG job is in queue (i.e., aren't orphans). 

355 job_ids = [] 

356 for schedd_name, job_info in results.items(): 

357 for job_id, job_ad in job_info.items(): 

358 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None")) 

359 if "DAGManJobId" not in job_ad: 

360 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

361 else: 

362 _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0") 

363 _LOG.debug("\tin jobs.keys() = %s", job_info.keys()) 

364 if f"{job_ad['DAGManJobId']}.0" not in job_info: # orphaned job 

365 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

366 

367 _LOG.debug("job_ids = %s", job_ids) 

368 return job_ids 

369 

370 def report( 

371 self, 

372 wms_workflow_id=None, 

373 user=None, 

374 hist=0, 

375 pass_thru=None, 

376 is_global=False, 

377 return_exit_codes=False, 

378 ): 

379 """Return run information based upon given constraints. 

380 

381 Parameters 

382 ---------- 

383 wms_workflow_id : `str`, optional 

384 Limit to specific run based on id. 

385 user : `str`, optional 

386 Limit results to runs for this user. 

387 hist : `float`, optional 

388 Limit history search to this many days. Defaults to 0. 

389 pass_thru : `str`, optional 

390 Constraints to pass through to HTCondor. 

391 is_global : `bool`, optional 

392 If set, all job queues (and their histories) will be queried for 

393 job information. Defaults to False which means that only the local 

394 job queue will be queried. 

395 return_exit_codes : `bool`, optional 

396 If set, return exit codes related to jobs with a 

397 non-success status. Defaults to False, which means that only 

398 the summary state is returned. 

399 

400 Only applicable in the context of a WMS with associated 

401 handlers to return exit codes from jobs. 

402 

403 Returns 

404 ------- 

405 runs : `list` [`lsst.ctrl.bps.WmsRunReport`] 

406 Information about runs from given job information. 

407 message : `str` 

408 Extra message for report command to print. This could be pointers 

409 to documentation or to WMS specific commands. 

410 """ 

411 if wms_workflow_id: 

412 id_type = _wms_id_type(wms_workflow_id) 

413 if id_type == WmsIdType.LOCAL: 

414 schedulers = _locate_schedds(locate_all=is_global) 

415 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

416 elif id_type == WmsIdType.GLOBAL: 

417 schedulers = _locate_schedds(locate_all=True) 

418 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

419 elif id_type == WmsIdType.PATH: 

420 run_reports, message = _report_from_path(wms_workflow_id) 

421 else: 

422 run_reports, message = {}, "Invalid job id" 

423 else: 

424 schedulers = _locate_schedds(locate_all=is_global) 

425 run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers) 

426 _LOG.debug("report: %s, %s", run_reports, message) 

427 

428 return list(run_reports.values()), message 

429 

430 def cancel(self, wms_id, pass_thru=None): 

431 """Cancel submitted workflows/jobs. 

432 

433 Parameters 

434 ---------- 

435 wms_id : `str` 

436 Id or path of job that should be canceled. 

437 pass_thru : `str`, optional 

438 Information to pass through to WMS. 

439 

440 Returns 

441 ------- 

442 deleted : `bool` 

443 Whether successful deletion or not. Currently, if any doubt or any 

444 individual jobs not deleted, return False. 

445 message : `str` 

446 Any message from WMS (e.g., error details). 

447 """ 

448 _LOG.debug("Canceling wms_id = %s", wms_id) 

449 

450 schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id) 

451 

452 if cluster_id is None: 

453 deleted = False 

454 message = "invalid id" 

455 else: 

456 _LOG.debug( 

457 "Canceling job managed by schedd_name = %s with cluster_id = %s", 

458 schedd_ad["Name"],

459 cluster_id,

460 ) 

461 schedd = htcondor.Schedd(schedd_ad) 

462 

463 constraint = f"ClusterId == {cluster_id}" 

464 if pass_thru is not None and "-forcex" in pass_thru: 

465 pass_thru_2 = pass_thru.replace("-forcex", "") 

466 if pass_thru_2 and not pass_thru_2.isspace(): 

467 constraint += f"&& ({pass_thru_2})" 

468 _LOG.debug("JobAction.RemoveX constraint = %s", constraint) 

469 results = schedd.act(htcondor.JobAction.RemoveX, constraint) 

470 else: 

471 if pass_thru: 

472 constraint += f"&& ({pass_thru})" 

473 _LOG.debug("JobAction.Remove constraint = %s", constraint) 

474 results = schedd.act(htcondor.JobAction.Remove, constraint) 

475 _LOG.debug("Remove results: %s", results) 

476 

477 if results["TotalSuccess"] > 0 and results["TotalError"] == 0: 

478 deleted = True 

479 message = "" 

480 else: 

481 deleted = False 

482 if results["TotalSuccess"] == 0 and results["TotalError"] == 0: 

483 message = "no such bps job in batch queue" 

484 else: 

485 message = f"unknown problems deleting: {results}" 

486 

487 _LOG.debug("deleted: %s; message = %s", deleted, message) 

488 return deleted, message 

489 

490 

491class HTCondorWorkflow(BaseWmsWorkflow): 

492 """Single HTCondor workflow. 

493 

494 Parameters 

495 ---------- 

496 name : `str` 

497 Unique name for Workflow used when naming files. 

498 config : `lsst.ctrl.bps.BpsConfig` 

499 BPS configuration that includes necessary submit/runtime information. 

500 """ 

501 

502 def __init__(self, name, config=None): 

503 super().__init__(name, config) 

504 self.dag = None 

505 

506 @classmethod 

507 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class): 

508 # Docstring inherited 

509 htc_workflow = cls(generic_workflow.name, config) 

510 htc_workflow.dag = HTCDag(name=generic_workflow.name) 

511 

512 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs) 

513 htc_workflow.dag.add_attribs(generic_workflow.run_attrs) 

514 htc_workflow.dag.add_attribs( 

515 { 

516 "bps_wms_service": service_class, 

517 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}", 

518 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts), 

519 "bps_job_summary": create_count_summary(generic_workflow.job_counts), 

520 } 

521 ) 

522 

523 _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""}) 

524 if isinstance(tmp_template, str): 

525 subdir_template = defaultdict(lambda: tmp_template) 

526 else: 

527 subdir_template = tmp_template 
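
# Sketch of the two accepted forms of 'subDirTemplate' (values below are

# illustrative, not from any shipped config): a single string such as

# "{label}/{tract}" that is applied to every job label via the defaultdict

# above, or a per-label mapping which is then used directly as the lookup

# table indexed by gwjob.label below.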

528 

529 # Create all DAG jobs 

530 site_values = {} # cache compute site specific values to reduce config lookups 

531 for job_name in generic_workflow: 

532 gwjob = generic_workflow.get_job(job_name) 

533 if gwjob.compute_site not in site_values: 

534 site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site) 

535 htc_job = _create_job( 

536 subdir_template[gwjob.label], 

537 site_values[gwjob.compute_site], 

538 generic_workflow, 

539 gwjob, 

540 out_prefix, 

541 ) 

542 htc_workflow.dag.add_job(htc_job) 

543 

544 # Add job dependencies to the DAG 

545 for job_name in generic_workflow: 

546 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name)) 

547 

548 # If final job exists in generic workflow, create DAG final job 

549 final = generic_workflow.get_final() 

550 if final and isinstance(final, GenericWorkflowJob): 

551 if final.compute_site and final.compute_site not in site_values: 

552 site_values[final.compute_site] = _gather_site_values(config, final.compute_site) 

553 final_htjob = _create_job( 

554 subdir_template[final.label], 

555 site_values[final.compute_site], 

556 generic_workflow, 

557 final, 

558 out_prefix, 

559 ) 

560 if "post" not in final_htjob.dagcmds: 

561 final_htjob.dagcmds["post"] = ( 

562 f"{os.path.dirname(__file__)}/final_post.sh {final.name} $DAG_STATUS $RETURN" 

563 ) 

564 htc_workflow.dag.add_final_job(final_htjob) 

565 elif final and isinstance(final, GenericWorkflow): 

566 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job") 

567 elif final: 

568 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

569 

570 return htc_workflow 

571 

572 def write(self, out_prefix): 

573 """Output HTCondor DAGMan files needed for workflow submission. 

574 

575 Parameters 

576 ---------- 

577 out_prefix : `str` 

578 Directory prefix for HTCondor files. 

579 """ 

580 self.submit_path = out_prefix 

581 os.makedirs(out_prefix, exist_ok=True) 

582 

583 # Write down the workflow in HTCondor format. 

584 self.dag.write(out_prefix, "jobs/{self.label}") 

585 

586 

587def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix): 

588 """Convert GenericWorkflow job nodes to DAG jobs. 

589 

590 Parameters 

591 ---------- 

592 subdir_template : `str` 

593 Template for making subdirs. 

594 site_values : `dict` 

595 Site specific values 

596 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

597 Generic workflow that is being converted. 

598 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

599 The generic job to convert to a HTCondor job. 

600 out_prefix : `str` 

601 Directory prefix for HTCondor files. 

602 

603 Returns 

604 ------- 

605 htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob` 

606 The HTCondor job equivalent to the given generic job. 

607 """ 

608 htc_job = HTCJob(gwjob.name, label=gwjob.label) 

609 

610 curvals = defaultdict(str) 

611 curvals["label"] = gwjob.label 

612 if gwjob.tags: 

613 curvals.update(gwjob.tags) 

614 

615 subdir = subdir_template.format_map(curvals) 

616 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub" 
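
# Illustration (template and tag values are assumed): with a template of

# "{label}/{tract}" and curvals {"label": "calibrate", "tract": "9813"},

# the submit file becomes jobs/calibrate/9813/<gwjob.name>.sub; because

# curvals is a defaultdict(str), placeholders without a matching tag

# render as empty strings.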

617 

618 htc_job_cmds = { 

619 "universe": "vanilla", 

620 "should_transfer_files": "YES", 

621 "when_to_transfer_output": "ON_EXIT_OR_EVICT", 

622 "transfer_output_files": '""', # Set to empty string to disable 

623 "transfer_executable": "False", 

624 "getenv": "True", 

625 # Exceeding memory sometimes triggers SIGBUS or SIGSEGV errors. Tell

626 # HTCondor to put on hold any job which exited via a signal.

627 "on_exit_hold": "ExitBySignal == true", 

628 "on_exit_hold_reason": 'strcat("Job raised a signal ", string(ExitSignal), ". ", ' 

629 '"Handling signal as if job has gone over memory limit.")', 

630 "on_exit_hold_subcode": "34", 

631 } 

632 

633 htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob)) 

634 

635 # job stdout, stderr, htcondor user log. 

636 for key in ("output", "error", "log"): 

637 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}") 

638 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key]) 

639 

640 htc_job_cmds.update( 

641 _handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], out_prefix) 

642 ) 

643 

644 # Add the job cmds dict to the job object. 

645 htc_job.add_job_cmds(htc_job_cmds) 

646 

647 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob)) 

648 

649 # Add job attributes to job. 

650 _LOG.debug("gwjob.attrs = %s", gwjob.attrs) 

651 htc_job.add_job_attrs(gwjob.attrs) 

652 htc_job.add_job_attrs(site_values["attrs"]) 

653 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)}) 

654 htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label}) 

655 

656 return htc_job 

657 

658 

659def _translate_job_cmds(cached_vals, generic_workflow, gwjob): 

660 """Translate the job data that are one to one mapping 

661 

662 Parameters 

663 ---------- 

664 cached_vals : `dict` [`str`, `Any`] 

665 Config values common to jobs with same label. 

666 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

667 Generic workflow that contains the job being converted.

668 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

669 Generic workflow job to be converted. 

670 

671 Returns 

672 ------- 

673 htc_job_commands : `dict` [`str`, `Any`] 

674 Contains commands which can appear in the HTCondor submit description 

675 file. 

676 """ 

677 # Values in the job script that just are name mappings. 

678 job_translation = { 

679 "mail_to": "notify_user", 

680 "when_to_mail": "notification", 

681 "request_cpus": "request_cpus", 

682 "priority": "priority", 

683 "category": "category", 

684 "accounting_group": "accounting_group", 

685 "accounting_user": "accounting_group_user", 

686 } 

687 

688 jobcmds = {} 

689 for gwkey, htckey in job_translation.items(): 

690 jobcmds[htckey] = getattr(gwjob, gwkey, None) 

691 

692 # If accounting info was not set explicitly, use site settings if any. 

693 if not gwjob.accounting_group: 

694 jobcmds["accounting_group"] = cached_vals.get("accountingGroup") 

695 if not gwjob.accounting_user: 

696 jobcmds["accounting_group_user"] = cached_vals.get("accountingUser") 

697 

698 # job commands that need modification 

699 if gwjob.number_of_retries: 

700 jobcmds["max_retries"] = f"{gwjob.number_of_retries}" 

701 

702 if gwjob.retry_unless_exit: 

703 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}" 

704 

705 if gwjob.request_disk: 

706 jobcmds["request_disk"] = f"{gwjob.request_disk}MB" 

707 

708 if gwjob.request_memory: 

709 jobcmds["request_memory"] = f"{gwjob.request_memory}" 

710 

711 if gwjob.memory_multiplier: 

712 # Do not use try-except! At the moment, BpsConfig returns an empty 

713 # string if it does not contain the key. 

714 memory_limit = cached_vals["memoryLimit"] 

715 if not memory_limit: 

716 raise RuntimeError( 

717 "Memory autoscaling enabled, but automatic detection of the memory limit " 

718 "failed; setting it explicitly with 'memoryLimit' or changing worker node " 

719 "search pattern 'executeMachinesPattern' might help." 

720 ) 

721 

722 # Set maximal amount of memory job can ask for. 

723 # 

724 # The check below assumes that 'memory_limit' was set to a value which 

725 # realistically reflects actual physical limitations of a given compute 

726 # resource. 

727 memory_max = memory_limit 

728 if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit: 

729 memory_max = gwjob.request_memory_max 

730 

731 # Make job ask for more memory each time it failed due to insufficient 

732 # memory requirements. 

733 jobcmds["request_memory"] = _create_request_memory_expr( 

734 gwjob.request_memory, gwjob.memory_multiplier, memory_max 

735 ) 

736 

737 # Periodically release jobs which are being held due to exceeding 

738 # memory. Stop doing that (by removing the job from the HTCondor queue) 

739 # after the maximal number of retries has been reached or the job has

740 # already run at the maximal allowed memory.

741 jobcmds["periodic_release"] = _create_periodic_release_expr( 

742 gwjob.request_memory, gwjob.memory_multiplier, memory_max 

743 ) 

744 jobcmds["periodic_remove"] = _create_periodic_remove_expr( 

745 gwjob.request_memory, gwjob.memory_multiplier, memory_max 

746 ) 
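
# Rough sketch of the intended behaviour (the actual ClassAd expressions are

# produced by the _create_*_expr helpers, which are not shown here): a job

# held for exceeding memory is periodically released and asks for more memory

# (scaled by memory_multiplier) on each restart, capped at memory_max; once

# the cap or the retry limit is reached, periodic_remove takes the job out of

# the queue.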

747 

748 # Assume concurrency_limit implemented using HTCondor concurrency limits. 

749 # May need to move to special site-specific implementation if sites use 

750 # other mechanisms. 

751 if gwjob.concurrency_limit: 

752 jobcmds["concurrency_limit"] = gwjob.concurrency_limit 

753 

754 # Handle command line 

755 if gwjob.executable.transfer_executable: 

756 jobcmds["transfer_executable"] = "True" 

757 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri) 

758 else: 

759 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri) 

760 

761 if gwjob.arguments: 

762 arguments = gwjob.arguments 

763 arguments = _replace_cmd_vars(arguments, gwjob) 

764 arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob) 

765 arguments = _fix_env_var_syntax(arguments) 

766 jobcmds["arguments"] = arguments 

767 

768 # Add extra "pass-thru" job commands 

769 if gwjob.profile: 

770 for key, val in gwjob.profile.items(): 

771 jobcmds[key] = htc_escape(val) 

772 for key, val in cached_vals["profile"].items(): 

773 jobcmds[key] = htc_escape(val) 

774 

775 return jobcmds 

776 

777 

778def _translate_dag_cmds(gwjob): 

779 """Translate job values into DAGMan commands. 

780 

781 Parameters 

782 ---------- 

783 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

784 Job containing values to be translated. 

785 

786 Returns 

787 ------- 

788 dagcmds : `dict` [`str`, `Any`] 

789 DAGMan commands for the job. 

790 """ 

791 # Values in the dag script that just are name mappings. 

792 dag_translation = {"abort_on_value": "abort_dag_on", "abort_return_value": "abort_exit"} 

793 

794 dagcmds = {} 

795 for gwkey, htckey in dag_translation.items(): 

796 dagcmds[htckey] = getattr(gwjob, gwkey, None) 

797 

798 # Still to be coded: vars "pre_cmdline", "post_cmdline" 

799 return dagcmds 

800 

801 

802def _fix_env_var_syntax(oldstr): 

803 """Change ENV place holders to HTCondor Env var syntax. 

804 

805 Parameters 

806 ---------- 

807 oldstr : `str` 

808 String in which environment variable syntax is to be fixed. 

809 

810 Returns 

811 ------- 

812 newstr : `str` 

813 Given string with environment variable syntax fixed. 

814 """ 

815 newstr = oldstr 

816 for key in re.findall(r"<ENV:([^>]+)>", oldstr): 

817 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})") 

818 return newstr 
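
# Example of the rewrite performed above (value is illustrative):

#   _fix_env_var_syntax("<ENV:HOME>/bin/run.sh") -> "$ENV(HOME)/bin/run.sh"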

819 

820 

821def _replace_file_vars(use_shared, arguments, workflow, gwjob): 

822 """Replace file placeholders in command line arguments with correct 

823 physical file names. 

824 

825 Parameters 

826 ---------- 

827 use_shared : `bool` 

828 Whether HTCondor can assume shared filesystem. 

829 arguments : `str` 

830 Arguments string in which to replace file placeholders. 

831 workflow : `lsst.ctrl.bps.GenericWorkflow` 

832 Generic workflow that contains file information. 

833 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

834 The job corresponding to the arguments. 

835 

836 Returns 

837 ------- 

838 arguments : `str` 

839 Given arguments string with file placeholders replaced. 

840 """ 

841 # Replace input file placeholders with paths. 

842 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False): 

843 if not gwfile.wms_transfer: 

844 # Must assume full URI if in command line and told WMS is not 

845 # responsible for transferring file. 

846 uri = gwfile.src_uri 

847 elif use_shared: 

848 if gwfile.job_shared: 

849 # Have shared filesystems and jobs can share file. 

850 uri = gwfile.src_uri 

851 else: 

852 # Taking advantage of inside knowledge. Not future-proof.

853 # Temporary fix until there is a job wrapper that pulls files

854 # within the job.

855 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml": 

856 uri = "butler.yaml" 

857 else: 

858 uri = os.path.basename(gwfile.src_uri) 

859 else: # Using push transfer 

860 uri = os.path.basename(gwfile.src_uri) 

861 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

862 

863 # Replace output file placeholders with paths. 

864 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False): 

865 if not gwfile.wms_transfer: 

866 # Must assume full URI if in command line and told WMS is not 

867 # responsible for transferring file. 

868 uri = gwfile.src_uri 

869 elif use_shared: 

870 if gwfile.job_shared: 

871 # Have shared filesystems and jobs can share file. 

872 uri = gwfile.src_uri 

873 else: 

874 uri = os.path.basename(gwfile.src_uri) 

875 else: # Using push transfer 

876 uri = os.path.basename(gwfile.src_uri) 

877 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

878 return arguments 

879 

880 

881def _replace_cmd_vars(arguments, gwjob): 

882 """Replace format-style placeholders in arguments. 

883 

884 Parameters 

885 ---------- 

886 arguments : `str` 

887 Arguments string in which to replace placeholders. 

888 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

889 Job containing values to be used to replace placeholders 

890 (in particular gwjob.cmdvals). 

891 

892 Returns 

893 ------- 

894 arguments : `str` 

895 Given arguments string with placeholders replaced. 

896 """ 

897 try: 

898 arguments = arguments.format(**gwjob.cmdvals) 

899 except (KeyError, TypeError): # TypeError in case None instead of {} 

900 _LOG.error( 

901 "Could not replace command variables:\narguments: %s\ncmdvals: %s", arguments, gwjob.cmdvals 

902 ) 

903 raise 

904 return arguments 
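
# Illustration only (the placeholder name is hypothetical): an arguments

# string such as "run -q {qgraphFile}" combined with

# gwjob.cmdvals == {"qgraphFile": "a.qgraph"} yields "run -q a.qgraph";

# a missing key is logged above and then re-raised as KeyError.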

905 

906 

907def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str): 

908 """Add job input files from generic workflow to job. 

909 

910 Parameters 

911 ---------- 

912 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

913 The generic workflow (e.g., has executable name and arguments). 

914 job_name : `str` 

915 Unique name for the job. 

916 use_shared : `bool` 

917 Whether job has access to files via shared filesystem. 

918 out_prefix : `str` 

919 The root directory into which all WMS-specific files are written. 

920 

921 Returns 

922 ------- 

923 htc_commands : `dict` [`str`, `str`] 

924 HTCondor commands for the job submission script. 

925 """ 

926 htc_commands = {} 

927 inputs = [] 

928 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True): 

929 _LOG.debug("src_uri=%s", gwf_file.src_uri) 

930 

931 uri = Path(gwf_file.src_uri) 

932 

933 # Note if use_shared and job_shared, don't need to transfer file. 

934 

935 if not use_shared: # Copy file using push to job 

936 inputs.append(str(uri.relative_to(out_prefix))) 

937 elif not gwf_file.job_shared: # Jobs require own copy 

938 # if using shared filesystem, but still need copy in job. Use 

939 # HTCondor's curl plugin for a local copy. 

940 

941 # Execution butler is represented as a directory which the 

942 # curl plugin does not handle. Taking advantage of inside

943 # knowledge as a temporary fix until there is a job wrapper that

944 # pulls files within the job.

945 if gwf_file.name == "butlerConfig": 

946 # The execution butler directory doesn't normally exist until 

947 # the submit phase, so check for the suffix instead of using

948 # is_dir(). If another non-YAML file existed, it would have a

949 # different gwf_file.name.

950 if uri.suffix == ".yaml": # Single file, so just copy. 

951 inputs.append(f"file://{uri}") 

952 else: 

953 inputs.append(f"file://{uri / 'butler.yaml'}") 

954 inputs.append(f"file://{uri / 'gen3.sqlite3'}") 

955 elif uri.is_dir(): 

956 raise RuntimeError( 

957 f"HTCondor plugin cannot transfer directories locally within job {gwf_file.src_uri}" 

958 ) 

959 else: 

960 inputs.append(f"file://{uri}") 

961 

962 if inputs: 

963 htc_commands["transfer_input_files"] = ",".join(inputs) 

964 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"]) 

965 return htc_commands 
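
# Illustrative return values (paths are hypothetical): for a push-transfer

# job this could be {"transfer_input_files": "butler.yaml,inputs/file1.txt"},

# while the shared-filesystem butlerConfig case would instead contain

# "file://" URLs pointing at butler.yaml and gen3.sqlite3.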

966 

967 

968def _report_from_path(wms_path): 

969 """Gather run information from a given run directory. 

970 

971 Parameters 

972 ---------- 

973 wms_path : `str` 

974 The directory containing the submit side files (e.g., HTCondor files). 

975 

976 Returns 

977 ------- 

978 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

979 Run information for the detailed report. The key is the HTCondor id 

980 and the value is a collection of report information for that run. 

981 message : `str` 

982 Message to be printed with the summary report. 

983 """ 

984 wms_workflow_id, jobs, message = _get_info_from_path(wms_path) 

985 if wms_workflow_id == MISSING_ID: 

986 run_reports = {} 

987 else: 

988 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs) 

989 return run_reports, message 

990 

991 

992def _report_from_id(wms_workflow_id, hist, schedds=None): 

993 """Gather run information using workflow id. 

994 

995 Parameters 

996 ---------- 

997 wms_workflow_id : `str` 

998 Limit to specific run based on id. 

999 hist : `float` 

1000 Limit history search to this many days. 

1001 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional 

1002 HTCondor schedulers which to query for job information. If None 

1003 (default), all queries will be run against the local scheduler only. 

1004 

1005 Returns 

1006 ------- 

1007 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1008 Run information for the detailed report. The key is the HTCondor id 

1009 and the value is a collection of report information for that run. 

1010 message : `str` 

1011 Message to be printed with the summary report. 

1012 """ 

1013 messages = [] 

1014 

1015 # Collect information about the job by querying HTCondor schedd and 

1016 # HTCondor history. 

1017 schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds) 

1018 if len(schedd_dag_info) == 1: 

1019 # Extract the DAG info without altering the results of the query. 

1020 schedd_name = next(iter(schedd_dag_info)) 

1021 dag_id = next(iter(schedd_dag_info[schedd_name])) 

1022 dag_ad = schedd_dag_info[schedd_name][dag_id] 

1023 

1024 # If the provided workflow id does not correspond to the one extracted 

1025 # from the DAGMan log file in the submit directory, rerun the query 

1026 # with the id found in the file. 

1027 # 

1028 # This is to cover the situation in which the user provided the old job 

1029 # id of a restarted run. 

1030 try: 

1031 path_dag_id, _ = read_dag_log(dag_ad["Iwd"]) 

1032 except FileNotFoundError as exc: 

1033 # At the moment missing DAGMan log is pretty much a fatal error. 

1034 # So empty the DAG info to finish early (see the if statement 

1035 # below). 

1036 schedd_dag_info.clear()

1037 messages.append(f"Cannot create the report for '{dag_id}': {exc}") 

1038 else: 

1039 if path_dag_id != dag_id: 

1040 schedd_dag_info = _get_info_from_schedd(path_dag_id, hist, schedds) 

1041 messages.append( 

1042 f"WARNING: Found newer workflow executions in same submit directory as id '{dag_id}'. " 

1043 "This normally occurs when a run is restarted. The report shown is for the most " 

1044 f"recent status with run id '{path_dag_id}'" 

1045 ) 

1046 

1047 if len(schedd_dag_info) == 0: 

1048 run_reports = {} 

1049 elif len(schedd_dag_info) == 1: 

1050 _, dag_info = schedd_dag_info.popitem() 

1051 dag_id, dag_ad = dag_info.popitem() 

1052 

1053 # Create a mapping between jobs and their classads. The keys will 

1054 # be of format 'ClusterId.ProcId'. 

1055 job_info = {dag_id: dag_ad} 

1056 

1057 # Find jobs (nodes) belonging to that DAGMan job. 

1058 job_constraint = f"DAGManJobId == {int(float(dag_id))}" 

1059 schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds) 

1060 if schedd_job_info: 

1061 _, node_info = schedd_job_info.popitem() 

1062 job_info.update(node_info) 

1063 

1064 # Collect additional pieces of information about jobs using HTCondor 

1065 # files in the submission directory. 

1066 _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"]) 

1067 _update_jobs(job_info, path_jobs) 

1068 if message: 

1069 messages.append(message) 

1070 run_reports = _create_detailed_report_from_jobs(dag_id, job_info) 

1071 else: 

1072 ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()] 

1073 message = ( 

1074 f"More than one job matches id '{wms_workflow_id}', " 

1075 f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids" 

1076 ) 

1077 messages.append(message) 

1078 run_reports = {} 

1079 

1080 message = "\n".join(messages) 

1081 return run_reports, message 

1082 

1083 

1084def _get_info_from_schedd(wms_workflow_id, hist, schedds): 

1085 """Gather run information from HTCondor. 

1086 

1087 Parameters 

1088 ---------- 

1089 wms_workflow_id : `str` 

1090 Limit to specific run based on id. 

1091 hist : `int` 

1092 Limit history search to this many days. 

1093 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional 

1094 HTCondor schedulers which to query for job information. If None 

1095 (default), all queries will be run against the local scheduler only. 

1096 

1097 Returns 

1098 ------- 

1099 schedd_dag_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

1100 Information about jobs satisfying the search criteria where for each 

1101 Scheduler, local HTCondor job ids are mapped to their respective 

1102 classads. 

1103 """ 

1104 dag_constraint = 'regexp("dagman$", Cmd)' 

1105 try: 

1106 cluster_id = int(float(wms_workflow_id)) 

1107 except ValueError: 

1108 dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"' 

1109 else: 

1110 dag_constraint += f" && ClusterId == {cluster_id}" 
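
# For example (ids are illustrative): a local id like "1234.0" yields

#   regexp("dagman$", Cmd) && ClusterId == 1234

# whereas a global id, which cannot be parsed as a number, yields

#   regexp("dagman$", Cmd) && GlobalJobId == "<the given id>"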

1111 

1112 # With the current implementation of the condor_* functions the query 

1113 # will always return only one match per Scheduler. 

1114 # 

1115 # Even in the highly unlikely situation where HTCondor history (which 

1116 # condor_search queries too) is long enough to have jobs from before 

1117 # the cluster ids were rolled over (and as a result there is more then 

1118 # one job with the same cluster id) they will not show up in 

1119 # the results. 

1120 schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds) 

1121 return schedd_dag_info 

1122 

1123 

1124def _get_info_from_path(wms_path): 

1125 """Gather run information from a given run directory. 

1126 

1127 Parameters 

1128 ---------- 

1129 wms_path : `str` 

1130 Directory containing HTCondor files. 

1131 

1132 Returns 

1133 ------- 

1134 wms_workflow_id : `str` 

1135 The run id, which is a DAGMan job id.

1136 jobs : `dict` [`str`, `dict` [`str`, `Any`]] 

1137 Information about jobs read from files in the given directory. 

1138 The key is the HTCondor id and the value is a dictionary of HTCondor 

1139 keys and values. 

1140 message : `str` 

1141 Message to be printed with the summary report. 

1142 """ 

1143 messages = [] 

1144 try: 

1145 wms_workflow_id, jobs = read_dag_log(wms_path) 

1146 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs) 

1147 _update_jobs(jobs, read_node_status(wms_path)) 

1148 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs) 

1149 

1150 # Add more info for DAGman job 

1151 job = jobs[wms_workflow_id] 

1152 job.update(read_dag_status(wms_path)) 

1153 

1154 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs) 

1155 if "bps_run" not in job: 

1156 _add_run_info(wms_path, job) 

1157 

1158 message = htc_check_dagman_output(wms_path) 

1159 if message: 

1160 messages.append(message) 

1161 _LOG.debug( 

1162 "_get_info: id = %s, total_jobs = %s", wms_workflow_id, jobs[wms_workflow_id]["total_jobs"] 

1163 ) 

1164 

1165 # Add extra pieces of information which cannot be found in HTCondor 

1166 # generated files like 'GlobalJobId'. 

1167 # 

1168 # Do not treat absence of this file as a serious error. Neither runs 

1169 # submitted with earlier versions of the plugin nor the runs submitted 

1170 # with Pegasus plugin will have it at the moment. However, once enough 

1171 # time passes and Pegasus plugin will have its own report() method 

1172 # (instead of sneakily using HTCondor's one), the lack of that file 

1173 # should be treated as seriously as lack of any other file. 

1174 try: 

1175 job_info = read_dag_info(wms_path) 

1176 except FileNotFoundError as exc: 

1177 message = f"Warn: Some information may not be available: {exc}" 

1178 messages.append(message) 

1179 else: 

1180 schedd_name = next(iter(job_info)) 

1181 job_ad = next(iter(job_info[schedd_name].values())) 

1182 job.update(job_ad) 

1183 except FileNotFoundError: 

1184 message = f"Could not find HTCondor files in '{wms_path}'" 

1185 _LOG.warning(message) 

1186 messages.append(message) 

1187 wms_workflow_id = MISSING_ID 

1188 jobs = {} 

1189 

1190 message = "\n".join([msg for msg in messages if msg]) 

1191 return wms_workflow_id, jobs, message 

1192 

1193 

1194def _create_detailed_report_from_jobs(wms_workflow_id, jobs): 

1195 """Gather run information to be used in generating summary reports. 

1196 

1197 Parameters 

1198 ---------- 

1199 wms_workflow_id : `str` 

1200 The run id to create the report for. 

1201 jobs : `dict` [`str`, `dict` [`str`, Any]] 

1202 Mapping HTCondor job id to job information. 

1203 

1204 Returns 

1205 ------- 

1206 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1207 Run information for the detailed report. The key is the given HTCondor 

1208 id and the value is a collection of report information for that run. 

1209 """ 

1210 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id]) 

1211 dag_job = jobs.pop(wms_workflow_id) 

1212 report = WmsRunReport( 

1213 wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}", 

1214 global_wms_id=dag_job.get("GlobalJobId", "MISS"), 

1215 path=dag_job["Iwd"], 

1216 label=dag_job.get("bps_job_label", "MISS"), 

1217 run=dag_job.get("bps_run", "MISS"), 

1218 project=dag_job.get("bps_project", "MISS"), 

1219 campaign=dag_job.get("bps_campaign", "MISS"), 

1220 payload=dag_job.get("bps_payload", "MISS"), 

1221 operator=_get_owner(dag_job), 

1222 run_summary=_get_run_summary(dag_job), 

1223 state=_htc_status_to_wms_state(dag_job), 

1224 jobs=[], 

1225 total_number_jobs=dag_job["total_jobs"], 

1226 job_state_counts=dag_job["state_counts"], 

1227 exit_code_summary=_get_exit_code_summary(jobs), 

1228 ) 

1229 

1230 for job_id, job_info in jobs.items(): 

1231 try: 

1232 job_report = WmsJobReport( 

1233 wms_id=job_id, 

1234 name=job_info.get("DAGNodeName", job_id), 

1235 label=job_info.get("bps_job_label", pegasus_name_to_label(job_info["DAGNodeName"])), 

1236 state=_htc_status_to_wms_state(job_info), 

1237 ) 

1238 if job_report.label == "init": 

1239 job_report.label = "pipetaskInit" 

1240 report.jobs.append(job_report) 

1241 except KeyError as ex: 

1242 _LOG.error("Job missing key '%s': %s", str(ex), job_info) 

1243 raise 

1244 

1245 # Add the removed entry to restore the original content of the dictionary. 

1246 # The ordering of keys will change permanently, though.

1247 jobs.update({wms_workflow_id: dag_job}) 

1248 

1249 run_reports = {report.wms_id: report} 

1250 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports) 

1251 return run_reports 

1252 

1253 

1254def _summary_report(user, hist, pass_thru, schedds=None): 

1255 """Gather run information to be used in generating summary reports. 

1256 

1257 Parameters 

1258 ---------- 

1259 user : `str` 

1260 Run lookup restricted to given user. 

1261 hist : `float` 

1262 How many previous days to search for run information. 

1263 pass_thru : `str` 

1264 Advanced users can define the HTCondor constraint to be used 

1265 when searching queue and history.

 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional

 HTCondor schedulers which to query for job information. If None

 (default), all queries will be run against the local scheduler only.

1266 

1267 Returns 

1268 ------- 

1269 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1270 Run information for the summary report. The keys are HTCondor ids and 

1271 the values are collections of report information for each run. 

1272 message : `str` 

1273 Message to be printed with the summary report. 

1274 """ 

1275 # only doing summary report so only look for dagman jobs 

1276 if pass_thru: 

1277 constraint = pass_thru 

1278 else: 

1279 # Notes: 

1280 # * bps_isjob == 'True' isn't getting set for DAG jobs that are 

1281 # manually restarted. 

1282 # * Any job with DAGManJobID isn't a DAG job 

1283 constraint = 'bps_isjob == "True" && JobUniverse == 7' 

1284 if user: 

1285 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")' 

1286 

1287 job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds) 

1288 

1289 # Have list of DAGMan jobs, need to get run_report info. 

1290 run_reports = {} 

1291 for jobs in job_info.values(): 

1292 for job_id, job in jobs.items(): 

1293 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1294 # If we didn't get it from the queue information (e.g., Kerberos bug),

1295 # try reading it from a file.

1296 if total_jobs == 0: 

1297 try: 

1298 job.update(read_dag_status(job["Iwd"])) 

1299 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1300 except StopIteration: 

1301 pass  # don't kill the report if HTCondor files can't be found

1302 

1303 if "bps_run" not in job: 

1304 _add_run_info(job["Iwd"], job) 

1305 report = WmsRunReport( 

1306 wms_id=job_id, 

1307 global_wms_id=job["GlobalJobId"], 

1308 path=job["Iwd"], 

1309 label=job.get("bps_job_label", "MISS"), 

1310 run=job.get("bps_run", "MISS"), 

1311 project=job.get("bps_project", "MISS"), 

1312 campaign=job.get("bps_campaign", "MISS"), 

1313 payload=job.get("bps_payload", "MISS"), 

1314 operator=_get_owner(job), 

1315 run_summary=_get_run_summary(job), 

1316 state=_htc_status_to_wms_state(job), 

1317 jobs=[], 

1318 total_number_jobs=total_jobs, 

1319 job_state_counts=state_counts, 

1320 ) 

1321 run_reports[report.global_wms_id] = report 

1322 

1323 return run_reports, "" 

1324 

1325 

1326def _add_run_info(wms_path, job): 

1327 """Find BPS run information elsewhere for runs without bps attributes. 

1328 

1329 Parameters 

1330 ---------- 

1331 wms_path : `str` 

1332 Path to submit files for the run. 

1333 job : `dict` [`str`, `Any`] 

1334 HTCondor dag job information. 

1335 

1336 Raises 

1337 ------ 

1338 StopIteration 

1339 If the file it is looking for cannot be found. Permission errors are

1340 caught and the job's run is marked with an error.

1341 """ 

1342 path = Path(wms_path) / "jobs" 

1343 try: 

1344 subfile = next(path.glob("**/*.sub")) 

1345 except (StopIteration, PermissionError): 

1346 job["bps_run"] = "Unavailable" 

1347 else: 

1348 _LOG.debug("_add_run_info: subfile = %s", subfile) 

1349 try: 

1350 with open(subfile, encoding="utf-8") as fh: 

1351 for line in fh: 

1352 if line.startswith("+bps_"): 

1353 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line) 

1354 if m: 

1355 _LOG.debug("Matching line: %s", line) 

1356 job[m.group(1)] = m.group(2).replace('"', "") 

1357 else: 

1358 _LOG.debug("Could not parse attribute: %s", line) 

1359 except PermissionError: 

1360 job["bps_run"] = "PermissionError" 

1361 _LOG.debug("After adding job = %s", job) 

1362 

1363 

1364def _get_owner(job): 

1365 """Get the owner of a dag job. 

1366 

1367 Parameters 

1368 ---------- 

1369 job : `dict` [`str`, `Any`] 

1370 HTCondor dag job information. 

1371 

1372 Returns 

1373 ------- 

1374 owner : `str` 

1375 Owner of the dag job. 

1376 """ 

1377 owner = job.get("bps_operator", None) 

1378 if not owner: 

1379 owner = job.get("Owner", None) 

1380 if not owner: 

1381 _LOG.warning("Could not get Owner from htcondor job: %s", job) 

1382 owner = "MISS" 

1383 return owner 

1384 

1385 

1386def _get_run_summary(job): 

1387 """Get the run summary for a job. 

1388 

1389 Parameters 

1390 ---------- 

1391 job : `dict` [`str`, `Any`] 

1392 HTCondor dag job information. 

1393 

1394 Returns 

1395 ------- 

1396 summary : `str` 

1397 Number of jobs per PipelineTask label in approximate pipeline order. 

1398 Format: <label>:<count>[;<label>:<count>]+ 

1399 """ 

1400 summary = job.get("bps_job_summary", job.get("bps_run_summary", None)) 

1401 if not summary: 

1402 summary, _ = summary_from_dag(job["Iwd"]) 

1403 if not summary: 

1404 _LOG.warning("Could not get run summary for htcondor job: %s", job) 

1405 _LOG.debug("_get_run_summary: summary=%s", summary) 

1406 

1407 # Workaround: sometimes 'init' is used instead of 'pipetaskInit'.

1408 summary = summary.replace("init:", "pipetaskInit:") 

1409 

1410 if "pegasus_version" in job and "pegasus" not in summary: 

1411 summary += ";pegasus:0" 

1412 

1413 return summary 

1414 

1415 

1416def _get_exit_code_summary(jobs): 

1417 """Get the exit code summary for a run. 

1418 

1419 Parameters 

1420 ---------- 

1421 jobs : `dict` [`str`, `dict` [`str`, Any]] 

1422 Mapping HTCondor job id to job information. 

1423 

1424 Returns 

1425 ------- 

1426 summary : `dict` [`str`, `list` [`int`]] 

1427 Jobs' exit codes per job label. 

1428 """ 

1429 summary = {} 

1430 for job_id, job_ad in jobs.items(): 

1431 job_label = job_ad["bps_job_label"] 

1432 summary.setdefault(job_label, []) 

1433 try: 

1434 exit_code = 0 

1435 job_status = job_ad["JobStatus"] 

1436 match job_status: 

1437 case JobStatus.COMPLETED | JobStatus.HELD: 

1438 exit_code = job_ad["ExitSignal"] if job_ad["ExitBySignal"] else job_ad["ExitCode"] 

1439 case ( 

1440 JobStatus.IDLE 

1441 | JobStatus.RUNNING 

1442 | JobStatus.REMOVED 

1443 | JobStatus.TRANSFERRING_OUTPUT 

1444 | JobStatus.SUSPENDED 

1445 ): 

1446 pass 

1447 case _: 

1448 _LOG.debug("Unknown 'JobStatus' value ('%d') in classad for job '%s'", job_status, job_id) 

1449 if exit_code != 0: 

1450 summary[job_label].append(exit_code) 

1451 except KeyError as ex: 

1452 _LOG.debug("Attribute '%s' not found in the classad for job '%s'", ex, job_id) 

1453 return summary 

1454 

1455 

1456def _get_state_counts_from_jobs(wms_workflow_id, jobs): 

1457 """Count number of jobs per WMS state. 

1458 

1459 Parameters 

1460 ---------- 

1461 wms_workflow_id : `str` 

1462 HTCondor job id. 

1463 jobs : `dict` [`str`, `Any`] 

1464 HTCondor dag job information. 

1465 

1466 Returns 

1467 ------- 

1468 total_count : `int` 

1469 Total number of dag nodes. 

1470 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1471 Keys are the different WMS states and values are counts of jobs 

1472 that are in that WMS state. 

1473 """ 

1474 state_counts = dict.fromkeys(WmsStates, 0) 

1475 

1476 for jid, jinfo in jobs.items(): 

1477 if jid != wms_workflow_id: 

1478 state_counts[_htc_status_to_wms_state(jinfo)] += 1 

1479 

1480 total_counted = sum(state_counts.values()) 

1481 if "NodesTotal" in jobs[wms_workflow_id]: 

1482 total_count = jobs[wms_workflow_id]["NodesTotal"] 

1483 else: 

1484 total_count = total_counted 

1485 

1486 state_counts[WmsStates.UNREADY] += total_count - total_counted 

1487 

1488 return total_count, state_counts 

1489 

1490 

1491def _get_state_counts_from_dag_job(job): 

1492 """Count number of jobs per WMS state. 

1493 

1494 Parameters 

1495 ---------- 

1496 job : `dict` [`str`, `Any`] 

1497 HTCondor dag job information. 

1498 

1499 Returns 

1500 ------- 

1501 total_count : `int` 

1502 Total number of dag nodes. 

1503 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1504 Keys are the different WMS states and values are counts of jobs 

1505 that are in that WMS state. 

1506 """ 

1507 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job)) 

1508 state_counts = dict.fromkeys(WmsStates, 0) 

1509 if "DAG_NodesReady" in job: 

1510 state_counts = { 

1511 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0), 

1512 WmsStates.READY: job.get("DAG_NodesReady", 0), 

1513 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1514 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0), 

1515 WmsStates.FAILED: job.get("DAG_NodesFailed", 0), 

1516 WmsStates.PRUNED: job.get("DAG_NodesFutile", 0), 

1517 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0), 

1518 } 

1519 total_jobs = job.get("DAG_NodesTotal") 

1520 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs) 

1521 elif "NodesFailed" in job: 

1522 state_counts = { 

1523 WmsStates.UNREADY: job.get("NodesUnready", 0), 

1524 WmsStates.READY: job.get("NodesReady", 0), 

1525 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1526 WmsStates.SUCCEEDED: job.get("NodesDone", 0), 

1527 WmsStates.FAILED: job.get("NodesFailed", 0), 

1528 WmsStates.PRUNED: job.get("NodesFutile", 0), 

1529 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0), 

1530 } 

1531 try: 

1532 total_jobs = job["NodesTotal"]

1533 except KeyError as ex: 

1534 _LOG.error("Job missing %s. job = %s", str(ex), job) 

1535 raise 

1536 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs) 

1537 else: 

1538 # With Kerberos job auth and the Kerberos bug, a warning would be printed

1539 # for every DAG, so log at debug level instead.

1540 _LOG.debug("Can't get job state counts %s", job["Iwd"]) 

1541 total_jobs = 0 

1542 

1543 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts) 

1544 return total_jobs, state_counts 

1545 
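# Illustrative sketch (not part of the module): the newer DAG_* classad
# attributes take precedence over the older Nodes* ones. All values below
# are made up.
#
#     dag_job = {
#         "DAG_NodesTotal": 10, "DAG_NodesDone": 6, "DAG_NodesFailed": 1,
#         "DAG_NodesFutile": 1, "DAG_NodesReady": 1, "DAG_NodesUnready": 1,
#         "DAG_NodesPre": 0, "DAG_NodesPost": 0, "JobProcsHeld": 0,
#     }
#     total, counts = _get_state_counts_from_dag_job(dag_job)
#     # total == 10; counts[WmsStates.SUCCEEDED] == 6,
#     # counts[WmsStates.FAILED] == 1, counts[WmsStates.PRUNED] == 1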

1546 

1547def _htc_status_to_wms_state(job): 

1548 """Convert HTCondor job status to generic wms state. 

1549 

1550 Parameters 

1551 ---------- 

1552 job : `dict` [`str`, `Any`] 

1553 HTCondor job information. 

1554 

1555 Returns 

1556 ------- 

1557 wms_state : `lsst.ctrl.bps.WmsStates` 

1558 The equivalent WmsState to given job's status. 

1559 """ 

1560 wms_state = WmsStates.MISFIT 

1561 if "JobStatus" in job: 

1562 wms_state = _htc_job_status_to_wms_state(job) 

1563 

1564 if wms_state == WmsStates.MISFIT and "NodeStatus" in job: 

1565 wms_state = _htc_node_status_to_wms_state(job) 

1566 return wms_state 

1567 
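# Illustrative sketch (not part of the module): 'JobStatus' takes precedence
# and the 'NodeStatus' mapping is only used as a fallback, e.g. for entries
# coming solely from DAGMan's node status file. Values below are made up.
#
#     _htc_status_to_wms_state({"ClusterId": 1, "JobStatus": JobStatus.HELD})
#     # -> WmsStates.HELD
#     _htc_status_to_wms_state({"NodeStatus": NodeStatus.NOT_READY})
#     # -> WmsStates.UNREADY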

1568 

1569def _htc_job_status_to_wms_state(job): 

1570 """Convert HTCondor job status to generic wms state. 

1571 

1572 Parameters 

1573 ---------- 

1574 job : `dict` [`str`, `Any`] 

1575 HTCondor job information. 

1576 

1577 Returns 

1578 ------- 

1579 wms_state : `lsst.ctrl.bps.WmsStates` 

1580 The equivalent WmsState to given job's status. 

1581 """ 

1582 _LOG.debug( 

1583 "htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], type(job["JobStatus"]) 

1584 ) 

1585 job_status = int(job["JobStatus"]) 

1586 wms_state = WmsStates.MISFIT 

1587 

1588 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status) 

1589 if job_status == JobStatus.IDLE: 

1590 wms_state = WmsStates.PENDING 

1591 elif job_status == JobStatus.RUNNING: 

1592 wms_state = WmsStates.RUNNING 

1593 elif job_status == JobStatus.REMOVED: 

1594 wms_state = WmsStates.DELETED 

1595 elif job_status == JobStatus.COMPLETED: 

1596 if ( 

1597 job.get("ExitBySignal", False) 

1598 or job.get("ExitCode", 0) 

1599 or job.get("ExitSignal", 0) 

1600 or job.get("DAG_Status", 0) 

1601 ): 

1602 wms_state = WmsStates.FAILED 

1603 else: 

1604 wms_state = WmsStates.SUCCEEDED 

1605 elif job_status == JobStatus.HELD: 

1606 wms_state = WmsStates.HELD 

1607 

1608 return wms_state 

1609 

1610 

1611def _htc_node_status_to_wms_state(job): 

1612 """Convert HTCondor node status to generic wms state. 

1613 

1614 Parameters 

1615 ---------- 

1616 job : `dict` [`str`, `Any`] 

1617 HTCondor job information. 

1618 

1619 Returns 

1620 ------- 

1621 wms_state : `lsst.ctrl.bps.WmsStates` 

1622 The equivalent WmsState to given node's status. 

1623 """ 

1624 wms_state = WmsStates.MISFIT 

1625 match job["NodeStatus"]: 

1626 case NodeStatus.NOT_READY: 

1627 wms_state = WmsStates.UNREADY 

1628 case NodeStatus.READY: 

1629 wms_state = WmsStates.READY 

1630 case NodeStatus.PRERUN: 

1631 wms_state = WmsStates.MISFIT 

1632 case NodeStatus.SUBMITTED: 

1633 if job["JobProcsHeld"]: 

1634 wms_state = WmsStates.HELD 

1635 elif job["StatusDetails"] == "not_idle": 

1636 wms_state = WmsStates.RUNNING 

1637 elif job["JobProcsQueued"]: 

1638 wms_state = WmsStates.PENDING 

1639 case NodeStatus.POSTRUN: 

1640 wms_state = WmsStates.MISFIT 

1641 case NodeStatus.DONE: 

1642 wms_state = WmsStates.SUCCEEDED 

1643 case NodeStatus.ERROR: 

1644 # Use job exit status instead of post script exit status. 

1645 if "DAGMAN error 0" in job["StatusDetails"]: 

1646 wms_state = WmsStates.SUCCEEDED 

1647 else: 

1648 wms_state = WmsStates.FAILED 

1649 case NodeStatus.FUTILE: 

1650 wms_state = WmsStates.PRUNED 

1651 return wms_state 

1652 

1653 

1654def _update_jobs(jobs1, jobs2): 

1655 """Update jobs1 with info in jobs2. 

1656 

1657 (Basically an update for nested dictionaries.) 

1658 

1659 Parameters 

1660 ---------- 

1661 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]] 

1662 HTCondor job information to be updated. 

1663 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]] 

1664 Additional HTCondor job information. 

1665 """ 

1666 for jid, jinfo in jobs2.items(): 

1667 if jid in jobs1: 

1668 jobs1[jid].update(jinfo) 

1669 else: 

1670 jobs1[jid] = jinfo 

1671 
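# Illustrative sketch (not part of the module): per-job dictionaries are
# merged rather than replaced. Values below are made up.
#
#     jobs1 = {"1.0": {"JobStatus": 2}}
#     jobs2 = {"1.0": {"Iwd": "/submit/dir"}, "2.0": {"JobStatus": 1}}
#     _update_jobs(jobs1, jobs2)
#     # jobs1 == {"1.0": {"JobStatus": 2, "Iwd": "/submit/dir"},
#     #           "2.0": {"JobStatus": 1}}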

1672 

1673def _wms_id_type(wms_id): 

1674 """Determine the type of the WMS id. 

1675 

1676 Parameters 

1677 ---------- 

1678 wms_id : `str` 

1679 WMS id identifying a job. 

1680 

1681 Returns 

1682 ------- 

1683 id_type : `lsst.ctrl.bps.htcondor.WmsIdType` 

1684 Type of WMS id. 

1685 """ 

1686 try: 

1687 int(float(wms_id)) 

1688 except ValueError: 

1689 wms_path = Path(wms_id) 

1690 if wms_path.is_dir(): 

1691 id_type = WmsIdType.PATH 

1692 else: 

1693 id_type = WmsIdType.GLOBAL 

1694 except TypeError: 

1695 id_type = WmsIdType.UNKNOWN 

1696 else: 

1697 id_type = WmsIdType.LOCAL 

1698 return id_type 

1699 
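# Illustrative sketch (not part of the module): how a few made-up id forms
# would be classified.
#
#     _wms_id_type("1234")      # -> WmsIdType.LOCAL (ClusterId, optional '.ProcId')
#     _wms_id_type("schedd1#1234.0#1699999999")  # -> WmsIdType.GLOBAL
#     _wms_id_type("/path/to/submit/run")        # -> WmsIdType.PATH (if the directory exists,
#                                                #    WmsIdType.GLOBAL otherwise)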

1700 

1701def _wms_id_to_cluster(wms_id): 

1702 """Convert WMS id to cluster id. 

1703 

1704 Parameters 

1705 ---------- 

1706 wms_id : `int` or `float` or `str` 

1707 HTCondor job id or path. 

1708 

1709 Returns 

1710 ------- 

1711 schedd_ad : `classad.ClassAd` 

1712 ClassAd describing the scheduler managing the job with the given id. 

1713 cluster_id : `int` 

1714 HTCondor cluster id. 

1715 id_type : `lsst.ctrl.bps.htcondor.WmsIdType` 

1716 The type of the provided id. 

1717 """ 

1718 coll = htcondor.Collector() 

1719 

1720 schedd_ad = None 

1721 cluster_id = None 

1722 id_type = _wms_id_type(wms_id) 

1723 if id_type == WmsIdType.LOCAL: 

1724 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1725 cluster_id = int(float(wms_id)) 

1726 elif id_type == WmsIdType.GLOBAL: 

1727 constraint = f'GlobalJobId == "{wms_id}"' 

1728 schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)} 

1729 schedds = {name: htcondor.Schedd(ad) for name, ad in schedd_ads.items()} 

1730 job_info = condor_q(constraint=constraint, schedds=schedds) 

1731 if job_info: 

1732 schedd_name, job_rec = job_info.popitem() 

1733 job_id, _ = job_rec.popitem() 

1734 schedd_ad = schedd_ads[schedd_name] 

1735 cluster_id = int(float(job_id)) 

1736 elif id_type == WmsIdType.PATH: 

1737 try: 

1738 job_info = read_dag_info(wms_id) 

1739 except (FileNotFoundError, PermissionError, OSError): 

1740 pass 

1741 else: 

1742 schedd_name, job_rec = job_info.popitem() 

1743 job_id, _ = job_rec.popitem() 

1744 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name) 

1745 cluster_id = int(float(job_id)) 

1746 else: 

1747 pass 

1748 return schedd_ad, cluster_id, id_type 

1749 

1750 

1751def _wms_id_to_dir(wms_id): 

1752 """Convert WMS id to a submit directory candidate. 

1753 

1754 The function does not check if the directory exists or if it is a valid 

1755 BPS submit directory. 

1756 

1757 Parameters 

1758 ---------- 

1759 wms_id : `int` or `float` or `str` 

1760 HTCondor job id or path. 

1761 

1762 Returns 

1763 ------- 

1764 wms_path : `pathlib.Path` or None 

1765 Submit directory candidate for the run with the given job id. If no 

1766 directory can be associated with the provided WMS id, it will be set 

1767 to None. 

1768 id_type : `lsst.ctrl.bps.htcondor.WmsIdType` 

1769 The type of the provided id. 

1770 

1771 Raises 

1772 ------ 

1773 TypeError 

1774 Raised if provided WMS id has invalid type. 

1775 """ 

1776 coll = htcondor.Collector() 

1777 schedd_ads = [] 

1778 

1779 constraint = None 

1780 wms_path = None 

1781 id_type = _wms_id_type(wms_id) 

1782 match id_type: 

1783 case WmsIdType.LOCAL: 

1784 constraint = f"ClusterId == {int(float(wms_id))}" 

1785 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

1786 case WmsIdType.GLOBAL: 

1787 constraint = f'GlobalJobId == "{wms_id}"' 

1788 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

1789 case WmsIdType.PATH: 

1790 wms_path = Path(wms_id) 

1791 case WmsIdType.UNKNOWN: 

1792 raise TypeError(f"Invalid job id type: {wms_id}") 

1793 if constraint is not None: 

1794 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

1795 job_info = condor_history(constraint=constraint, schedds=schedds, projection=["Iwd"]) 

1796 if job_info: 

1797 _, job_rec = job_info.popitem() 

1798 _, job_ad = job_rec.popitem() 

1799 wms_path = Path(job_ad["Iwd"]) 

1800 return wms_path, id_type 

1801 
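# Illustrative sketch (not part of the module): for numeric and global ids the
# submit directory is recovered from the job's 'Iwd' classad attribute found
# in the Schedd history; a path id is passed through unchanged. The ids below
# are made up.
#
#     _wms_id_to_dir("1234")             # -> (Path(<Iwd>), WmsIdType.LOCAL) if found in history
#     _wms_id_to_dir("/path/to/submit")  # -> (Path("/path/to/submit"), WmsIdType.PATH)
#                                        #    (assuming the directory exists)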

1802 

1803def _create_periodic_release_expr(memory, multiplier, limit): 

1804 """Construct an HTCondorAd expression for releasing held jobs. 

1805 

1806 The expression instructs HTCondor to release any job that was put on hold 

1807 due to exceeding its memory requirements back to the job queue, provided it 

1808 satisfies all of the conditions below: 

1809 

1810 * the number of run attempts did not reach the allowable number of retries, 

1811 * the memory requirements in the last failed run attempt did not reach 

1812 the specified memory limit. 

1813 

1814 Parameters 

1815 ---------- 

1816 memory : `int` 

1817 Requested memory in MB. 

1818 multiplier : `float` 

1819 Memory growth rate between retries. 

1820 limit : `int` 

1821 Memory limit. 

1822 

1823 Returns 

1824 ------- 

1825 expr : `str` 

1826 A string representing an HTCondor ClassAd expression for releasing jobs 

1827 which have been held due to exceeding the memory requirements. 

1828 """ 

1829 is_retry_allowed = "NumJobStarts <= JobMaxRetries" 

1830 was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}" 

1831 

1832 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1833 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1834 # The special comparison operators ensure that all comparisons below will 

1835 # evaluate to FALSE in this case. 

1836 # 

1837 # Note: 

1838 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1839 # the entire expression should evaluate to FALSE when the job is not HELD. 

1840 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1841 # but better safe than sorry. 

1842 was_mem_exceeded = ( 

1843 "JobStatus == 5 " 

1844 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " 

1845 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1846 ) 

1847 

1848 expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}" 

1849 return expr 

1850 
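# Illustrative sketch (not part of the module): for made-up values
# memory=2048, multiplier=2.0, limit=32768 the function returns the single
# string below (line breaks added here for readability):
#
#     JobStatus == 5
#     && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#         || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#     && NumJobStarts <= JobMaxRetries
#     && min({int(2048 * pow(2.0, NumJobStarts - 1)), 32768}) < 32768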

1851 

1852def _create_periodic_remove_expr(memory, multiplier, limit): 

1853 """Construct an HTCondorAd expression for removing jobs from the queue. 

1854 

1855 The expression instructs HTCondor to remove from the job queue any job that 

1856 was put on hold due to exceeding its memory requirements, provided it 

1857 satisfies any of the conditions below: 

1858 

1859 * the allowable number of retries was reached, 

1860 * the memory requirements during the last failed run attempt reached 

1861 the specified memory limit. 

1862 

1863 Parameters 

1864 ---------- 

1865 memory : `int` 

1866 Requested memory in MB. 

1867 multiplier : `float` 

1868 Memory growth rate between retries. 

1869 limit : `int` 

1870 Memory limit. 

1871 

1872 Returns 

1873 ------- 

1874 expr : `str` 

1875 A string representing an HTCondor ClassAd expression for removing jobs 

1876 which were run at the maximal allowable memory and still exceeded 

1877 the memory requirements. 

1878 """ 

1879 is_retry_disallowed = "NumJobStarts > JobMaxRetries" 

1880 was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}" 

1881 

1882 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1883 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1884 # The special comparison operators ensure that all comparisons below will 

1885 # evaluate to FALSE in this case. 

1886 # 

1887 # Note: 

1888 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1889 # the entire expression should evaluate to FALSE when the job is not HELD. 

1890 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1891 # but better safe than sorry. 

1892 was_mem_exceeded = ( 

1893 "JobStatus == 5 " 

1894 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " 

1895 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1896 ) 

1897 

1898 expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})" 

1899 return expr 

1900 
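# Illustrative sketch (not part of the module): with the same made-up values
# as above (memory=2048, multiplier=2.0, limit=32768) the removal expression
# shares the hold-reason test and differs only in its tail:
#
#     ... && (NumJobStarts > JobMaxRetries
#             || min({int(2048 * pow(2.0, NumJobStarts - 1)), 32768}) == 32768)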

1901 

1902def _create_request_memory_expr(memory, multiplier, limit): 

1903 """Construct an HTCondor ClassAd expression for safe memory scaling. 

1904 

1905 Parameters 

1906 ---------- 

1907 memory : `int` 

1908 Requested memory in MB. 

1909 multiplier : `float` 

1910 Memory growth rate between retries. 

1911 limit : `int` 

1912 Memory limit. 

1913 

1914 Returns 

1915 ------- 

1916 expr : `str` 

1917 A string representing an HTCondor ClassAd expression enabling safe 

1918 memory scaling between job retries. 

1919 """ 

1920 # The check whether the job was held due to exceeding memory requirements 

1921 # will be made *after* the job was released back to the job queue (is in 

1922 # the IDLE state), hence the need to use `Last*` job ClassAds instead of 

1923 # the ones describing the job's current state. 

1924 # 

1925 # Also, 'Last*' job ClassAds attributes are UNDEFINED when a job is 

1926 # initially put in the job queue. The special comparison operators ensure 

1927 # that all comparisons below will evaluate to FALSE in this case. 

1928 was_mem_exceeded = ( 

1929 "LastJobStatus =?= 5 " 

1930 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " 

1931 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)" 

1932 ) 

1933 

1934 # If the job is running for the first time or was held for reasons other 

1935 # than exceeding the memory, set the required memory to the requested 

1936 # value or the memory usage measured by HTCondor (MemoryUsage), 

1937 # whichever is greater. 

1938 expr = ( 

1939 f"({was_mem_exceeded}) " 

1940 f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) " 

1941 f": max({{{memory}, MemoryUsage ?: 0}})" 

1942 ) 

1943 return expr 

1944 
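# Illustrative sketch (not part of the module): for made-up values
# memory=2048, multiplier=2.0, limit=32768 the generated request_memory
# expression is (line breaks added here for readability):
#
#     (LastJobStatus =?= 5
#      && (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0
#          || LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34))
#     ? min({int(2048 * pow(2.0, NumJobStarts)), 32768})
#     : max({2048, MemoryUsage ?: 0})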

1945 

1946def _locate_schedds(locate_all=False): 

1947 """Find out Scheduler daemons in an HTCondor pool. 

1948 

1949 Parameters 

1950 ---------- 

1951 locate_all : `bool`, optional 

1952 If True, all available schedulers in the HTCondor pool will be located. 

1953 False by default, which means that the search is limited to the Scheduler 

1954 running on the local host. 

1955 

1956 Returns 

1957 ------- 

1958 schedds : `dict` [`str`, `htcondor.Schedd`] 

1959 A mapping between Scheduler names and Python objects allowing for 

1960 interacting with them. 

1961 """ 

1962 coll = htcondor.Collector() 

1963 

1964 schedd_ads = [] 

1965 if locate_all: 

1966 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

1967 else: 

1968 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

1969 return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

1970 
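# Illustrative sketch (not part of the module): the returned mapping is
# typically passed straight to the condor_* query helpers. The names and
# constraint below are made up.
#
#     schedds = _locate_schedds(locate_all=True)
#     # e.g. {"schedd1.example.org": <htcondor.Schedd>, ...}
#     job_info = condor_q(constraint='JobUniverse == 7', schedds=schedds)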

1971 

1972def _gather_site_values(config, compute_site): 

1973 """Gather values specific to given site. 

1974 

1975 Parameters 

1976 ---------- 

1977 config : `lsst.ctrl.bps.BpsConfig` 

1978 BPS configuration that includes necessary submit/runtime 

1979 information. 

1980 compute_site : `str` 

1981 Compute site name. 

1982 

1983 Returns 

1984 ------- 

1985 site_values : `dict` [`str`, `Any`] 

1986 Values specific to the given site. 

1987 """ 

1988 site_values = {"attrs": {}, "profile": {}} 

1989 search_opts = {} 

1990 if compute_site: 

1991 search_opts["curvals"] = {"curr_site": compute_site} 

1992 

1993 # Determine the hard limit for the memory requirement. 

1994 found, limit = config.search("memoryLimit", opt=search_opts) 

1995 if not found: 

1996 search_opts["default"] = DEFAULT_HTC_EXEC_PATT 

1997 _, patt = config.search("executeMachinesPattern", opt=search_opts) 

1998 del search_opts["default"] 

1999 

2000 # To reduce the amount of data, ignore dynamic slots (if any) as, 

2001 # by definition, they cannot have more memory than 

2002 # the partitionable slot they are part of. 

2003 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)' 

2004 pool_info = condor_status(constraint=constraint) 

2005 try: 

2006 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values()) 

2007 except ValueError: 

2008 _LOG.debug("No execute machine in the pool matches %s", patt) 

2009 if limit: 

2010 config[".bps_defined.memory_limit"] = limit 

2011 

2012 _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False}) 

2013 site_values["memoryLimit"] = limit 

2014 

2015 found, value = config.search("accountingGroup", opt=search_opts) 

2016 if found: 

2017 site_values["accountingGroup"] = value 

2018 found, value = config.search("accountingUser", opt=search_opts) 

2019 if found: 

2020 site_values["accountingUser"] = value 

2021 

2022 key = f".site.{compute_site}.profile.condor" 

2023 if key in config: 

2024 for key, val in config[key].items(): 

2025 if key.startswith("+"): 

2026 site_values["attrs"][key[1:]] = val 

2027 else: 

2028 site_values["profile"][key] = val 

2029 

2030 return site_values
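# Illustrative sketch (not part of the module): the shape of the returned
# dictionary for a hypothetical site; every key and value below is made up,
# and the accounting entries appear only when configured.
#
#     {
#         "attrs": {"JobNodeSet": "astro"},            # from '+'-prefixed keys
#         "profile": {"requirements": 'OpSys == "LINUX"'},
#         "bpsUseShared": False,
#         "memoryLimit": 32768,
#         "accountingGroup": "group_astro.user",
#     }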