Coverage for python/lsst/ctrl/bps/wms/htcondor/htcondor_service.py: 1%


659 statements  

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Interface between generic workflow to HTCondor workflow system. 

23""" 

24 

25__all__ = ["HTCondorService", "HTCondorWorkflow"] 

26 

27 

28import os 

29import re 

30import logging 

31from enum import IntEnum, auto 

32from pathlib import Path 

33from collections import defaultdict 

34 

35import htcondor 

36from packaging import version 

37 

38from lsst.utils.timer import time_this 

39from ... import ( 

40 BaseWmsWorkflow, 

41 BaseWmsService, 

42 GenericWorkflow, 

43 GenericWorkflowJob, 

44 WmsRunReport, 

45 WmsJobReport, 

46 WmsStates 

47) 

48from ...bps_utils import ( 

49 chdir, 

50 create_count_summary 

51) 

52from .lssthtc import ( 

53 HTCDag, 

54 HTCJob, 

55 MISSING_ID, 

56 JobStatus, 

57 NodeStatus, 

58 htc_backup_files, 

59 htc_check_dagman_output, 

60 htc_create_submit_from_cmd, 

61 htc_create_submit_from_dag, 

62 htc_create_submit_from_file, 

63 htc_escape, 

64 htc_submit_dag, 

65 htc_version, 

66 read_dag_info, 

67 read_dag_log, 

68 read_dag_status, 

69 read_node_status, 

70 write_dag_info, 

71 condor_q, 

72 condor_search, 

73 condor_status, 

74 pegasus_name_to_label, 

75 summary_from_dag, 

76) 

77 

78 

79class WmsIdType(IntEnum): 

80 """Type of valid WMS ids. 

81 """ 

82 

83 UNKNOWN = auto() 

84 """The type of id cannot be determined. 

85 """ 

86 

87 LOCAL = auto() 

88 """The id is HTCondor job's ClusterId (with optional '.ProcId'). 

89 """ 

90 

91 GLOBAL = auto() 

92 """Id is a HTCondor's global job id. 

93 """ 

94 

95 PATH = auto() 

96 """Id is a submission path. 

97 """ 

98 

99 

100DEFAULT_HTC_EXEC_PATT = ".*worker.*" 

101"""Default pattern for searching execute machines in an HTCondor pool. 

102""" 

103 

104_LOG = logging.getLogger(__name__) 

105 

106 

107class HTCondorService(BaseWmsService): 

108 """HTCondor version of WMS service. 

109 """ 

110 def prepare(self, config, generic_workflow, out_prefix=None): 

111 """Convert generic workflow to an HTCondor DAG ready for submission. 

112 

113 Parameters 

114 ---------- 

115 config : `lsst.ctrl.bps.BpsConfig` 

116 BPS configuration that includes necessary submit/runtime 

117 information. 

118 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

119 The generic workflow (e.g., has executable name and arguments). 

120 out_prefix : `str` 

121 The root directory into which all WMS-specific files are written. 

122 

123 Returns 

124 ------- 

125 workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow` 

126 HTCondor workflow ready to be run. 

127 """ 

128 _LOG.debug("out_prefix = '%s'", out_prefix) 

129 with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed HTCondor workflow creation"): 

130 workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix, 

131 f"{self.__class__.__module__}." 

132 f"{self.__class__.__name__}") 

133 

134 with time_this(log=_LOG, level=logging.INFO, prefix=None, 

135 msg="Completed writing out HTCondor workflow"): 

136 workflow.write(out_prefix) 

137 return workflow 

138 

139 def submit(self, workflow): 

140 """Submit a single HTCondor workflow. 

141 

142 Parameters 

143 ---------- 

144 workflow : `lsst.ctrl.bps.BaseWorkflow` 

145 A single HTCondor workflow to submit. run_id is updated after 

146 successful submission to WMS. 

147 """ 

148 dag = workflow.dag 

149 

150 ver = version.parse(htc_version()) 

151 if ver >= version.parse("8.9.3"): 

152 sub = htc_create_submit_from_dag(dag.graph["dag_filename"], {}) 

153 else: 

154 sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], {}) 

155 

156 # For workflow portability, internal paths are all relative. Hence 

157 # the DAG needs to be submitted to HTCondor from inside the submit 

158 # directory. 

159 with chdir(workflow.submit_path): 

160 _LOG.info("Submitting from directory: %s", os.getcwd()) 

161 schedd_dag_info = htc_submit_dag(sub) 

162 if schedd_dag_info: 

163 write_dag_info(f"{dag.name}.info.json", schedd_dag_info) 

164 

165 _, dag_info = schedd_dag_info.popitem() 

166 _, dag_ad = dag_info.popitem() 

167 

168 dag.run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}" 

169 workflow.run_id = dag.run_id 

170 else: 

171 raise RuntimeError("Submission failed: unable to retrieve DAGMan job information") 

172 

173 def restart(self, wms_workflow_id): 

174 """Restart a failed DAGMan workflow. 

175 

176 Parameters 

177 ---------- 

178 wms_workflow_id : `str` 

179 The directory with HTCondor files. 

180 

181 Returns 

182 ------- 

183 run_id : `str` 

184 HTCondor id of the restarted DAGMan job. If restart failed, it will 

185 be set to None. 

186 run_name : `str` 

187 Name of the restarted workflow. If restart failed, it will be set 

188 to None. 

189 message : `str` 

190 A message describing any issues encountered during the restart. 

191 If there were no issues, an empty string is returned. 

192 """ 

193 wms_path = Path(wms_workflow_id) 

194 if not wms_path.is_dir(): 

195 return None, None, f"Directory '{wms_path}' not found" 

196 

197 _LOG.info("Restarting workflow from directory '%s'", wms_path) 

198 rescue_dags = list(wms_path.glob("*.dag.rescue*")) 

199 if not rescue_dags: 

200 return None, None, f"HTCondor rescue DAG(s) not found in '{wms_path}'" 

201 

202 _LOG.info("Verifying that the workflow is not already in the job queue") 

203 schedd_dag_info = condor_q(constraint=f'regexp("dagman$", Cmd) && Iwd == "{wms_workflow_id}"') 

204 if schedd_dag_info: 

205 _, dag_info = schedd_dag_info.popitem() 

206 _, dag_ad = dag_info.popitem() 

207 id_ = dag_ad["GlobalJobId"] 

208 return None, None, f"Workflow already in the job queue (global job id: '{id_}')" 

209 

210 _LOG.info("Checking execution status of the workflow") 

211 warn = False 

212 dag_ad = read_dag_status(str(wms_path)) 

213 if dag_ad: 

214 nodes_total = dag_ad.get("NodesTotal", 0) 

215 if nodes_total != 0: 

216 nodes_done = dag_ad.get("NodesDone", 0) 

217 if nodes_total == nodes_done: 

218 return None, None, "All jobs in the workflow finished successfully" 

219 else: 

220 warn = True 

221 else: 

222 warn = True 

223 if warn: 

224 _LOG.warning("Cannot determine the execution status of the workflow, " 

225 "continuing with restart regardless") 

226 

227 _LOG.info("Backing up select HTCondor files from previous run attempt") 

228 htc_backup_files(wms_path, subdir='backups') 

229 

230 # For workflow portability, internal paths are all relative. Hence 

231 # the DAG needs to be resubmitted to HTCondor from inside the submit 

232 # directory. 

233 _LOG.info("Adding workflow to the job queue") 

234 run_id, run_name, message = None, None, "" 

235 with chdir(wms_path): 

236 try: 

237 dag_path = next(wms_path.glob('*.dag.condor.sub')) 

238 except StopIteration: 

239 message = f"DAGMan submit description file not found in '{wms_path}'" 

240 else: 

241 sub = htc_create_submit_from_file(dag_path.name) 

242 schedd_dag_info = htc_submit_dag(sub) 

243 

244 # Save select information about the DAGMan job to a file. Use 

245 # the run name (available in the ClassAd) as the filename. 

246 if schedd_dag_info: 

247 dag_info = next(iter(schedd_dag_info.values())) 

248 dag_ad = next(iter(dag_info.values())) 

249 write_dag_info(f"{dag_ad['bps_run']}.info.json", schedd_dag_info) 

250 run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}" 

251 run_name = dag_ad["bps_run"] 

252 else: 

253 message = "DAGMan job information unavailable" 

254 

255 return run_id, run_name, message 

256 

257 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False): 

258 """Query WMS for list of submitted WMS workflows/jobs. 

259 

260 This should be a quick lookup function to create list of jobs for 

261 other functions. 

262 

263 Parameters 

264 ---------- 

265 wms_id : `int` or `str`, optional 

266 Id or path that can be used by WMS service to look up job. 

267 user : `str`, optional 

268 User whose submitted jobs should be listed. 

269 require_bps : `bool`, optional 

270 Whether to require jobs returned in list to be bps-submitted jobs. 

271 pass_thru : `str`, optional 

272 Information to pass through to WMS. 

273 is_global : `bool`, optional 

274 If set, all job queues (and their histories) will be queried for 

275 job information. Defaults to False which means that only the local 

276 job queue will be queried. 

277 

278 Returns 

279 ------- 

280 job_ids : `list` [`Any`] 

281 Only job ids to be used by cancel and other functions. Typically 

282 this means top-level jobs (i.e., not child jobs). 

283 """ 

284 _LOG.debug("list_submitted_jobs params: " 

285 "wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s", 

286 wms_id, user, require_bps, pass_thru, is_global) 

287 

288 # Determine which Schedds will be queried for job information. 

289 coll = htcondor.Collector() 

290 

291 schedd_ads = [] 

292 if is_global: 

293 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

294 else: 

295 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

296 

297 # Construct appropriate constraint expression using provided arguments. 

298 constraint = "False" 

299 if wms_id is None: 

300 if user is not None: 

301 constraint = f'(Owner == "{user}")' 

302 else: 

303 schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id) 

304 if cluster_id is not None: 

305 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})" 

306 

307 # If provided id is either a submission path or a global id, 

308 # make sure the right Schedd will be queried regardless of 

309 # 'is_global' value. 

310 if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}: 

311 schedd_ads = [schedd_ad] 

312 if require_bps: 

313 constraint += ' && (bps_isjob == "True")' 

314 if pass_thru: 

315 if "-forcex" in pass_thru: 

316 pass_thru_2 = pass_thru.replace("-forcex", "") 

317 if pass_thru_2 and not pass_thru_2.isspace(): 

318 constraint += f" && ({pass_thru_2})" 

319 else: 

320 constraint += f" && ({pass_thru})" 

321 

322 # Create a list of scheduler daemons which need to be queried. 

323 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

324 

325 _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds)) 

326 results = condor_q(constraint=constraint, schedds=schedds) 

327 

328 # Prune child jobs where DAG job is in queue (i.e., aren't orphans). 

329 job_ids = [] 

330 for schedd_name, job_info in results.items(): 

331 for job_id, job_ad in job_info.items(): 

332 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None")) 

333 if "DAGManJobId" not in job_ad: 

334 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

335 else: 

336 _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0") 

337 _LOG.debug("\tin jobs.keys() = %s", job_info.keys()) 

338 if f"{job_ad['DAGManJobId']}.0" not in job_info: # orphaned job 

339 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

340 

341 _LOG.debug("job_ids = %s", job_ids) 

342 return job_ids 

343 

344 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False): 

345 """Return run information based upon given constraints. 

346 

347 Parameters 

348 ---------- 

349 wms_workflow_id : `str`, optional 

350 Limit to specific run based on id. 

351 user : `str`, optional 

352 Limit results to runs for this user. 

353 hist : `float`, optional 

354 Limit history search to this many days. Defaults to 0. 

355 pass_thru : `str`, optional 

356 Constraints to pass through to HTCondor. 

357 is_global : `bool`, optional 

358 If set, all job queues (and their histories) will be queried for 

359 job information. Defaults to False which means that only the local 

360 job queue will be queried. 

361 

362 Returns 

363 ------- 

364 runs : `list` [`lsst.ctrl.bps.WmsRunReport`] 

365 Information about runs from given job information. 

366 message : `str` 

367 Extra message for report command to print. This could be pointers 

368 to documentation or to WMS specific commands. 

369 """ 

370 if wms_workflow_id: 

371 id_type = _wms_id_type(wms_workflow_id) 

372 if id_type == WmsIdType.LOCAL: 

373 schedulers = _locate_schedds(locate_all=is_global) 

374 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

375 elif id_type == WmsIdType.GLOBAL: 

376 schedulers = _locate_schedds(locate_all=True) 

377 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

378 elif id_type == WmsIdType.PATH: 

379 run_reports, message = _report_from_path(wms_workflow_id) 

380 else: 

381 run_reports, message = {}, 'Invalid job id' 

382 else: 

383 schedulers = _locate_schedds(locate_all=is_global) 

384 run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers) 

385 _LOG.debug("report: %s, %s", run_reports, message) 

386 

387 return list(run_reports.values()), message 

388 

389 def cancel(self, wms_id, pass_thru=None): 

390 """Cancel submitted workflows/jobs. 

391 

392 Parameters 

393 ---------- 

394 wms_id : `str` 

395 Id or path of job that should be canceled. 

396 pass_thru : `str`, optional 

397 Information to pass through to WMS. 

398 

399 Returns 

400 ------- 

401 deleted : `bool` 

402 Whether the deletion was successful. Currently, if there is any 

403 doubt or any individual jobs were not deleted, False is returned. 

404 message : `str` 

405 Any message from WMS (e.g., error details). 

406 """ 

407 _LOG.debug("Canceling wms_id = %s", wms_id) 

408 

409 schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id) 

410 

411 if cluster_id is None: 

412 deleted = False 

413 message = "invalid id" 

414 else: 

415 _LOG.debug("Canceling job managed by schedd_name = %s with cluster_id = %s", 

416 schedd_ad["Name"], cluster_id) 

417 schedd = htcondor.Schedd(schedd_ad) 

418 

419 constraint = f"ClusterId == {cluster_id}" 

420 if pass_thru is not None and "-forcex" in pass_thru: 

421 pass_thru_2 = pass_thru.replace("-forcex", "") 

422 if pass_thru_2 and not pass_thru_2.isspace(): 

423 constraint += f"&& ({pass_thru_2})" 

424 _LOG.debug("JobAction.RemoveX constraint = %s", constraint) 

425 results = schedd.act(htcondor.JobAction.RemoveX, constraint) 

426 else: 

427 if pass_thru: 

428 constraint += f"&& ({pass_thru})" 

429 _LOG.debug("JobAction.Remove constraint = %s", constraint) 

430 results = schedd.act(htcondor.JobAction.Remove, constraint) 

431 _LOG.debug("Remove results: %s", results) 

432 

433 if results["TotalSuccess"] > 0 and results["TotalError"] == 0: 

434 deleted = True 

435 message = "" 

436 else: 

437 deleted = False 

438 if results["TotalSuccess"] == 0 and results["TotalError"] == 0: 

439 message = "no such bps job in batch queue" 

440 else: 

441 message = f"unknown problems deleting: {results}" 

442 

443 _LOG.debug("deleted: %s; message = %s", deleted, message) 

444 return deleted, message 

445 

446 

447class HTCondorWorkflow(BaseWmsWorkflow): 

448 """Single HTCondor workflow. 

449 

450 Parameters 

451 ---------- 

452 name : `str` 

453 Unique name for Workflow used when naming files. 

454 config : `lsst.ctrl.bps.BpsConfig` 

455 BPS configuration that includes necessary submit/runtime information. 

456 """ 

457 def __init__(self, name, config=None): 

458 super().__init__(name, config) 

459 self.dag = None 

460 

461 @classmethod 

462 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class): 

463 # Docstring inherited 

464 htc_workflow = cls(generic_workflow.name, config) 

465 htc_workflow.dag = HTCDag(name=generic_workflow.name) 

466 

467 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs) 

468 htc_workflow.dag.add_attribs(generic_workflow.run_attrs) 

469 htc_workflow.dag.add_attribs({"bps_wms_service": service_class, 

470 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}", 

471 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts), 

472 "bps_job_summary": create_count_summary(generic_workflow.job_counts)}) 

473 

474 _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""}) 

475 if isinstance(tmp_template, str): 

476 subdir_template = defaultdict(lambda: tmp_template) 

477 else: 

478 subdir_template = tmp_template 

479 

480 # Create all DAG jobs 

481 site_values = {} # cache compute site specific values to reduce config lookups 

482 for job_name in generic_workflow: 

483 gwjob = generic_workflow.get_job(job_name) 

484 if gwjob.compute_site not in site_values: 

485 site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site) 

486 htc_job = _create_job(subdir_template[gwjob.label], site_values[gwjob.compute_site], 

487 generic_workflow, gwjob, out_prefix) 

488 htc_workflow.dag.add_job(htc_job) 

489 

490 # Add job dependencies to the DAG 

491 for job_name in generic_workflow: 

492 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name)) 

493 

494 # If final job exists in generic workflow, create DAG final job 

495 final = generic_workflow.get_final() 

496 if final and isinstance(final, GenericWorkflowJob): 

497 if final.compute_site and final.compute_site not in site_values: 

498 site_values[final.compute_site] = _gather_site_values(config, final.compute_site) 

499 final_htjob = _create_job(subdir_template[final.label], site_values[final.compute_site], 

500 generic_workflow, final, out_prefix) 

501 if "post" not in final_htjob.dagcmds: 

502 final_htjob.dagcmds["post"] = f"{os.path.dirname(__file__)}/final_post.sh" \ 

503 f" {final.name} $DAG_STATUS $RETURN" 

504 htc_workflow.dag.add_final_job(final_htjob) 

505 elif final and isinstance(final, GenericWorkflow): 

506 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job") 

507 elif final: 

508 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})") 

509 

510 return htc_workflow 

511 

512 def write(self, out_prefix): 

513 """Output HTCondor DAGMan files needed for workflow submission. 

514 

515 Parameters 

516 ---------- 

517 out_prefix : `str` 

518 Directory prefix for HTCondor files. 

519 """ 

520 self.submit_path = out_prefix 

521 os.makedirs(out_prefix, exist_ok=True) 

522 

523 # Write down the workflow in HTCondor format. 

524 self.dag.write(out_prefix, "jobs/{self.label}") 

525 

526 

527def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix): 

528 """Convert GenericWorkflow job nodes to DAG jobs. 

529 

530 Parameters 

531 ---------- 

532 subdir_template : `str` 

533 Template for making subdirs. 

534 site_values : `dict` 

535 Site specific values 

536 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

537 Generic workflow that is being converted. 

538 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

539 The generic job to convert to a HTCondor job. 

540 out_prefix : `str` 

541 Directory prefix for HTCondor files. 

542 

543 Returns 

544 ------- 

545 htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob` 

546 The HTCondor job equivalent to the given generic job. 

547 """ 

548 htc_job = HTCJob(gwjob.name, label=gwjob.label) 

549 

550 curvals = defaultdict(str) 

551 curvals["label"] = gwjob.label 

552 if gwjob.tags: 

553 curvals.update(gwjob.tags) 

554 

555 subdir = subdir_template.format_map(curvals) 

556 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub" 

557 

558 htc_job_cmds = { 

559 "universe": "vanilla", 

560 "should_transfer_files": "YES", 

561 "when_to_transfer_output": "ON_EXIT_OR_EVICT", 

562 "transfer_output_files": '""', # Set to empty string to disable 

563 "transfer_executable": "False", 

564 "getenv": "True", 

565 

566 # Exceeding memory sometimes triggers a SIGBUS or SIGSEGV error. Tell 

567 # HTCondor to put on hold any jobs which exited via a signal. 

568 "on_exit_hold": "ExitBySignal == true", 

569 "on_exit_hold_reason": 'strcat("Job raised a signal ", string(ExitSignal), ". ", ' 

570 '"Handling signal as if job has gone over memory limit.")', 

571 "on_exit_hold_subcode": "34" 

572 } 

573 

574 htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob)) 

575 

576 # job stdout, stderr, htcondor user log. 

577 for key in ("output", "error", "log"): 

578 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}") 

579 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key]) 

580 

581 htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], 

582 out_prefix)) 

583 

584 # Add the job cmds dict to the job object. 

585 htc_job.add_job_cmds(htc_job_cmds) 

586 

587 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob)) 

588 

589 # Add job attributes to job. 

590 _LOG.debug("gwjob.attrs = %s", gwjob.attrs) 

591 htc_job.add_job_attrs(gwjob.attrs) 

592 htc_job.add_job_attrs(site_values["attrs"]) 

593 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)}) 

594 htc_job.add_job_attrs({"bps_job_name": gwjob.name, 

595 "bps_job_label": gwjob.label}) 

596 

597 return htc_job 

598 

599 

600def _translate_job_cmds(cached_vals, generic_workflow, gwjob): 

601 """Translate the job data that are one to one mapping 

602 

603 Parameters 

604 ---------- 

605 cached_vals : `dict` [`str`, `Any`] 

606 Config values common to jobs with same label. 

607 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

608 Generic workflow that contains the job being converted. 

609 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

610 Generic workflow job to be converted. 

611 

612 Returns 

613 ------- 

614 htc_job_commands : `dict` [`str`, `Any`] 

615 Contains commands which can appear in the HTCondor submit description 

616 file. 

617 """ 

618 # Values in the job script that are just name mappings. 

619 job_translation = {"mail_to": "notify_user", 

620 "when_to_mail": "notification", 

621 "request_cpus": "request_cpus", 

622 "priority": "priority", 

623 "category": "category"} 

624 

625 jobcmds = {} 

626 for gwkey, htckey in job_translation.items(): 

627 jobcmds[htckey] = getattr(gwjob, gwkey, None) 

628 

629 # job commands that need modification 

630 if gwjob.number_of_retries: 

631 jobcmds["max_retries"] = f"{gwjob.number_of_retries}" 

632 

633 if gwjob.retry_unless_exit: 

634 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}" 

635 

636 if gwjob.request_disk: 

637 jobcmds["request_disk"] = f"{gwjob.request_disk}MB" 

638 

639 if gwjob.request_memory: 

640 jobcmds["request_memory"] = f"{gwjob.request_memory}" 

641 

642 if gwjob.memory_multiplier: 

643 # Do not use try-except! At the moment, BpsConfig returns an empty 

644 # string if it does not contain the key. 

645 memory_limit = cached_vals["memoryLimit"] 

646 if not memory_limit: 

647 raise RuntimeError("Memory autoscaling enabled, but automatic detection of the memory limit " 

648 "failed; setting it explicitly with 'memoryLimit' or changing worker node " 

649 "search pattern 'executeMachinesPattern' might help.") 

650 

651 # Set maximal amount of memory job can ask for. 

652 # 

653 # The check below assumes that 'memory_limit' was set to a value which 

654 # realistically reflects actual physical limitations of a given compute 

655 # resource. 

656 memory_max = memory_limit 

657 if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit: 

658 memory_max = gwjob.request_memory_max 

659 

660 # Make the job ask for more memory each time it fails due to insufficient 

661 # memory requirements. 

662 jobcmds["request_memory"] = \ 

663 _create_request_memory_expr(gwjob.request_memory, gwjob.memory_multiplier, memory_max) 
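        # Illustrative numbers (not from any config): with request_memory=2048
        # and memory_multiplier=2.0, successive retries would request roughly
        # 2048, 4096, 8192, ... MB, capped at memory_max.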

664 

665 # Periodically release jobs which are being held due to exceeding 

666 # memory. Stop doing that (by removing the job from the HTCondor queue) 

667 # after the maximal number of retries has been reached or the job was 

668 # already run at maximal allowed memory. 

669 jobcmds["periodic_release"] = \ 

670 _create_periodic_release_expr(gwjob.request_memory, gwjob.memory_multiplier, memory_max) 

671 jobcmds["periodic_remove"] = \ 

672 _create_periodic_remove_expr(gwjob.request_memory, gwjob.memory_multiplier, memory_max) 

673 

674 # Assume concurrency_limit implemented using HTCondor concurrency limits. 

675 # May need to move to special site-specific implementation if sites use 

676 # other mechanisms. 

677 if gwjob.concurrency_limit: 

678 jobcmds["concurrency_limit"] = gwjob.concurrency_limit 

679 

680 # Handle command line 

681 if gwjob.executable.transfer_executable: 

682 jobcmds["transfer_executable"] = "True" 

683 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri) 

684 else: 

685 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri) 

686 

687 if gwjob.arguments: 

688 arguments = gwjob.arguments 

689 arguments = _replace_cmd_vars(arguments, gwjob) 

690 arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob) 

691 arguments = _fix_env_var_syntax(arguments) 

692 jobcmds["arguments"] = arguments 

693 

694 # Add extra "pass-thru" job commands 

695 if gwjob.profile: 

696 for key, val in gwjob.profile.items(): 

697 jobcmds[key] = htc_escape(val) 

698 for key, val in cached_vals["profile"]: 

699 jobcmds[key] = htc_escape(val) 

700 

701 return jobcmds 

702 

703 

704def _translate_dag_cmds(gwjob): 

705 """Translate job values into DAGMan commands. 

706 

707 Parameters 

708 ---------- 

709 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

710 Job containing values to be translated. 

711 

712 Returns 

713 ------- 

714 dagcmds : `dict` [`str`, `Any`] 

715 DAGMan commands for the job. 

716 """ 

717 # Values in the dag script that are just name mappings. 

718 dag_translation = {"abort_on_value": "abort_dag_on", 

719 "abort_return_value": "abort_exit"} 

720 

721 dagcmds = {} 

722 for gwkey, htckey in dag_translation.items(): 

723 dagcmds[htckey] = getattr(gwjob, gwkey, None) 

724 

725 # Still to be coded: vars "pre_cmdline", "post_cmdline" 

726 return dagcmds 

727 

728 

729def _fix_env_var_syntax(oldstr): 

730 """Change ENV place holders to HTCondor Env var syntax. 

731 

732 Parameters 

733 ---------- 

734 oldstr : `str` 

735 String in which environment variable syntax is to be fixed. 

736 

737 Returns 

738 ------- 

739 newstr : `str` 

740 Given string with environment variable syntax fixed. 

741 """ 

742 newstr = oldstr 

743 for key in re.findall(r"<ENV:([^>]+)>", oldstr): 

744 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})") 

745 return newstr 

746 

747 

748def _replace_file_vars(use_shared, arguments, workflow, gwjob): 

749 """Replace file placeholders in command line arguments with correct 

750 physical file names. 

751 

752 Parameters 

753 ---------- 

754 use_shared : `bool` 

755 Whether HTCondor can assume shared filesystem. 

756 arguments : `str` 

757 Arguments string in which to replace file placeholders. 

758 workflow : `lsst.ctrl.bps.GenericWorkflow` 

759 Generic workflow that contains file information. 

760 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

761 The job corresponding to the arguments. 

762 

763 Returns 

764 ------- 

765 arguments : `str` 

766 Given arguments string with file placeholders replaced. 

767 """ 

768 # Replace input file placeholders with paths. 

769 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False): 

770 if not gwfile.wms_transfer: 

771 # Must assume full URI if in command line and told WMS is not 

772 # responsible for transferring file. 

773 uri = gwfile.src_uri 

774 elif use_shared: 

775 if gwfile.job_shared: 

776 # Have shared filesystems and jobs can share file. 

777 uri = gwfile.src_uri 

778 else: 

779 # Taking advantage of inside knowledge. Not future-proof. 

780 # Temporary fix until there is a job wrapper that pulls files 

781 # within the job. 

782 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml": 

783 uri = "butler.yaml" 

784 else: 

785 uri = os.path.basename(gwfile.src_uri) 

786 else: # Using push transfer 

787 uri = os.path.basename(gwfile.src_uri) 

788 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

789 

790 # Replace output file placeholders with paths. 

791 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False): 

792 if not gwfile.wms_transfer: 

793 # Must assume full URI if in command line and told WMS is not 

794 # responsible for transferring file. 

795 uri = gwfile.src_uri 

796 elif use_shared: 

797 if gwfile.job_shared: 

798 # Have shared filesystems and jobs can share file. 

799 uri = gwfile.src_uri 

800 else: 

801 uri = os.path.basename(gwfile.src_uri) 

802 else: # Using push transfer 

803 uri = os.path.basename(gwfile.src_uri) 

804 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

805 return arguments 

806 

807 

808def _replace_cmd_vars(arguments, gwjob): 

809 """Replace format-style placeholders in arguments. 

810 

811 Parameters 

812 ---------- 

813 arguments : `str` 

814 Arguments string in which to replace placeholders. 

815 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

816 Job containing values to be used to replace placeholders 

817 (in particular gwjob.cmdvals). 

818 

819 Returns 

820 ------- 

821 arguments : `str` 

822 Given arguments string with placeholders replaced. 

823 """ 

824 try: 

825 arguments = arguments.format(**gwjob.cmdvals) 

826 except (KeyError, TypeError): # TypeError in case None instead of {} 

827 _LOG.error("Could not replace command variables:\n" 

828 "arguments: %s\n" 

829 "cmdvals: %s", arguments, gwjob.cmdvals) 

830 raise 

831 return arguments 

832 

833 

834def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str): 

835 """Add job input files from generic workflow to job. 

836 

837 Parameters 

838 ---------- 

839 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

840 The generic workflow (e.g., has executable name and arguments). 

841 job_name : `str` 

842 Unique name for the job. 

843 use_shared : `bool` 

844 Whether job has access to files via shared filesystem. 

845 out_prefix : `str` 

846 The root directory into which all WMS-specific files are written. 

847 

848 Returns 

849 ------- 

850 htc_commands : `dict` [`str`, `str`] 

851 HTCondor commands for the job submission script. 

852 """ 

853 htc_commands = {} 

854 inputs = [] 

855 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True): 

856 _LOG.debug("src_uri=%s", gwf_file.src_uri) 

857 

858 uri = Path(gwf_file.src_uri) 

859 

860 # Note if use_shared and job_shared, don't need to transfer file. 

861 

862 if not use_shared: # Copy file using push to job 

863 inputs.append(str(uri.relative_to(out_prefix))) 

864 elif not gwf_file.job_shared: # Jobs require own copy 

865 

866 # If using a shared filesystem but the job still needs its own copy, 

867 # use HTCondor's curl plugin to make a local copy. 

868 

869 # Execution butler is represented as a directory which the 

870 # curl plugin does not handle. Taking advantage of inside 

871 # knowledge as a temporary fix until there is a job wrapper that pulls 

872 # files within the job. 

873 if gwf_file.name == "butlerConfig": 

874 # The execution butler directory doesn't normally exist until 

875 # the submit phase so checking for suffix instead of using 

876 # is_dir(). If another non-YAML file existed, it would have a 

877 # different gwf_file.name. 

878 if uri.suffix == ".yaml": # Single file, so just copy. 

879 inputs.append(f"file://{uri}") 

880 else: 

881 inputs.append(f"file://{uri / 'butler.yaml'}") 

882 inputs.append(f"file://{uri / 'gen3.sqlite3'}") 

883 elif uri.is_dir(): 

884 raise RuntimeError("HTCondor plugin cannot transfer directories locally within job " 

885 f"{gwf_file.src_uri}") 

886 else: 

887 inputs.append(f"file://{uri}") 

888 

889 if inputs: 

890 htc_commands["transfer_input_files"] = ",".join(inputs) 

891 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"]) 

892 return htc_commands 

893 

894 

895def _report_from_path(wms_path): 

896 """Gather run information from a given run directory. 

897 

898 Parameters 

899 ---------- 

900 wms_path : `str` 

901 The directory containing the submit side files (e.g., HTCondor files). 

902 

903 Returns 

904 ------- 

905 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

906 Run information for the detailed report. The key is the HTCondor id 

907 and the value is a collection of report information for that run. 

908 message : `str` 

909 Message to be printed with the summary report. 

910 """ 

911 wms_workflow_id, jobs, message = _get_info_from_path(wms_path) 

912 if wms_workflow_id == MISSING_ID: 

913 run_reports = {} 

914 else: 

915 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs) 

916 return run_reports, message 

917 

918 

919def _report_from_id(wms_workflow_id, hist, schedds=None): 

920 """Gather run information using workflow id. 

921 

922 Parameters 

923 ---------- 

924 wms_workflow_id : `str` 

925 Limit to specific run based on id. 

926 hist : `float` 

927 Limit history search to this many days. 

928 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional 

929 HTCondor schedulers which to query for job information. If None 

930 (default), all queries will be run against the local scheduler only. 

931 

932 Returns 

933 ------- 

934 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

935 Run information for the detailed report. The key is the HTCondor id 

936 and the value is a collection of report information for that run. 

937 message : `str` 

938 Message to be printed with the summary report. 

939 """ 

940 dag_constraint = 'regexp("dagman$", Cmd)' 

941 try: 

942 cluster_id = int(float(wms_workflow_id)) 

943 except ValueError: 

944 dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"' 

945 else: 

946 dag_constraint += f" && ClusterId == {cluster_id}" 

947 

948 # With the current implementation of the condor_* functions the query will 

949 # always return only one match per Scheduler. 

950 # 

951 # Even in the highly unlikely situation where HTCondor history (which 

952 # condor_search queries too) is long enough to have jobs from before the 

953 # cluster ids were rolled over (and as a result there is more than one job 

954 # with the same cluster id) they will not show up in the results. 

955 schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds) 

956 if len(schedd_dag_info) == 0: 

957 run_reports = {} 

958 message = "" 

959 elif len(schedd_dag_info) == 1: 

960 _, dag_info = schedd_dag_info.popitem() 

961 dag_id, dag_ad = dag_info.popitem() 

962 

963 # Create a mapping between jobs and their classads. The keys will be 

964 # of format 'ClusterId.ProcId'. 

965 job_info = {dag_id: dag_ad} 

966 

967 # Find jobs (nodes) belonging to that DAGMan job. 

968 job_constraint = f"DAGManJobId == {int(float(dag_id))}" 

969 schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds) 

970 if schedd_job_info: 

971 _, node_info = schedd_job_info.popitem() 

972 job_info.update(node_info) 

973 

974 # Collect additional pieces of information about jobs using HTCondor 

975 # files in the submission directory. 

976 _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"]) 

977 _update_jobs(job_info, path_jobs) 

978 

979 run_reports = _create_detailed_report_from_jobs(dag_id, job_info) 

980 else: 

981 ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()] 

982 run_reports = {} 

983 message = f"More than one job matches id '{wms_workflow_id}', " \ 

984 f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids" 

985 return run_reports, message 

986 

987 

988def _get_info_from_path(wms_path): 

989 """Gather run information from a given run directory. 

990 

991 Parameters 

992 ---------- 

993 wms_path : `str` 

994 Directory containing HTCondor files. 

995 

996 Returns 

997 ------- 

998 wms_workflow_id : `str` 

999 The run id which is a DAGman job id. 

1000 jobs : `dict` [`str`, `dict` [`str`, `Any`]] 

1001 Information about jobs read from files in the given directory. 

1002 The key is the HTCondor id and the value is a dictionary of HTCondor 

1003 keys and values. 

1004 message : `str` 

1005 Message to be printed with the summary report. 

1006 """ 

1007 messages = [] 

1008 try: 

1009 wms_workflow_id, jobs = read_dag_log(wms_path) 

1010 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs) 

1011 _update_jobs(jobs, read_node_status(wms_path)) 

1012 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs) 

1013 

1014 # Add more info for DAGman job 

1015 job = jobs[wms_workflow_id] 

1016 job.update(read_dag_status(wms_path)) 

1017 

1018 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs) 

1019 if "bps_run" not in job: 

1020 _add_run_info(wms_path, job) 

1021 

1022 message = htc_check_dagman_output(wms_path) 

1023 if message: 

1024 messages.append(message) 

1025 _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id, 

1026 jobs[wms_workflow_id]["total_jobs"]) 

1027 

1028 # Add extra pieces of information which cannot be found in HTCondor 

1029 # generated files like 'GlobalJobId'. 

1030 # 

1031 # Do not treat absence of this file as a serious error. Neither runs 

1032 # submitted with earlier versions of the plugin nor the runs submitted 

1033 # with Pegasus plugin will have it at the moment. However, once enough 

1034 # time passes and the Pegasus plugin has its own report() method 

1035 # (instead of sneakily using HTCondor's), the lack of that file 

1036 # should be treated as seriously as the lack of any other file. 

1037 try: 

1038 job_info = read_dag_info(wms_path) 

1039 except FileNotFoundError as exc: 

1040 message = f"Warn: Some information may not be available: {exc}" 

1041 messages.append(message) 

1042 else: 

1043 schedd_name = next(iter(job_info)) 

1044 job_ad = next(iter(job_info[schedd_name].values())) 

1045 job.update(job_ad) 

1046 except FileNotFoundError: 

1047 message = f"Could not find HTCondor files in '{wms_path}'" 

1048 _LOG.warning(message) 

1049 messages.append(message) 

1050 wms_workflow_id = MISSING_ID 

1051 jobs = {} 

1052 

1053 message = '\n'.join([msg for msg in messages if msg]) 

1054 return wms_workflow_id, jobs, message 

1055 

1056 

1057def _create_detailed_report_from_jobs(wms_workflow_id, jobs): 

1058 """Gather run information to be used in generating summary reports. 

1059 

1060 Parameters 

1061 ---------- 

1062 wms_workflow_id : `str` 

1063 The run id to create the report for. 

1064 jobs : `dict` [`str`, `dict` [`str`, Any]] 

1065 Mapping HTCondor job id to job information. 

1066 

1067 Returns 

1068 ------- 

1069 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1070 Run information for the detailed report. The key is the given HTCondor 

1071 id and the value is a collection of report information for that run. 

1072 """ 

1073 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id]) 

1074 dag_job = jobs[wms_workflow_id] 

1075 report = WmsRunReport(wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}", 

1076 global_wms_id=dag_job.get("GlobalJobId", "MISS"), 

1077 path=dag_job["Iwd"], 

1078 label=dag_job.get("bps_job_label", "MISS"), 

1079 run=dag_job.get("bps_run", "MISS"), 

1080 project=dag_job.get("bps_project", "MISS"), 

1081 campaign=dag_job.get("bps_campaign", "MISS"), 

1082 payload=dag_job.get("bps_payload", "MISS"), 

1083 operator=_get_owner(dag_job), 

1084 run_summary=_get_run_summary(dag_job), 

1085 state=_htc_status_to_wms_state(dag_job), 

1086 jobs=[], 

1087 total_number_jobs=dag_job["total_jobs"], 

1088 job_state_counts=dag_job["state_counts"]) 

1089 

1090 for job_id, job_info in jobs.items(): 

1091 try: 

1092 if job_info["ClusterId"] != int(float(wms_workflow_id)): 

1093 job_report = WmsJobReport(wms_id=job_id, 

1094 name=job_info.get("DAGNodeName", job_id), 

1095 label=job_info.get("bps_job_label", 

1096 pegasus_name_to_label(job_info["DAGNodeName"])), 

1097 state=_htc_status_to_wms_state(job_info)) 

1098 if job_report.label == "init": 

1099 job_report.label = "pipetaskInit" 

1100 report.jobs.append(job_report) 

1101 except KeyError as ex: 

1102 _LOG.error("Job missing key '%s': %s", str(ex), job_info) 

1103 raise 

1104 

1105 run_reports = {report.wms_id: report} 

1106 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports) 

1107 return run_reports 

1108 

1109 

1110def _summary_report(user, hist, pass_thru, schedds=None): 

1111 """Gather run information to be used in generating summary reports. 

1112 

1113 Parameters 

1114 ---------- 

1115 user : `str` 

1116 Run lookup restricted to given user. 

1117 hist : `float` 

1118 How many previous days to search for run information. 

1119 pass_thru : `str` 

1120 Advanced users can define the HTCondor constraint to be used 

1121 when searching queue and history. 

    schedds : `dict` [ `str`, `htcondor.Schedd` ], optional 

        HTCondor schedulers which to query for job information. If None 

        (default), all queries will be run against the local scheduler only. 

1122 

1123 Returns 

1124 ------- 

1125 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1126 Run information for the summary report. The keys are HTCondor ids and 

1127 the values are collections of report information for each run. 

1128 message : `str` 

1129 Message to be printed with the summary report. 

1130 """ 

1131 # Only doing a summary report, so only look for DAGMan jobs. 

1132 if pass_thru: 

1133 constraint = pass_thru 

1134 else: 

1135 # Notes: 

1136 # * bps_isjob == 'True' isn't getting set for DAG jobs that are 

1137 # manually restarted. 

1138 # * Any job with a DAGManJobId attribute isn't a top-level DAG job. 

1139 constraint = 'bps_isjob == "True" && JobUniverse == 7' 

1140 if user: 

1141 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")' 

1142 

1143 job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds) 

1144 

1145 # Have list of DAGMan jobs, need to get run_report info. 

1146 run_reports = {} 

1147 for jobs in job_info.values(): 

1148 for job_id, job in jobs.items(): 

1149 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1150 # If the info was not available from the queue (e.g., Kerberos bug), 

1151 # try reading it from a file. 

1152 if total_jobs == 0: 

1153 try: 

1154 job.update(read_dag_status(job["Iwd"])) 

1155 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1156 except StopIteration: 

1157 pass  # Don't fail the report if the HTCondor files cannot be found. 

1158 

1159 if "bps_run" not in job: 

1160 _add_run_info(job["Iwd"], job) 

1161 report = WmsRunReport(wms_id=job_id, 

1162 global_wms_id=job["GlobalJobId"], 

1163 path=job["Iwd"], 

1164 label=job.get("bps_job_label", "MISS"), 

1165 run=job.get("bps_run", "MISS"), 

1166 project=job.get("bps_project", "MISS"), 

1167 campaign=job.get("bps_campaign", "MISS"), 

1168 payload=job.get("bps_payload", "MISS"), 

1169 operator=_get_owner(job), 

1170 run_summary=_get_run_summary(job), 

1171 state=_htc_status_to_wms_state(job), 

1172 jobs=[], 

1173 total_number_jobs=total_jobs, 

1174 job_state_counts=state_counts) 

1175 run_reports[report.global_wms_id] = report 

1176 

1177 return run_reports, "" 

1178 

1179 

1180def _add_run_info(wms_path, job): 

1181 """Find BPS run information elsewhere for runs without bps attributes. 

1182 

1183 Parameters 

1184 ---------- 

1185 wms_path : `str` 

1186 Path to submit files for the run. 

1187 job : `dict` [`str`, `Any`] 

1188 HTCondor dag job information. 

1189 

1190 Raises 

1191 ------ 

1192 StopIteration 

1193 If the file it is looking for cannot be found. Permission errors 

1194 are caught and the job's run is marked with an error. 

1195 """ 

1196 path = Path(wms_path) / "jobs" 

1197 try: 

1198 subfile = next(path.glob("**/*.sub")) 

1199 except (StopIteration, PermissionError): 

1200 job["bps_run"] = "Unavailable" 

1201 else: 

1202 _LOG.debug("_add_run_info: subfile = %s", subfile) 

1203 try: 

1204 with open(subfile, "r", encoding='utf-8') as fh: 

1205 for line in fh: 

1206 if line.startswith("+bps_"): 

1207 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line) 

1208 if m: 

1209 _LOG.debug("Matching line: %s", line) 

1210 job[m.group(1)] = m.group(2).replace('"', "") 

1211 else: 

1212 _LOG.debug("Could not parse attribute: %s", line) 

1213 except PermissionError: 

1214 job["bps_run"] = "PermissionError" 

1215 _LOG.debug("After adding job = %s", job) 

1216 

1217 

1218def _get_owner(job): 

1219 """Get the owner of a dag job. 

1220 

1221 Parameters 

1222 ---------- 

1223 job : `dict` [`str`, `Any`] 

1224 HTCondor dag job information. 

1225 

1226 Returns 

1227 ------- 

1228 owner : `str` 

1229 Owner of the dag job. 

1230 """ 

1231 owner = job.get("bps_operator", None) 

1232 if not owner: 

1233 owner = job.get("Owner", None) 

1234 if not owner: 

1235 _LOG.warning("Could not get Owner from htcondor job: %s", job) 

1236 owner = "MISS" 

1237 return owner 

1238 

1239 

1240def _get_run_summary(job): 

1241 """Get the run summary for a job. 

1242 

1243 Parameters 

1244 ---------- 

1245 job : `dict` [`str`, `Any`] 

1246 HTCondor dag job information. 

1247 

1248 Returns 

1249 ------- 

1250 summary : `str` 

1251 Number of jobs per PipelineTask label in approximate pipeline order. 

1252 Format: <label>:<count>[;<label>:<count>]+ 
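        For example, "pipetaskInit:1;label1:10;label2:8" (labels here are
        illustrative).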

1253 """ 

1254 summary = job.get("bps_job_summary", job.get("bps_run_summary", None)) 

1255 if not summary: 

1256 summary, _ = summary_from_dag(job["Iwd"]) 

1257 if not summary: 

1258 _LOG.warning("Could not get run summary for htcondor job: %s", job) 

1259 _LOG.debug("_get_run_summary: summary=%s", summary) 

1260 

1261 # Workaround for sometimes using init vs pipetaskInit. 

1262 summary = summary.replace("init:", "pipetaskInit:") 

1263 

1264 if "pegasus_version" in job and "pegasus" not in summary: 

1265 summary += ";pegasus:0" 

1266 

1267 return summary 

1268 

1269 

1270def _get_state_counts_from_jobs(wms_workflow_id, jobs): 

1271 """Count number of jobs per WMS state. 

1272 

1273 Parameters 

1274 ---------- 

1275 wms_workflow_id : `str` 

1276 HTCondor job id. 

1277 jobs : `dict` [`str`, `Any`] 

1278 HTCondor dag job information. 

1279 

1280 Returns 

1281 ------- 

1282 total_count : `int` 

1283 Total number of dag nodes. 

1284 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1285 Keys are the different WMS states and values are counts of jobs 

1286 that are in that WMS state. 

1287 """ 

1288 state_counts = dict.fromkeys(WmsStates, 0) 

1289 

1290 for jid, jinfo in jobs.items(): 

1291 if jid != wms_workflow_id: 

1292 state_counts[_htc_status_to_wms_state(jinfo)] += 1 

1293 

1294 total_counted = sum(state_counts.values()) 

1295 if "NodesTotal" in jobs[wms_workflow_id]: 

1296 total_count = jobs[wms_workflow_id]["NodesTotal"] 

1297 else: 

1298 total_count = total_counted 

1299 

1300 state_counts[WmsStates.UNREADY] += total_count - total_counted 

1301 

1302 return total_count, state_counts 

1303 

1304 

1305def _get_state_counts_from_dag_job(job): 

1306 """Count number of jobs per WMS state. 

1307 

1308 Parameters 

1309 ---------- 

1310 job : `dict` [`str`, `Any`] 

1311 HTCondor dag job information. 

1312 

1313 Returns 

1314 ------- 

1315 total_count : `int` 

1316 Total number of dag nodes. 

1317 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1318 Keys are the different WMS states and values are counts of jobs 

1319 that are in that WMS state. 

1320 """ 

1321 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job)) 

1322 state_counts = dict.fromkeys(WmsStates, 0) 

1323 if "DAG_NodesReady" in job: 

1324 state_counts = { 

1325 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0), 

1326 WmsStates.READY: job.get("DAG_NodesReady", 0), 

1327 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1328 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0), 

1329 WmsStates.FAILED: job.get("DAG_NodesFailed", 0), 

1330 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)} 

1331 total_jobs = job.get("DAG_NodesTotal") 

1332 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs) 

1333 elif "NodesFailed" in job: 

1334 state_counts = { 

1335 WmsStates.UNREADY: job.get("NodesUnready", 0), 

1336 WmsStates.READY: job.get("NodesReady", 0), 

1337 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1338 WmsStates.SUCCEEDED: job.get("NodesDone", 0), 

1339 WmsStates.FAILED: job.get("NodesFailed", 0), 

1340 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)} 

1341 try: 

1342 total_jobs = job["NodesTotal"] 

1343 except KeyError as ex: 

1344 _LOG.error("Job missing %s. job = %s", str(ex), job) 

1345 raise 

1346 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs) 

1347 else: 

1348 # With Kerberos job auth and the Kerberos bug, a warning would be 

1349 # printed for every DAG. 

1350 _LOG.debug("Can't get job state counts %s", job["Iwd"]) 

1351 total_jobs = 0 

1352 

1353 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts) 

1354 return total_jobs, state_counts 

1355 

1356 

1357def _htc_status_to_wms_state(job): 

1358 """Convert HTCondor job status to generic wms state. 

1359 

1360 Parameters 

1361 ---------- 

1362 job : `dict` [`str`, `Any`] 

1363 HTCondor job information. 

1364 

1365 Returns 

1366 ------- 

1367 wms_state : `WmsStates` 

1368 The equivalent WmsState to given job's status. 

1369 """ 

1370 wms_state = WmsStates.MISFIT 

1371 if "JobStatus" in job: 

1372 wms_state = _htc_job_status_to_wms_state(job) 

1373 elif "NodeStatus" in job: 

1374 wms_state = _htc_node_status_to_wms_state(job) 

1375 return wms_state 

1376 

1377 

1378def _htc_job_status_to_wms_state(job): 

1379 """Convert HTCondor job status to generic wms state. 

1380 

1381 Parameters 

1382 ---------- 

1383 job : `dict` [`str`, `Any`] 

1384 HTCondor job information. 

1385 

1386 Returns 

1387 ------- 

1388 wms_state : `lsst.ctrl.bps.WmsStates` 

1389 The equivalent WmsState to given job's status. 

1390 """ 

1391 _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], 

1392 type(job["JobStatus"])) 

1393 job_status = int(job["JobStatus"]) 

1394 wms_state = WmsStates.MISFIT 

1395 

1396 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status) 

1397 if job_status == JobStatus.IDLE: 

1398 wms_state = WmsStates.PENDING 

1399 elif job_status == JobStatus.RUNNING: 

1400 wms_state = WmsStates.RUNNING 

1401 elif job_status == JobStatus.REMOVED: 

1402 wms_state = WmsStates.DELETED 

1403 elif job_status == JobStatus.COMPLETED: 

1404 if job.get("ExitBySignal", False) or job.get("ExitCode", 0) or \ 

1405 job.get("ExitSignal", 0) or job.get("DAG_Status", 0) or \ 

1406 job.get("ReturnValue", 0): 

1407 wms_state = WmsStates.FAILED 

1408 else: 

1409 wms_state = WmsStates.SUCCEEDED 

1410 elif job_status == JobStatus.HELD: 

1411 wms_state = WmsStates.HELD 

1412 

1413 return wms_state 

1414 

1415 

1416def _htc_node_status_to_wms_state(job): 

1417 """Convert HTCondor status to generic wms state. 

1418 

1419 Parameters 

1420 ---------- 

1421 job : `dict` [`str`, `Any`] 

1422 HTCondor job information. 

1423 

1424 Returns 

1425 ------- 

1426 wms_state : `lsst.ctrl.bps.WmsStates` 

1427 The equivalent WmsState to given node's status. 

1428 """ 

1429 wms_state = WmsStates.MISFIT 

1430 

1431 status = job["NodeStatus"] 

1432 if status == NodeStatus.NOT_READY: 

1433 wms_state = WmsStates.UNREADY 

1434 elif status == NodeStatus.READY: 

1435 wms_state = WmsStates.READY 

1436 elif status == NodeStatus.PRERUN: 

1437 wms_state = WmsStates.MISFIT 

1438 elif status == NodeStatus.SUBMITTED: 

1439 if job["JobProcsHeld"]: 

1440 wms_state = WmsStates.HELD 

1441 elif job["StatusDetails"] == "not_idle": 

1442 wms_state = WmsStates.RUNNING 

1443 elif job["JobProcsQueued"]: 

1444 wms_state = WmsStates.PENDING 

1445 elif status == NodeStatus.POSTRUN: 

1446 wms_state = WmsStates.MISFIT 

1447 elif status == NodeStatus.DONE: 

1448 wms_state = WmsStates.SUCCEEDED 

1449 elif status == NodeStatus.ERROR: 

1450 # Use the job exit status instead of the post script exit status. 

1451 if "DAGMAN error 0" in job["StatusDetails"]: 

1452 wms_state = WmsStates.SUCCEEDED 

1453 else: 

1454 wms_state = WmsStates.FAILED 

1455 

1456 return wms_state 

1457 

1458 

1459def _update_jobs(jobs1, jobs2): 

1460 """Update jobs1 with info in jobs2. 

1461 

1462 (Basically an update for nested dictionaries.) 

1463 

1464 Parameters 

1465 ---------- 

1466 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]] 

1467 HTCondor job information to be updated. 

1468 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]] 

1469 Additional HTCondor job information. 

1470 """ 

1471 for jid, jinfo in jobs2.items(): 

1472 if jid in jobs1: 

1473 jobs1[jid].update(jinfo) 

1474 else: 

1475 jobs1[jid] = jinfo 

1476 

1477 

1478def _wms_id_type(wms_id): 

1479 """Determine the type of the WMS id. 

1480 

1481 Parameters 

1482 ---------- 

1483 wms_id : `str` 

1484 WMS id identifying a job. 

1485 

1486 Returns 

1487 ------- 

1488 id_type : `lsst.ctrl.bps.wms.htcondor.WmsIdType` 

1489 Type of WMS id. 

1490 """ 

1491 try: 

1492 int(float(wms_id)) 

1493 except ValueError: 

1494 wms_path = Path(wms_id) 

1495 if wms_path.exists(): 

1496 id_type = WmsIdType.PATH 

1497 else: 

1498 id_type = WmsIdType.GLOBAL 

1499 except TypeError: 

1500 id_type = WmsIdType.UNKNOWN 

1501 else: 

1502 id_type = WmsIdType.LOCAL 

1503 return id_type 

1504 

1505 

1506def _wms_id_to_cluster(wms_id): 

1507 """Convert WMS id to cluster id. 

1508 

1509 Parameters 

1510 ---------- 

1511 wms_id : `int` or `float` or `str` 

1512 HTCondor job id or path. 

1513 

1514 Returns 

1515 ------- 

1516 schedd_ad : `classad.ClassAd` 

1517 ClassAd describing the scheduler managing the job with the given id. 

1518 cluster_id : `int` 

1519 HTCondor cluster id. 

1520 id_type : `lsst.ctrl.bps.wms.htcondor.WmsIdType` 

1521 The type of the provided id. 

1522 """ 

1523 coll = htcondor.Collector() 

1524 

1525 schedd_ad = None 

1526 cluster_id = None 

1527 id_type = _wms_id_type(wms_id) 

1528 if id_type == WmsIdType.LOCAL: 

1529 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1530 cluster_id = int(float(wms_id)) 

1531 elif id_type == WmsIdType.GLOBAL: 

1532 constraint = f'GlobalJobId == "{wms_id}"' 

1533 schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)} 

1534 schedds = [htcondor.Schedd(ad) for ad in schedd_ads.values()] 

1535 queries = [schedd.xquery(requirements=constraint, projection=["ClusterId"]) for schedd in schedds] 

1536 results = {query.tag(): dict(ads[0]) for query in htcondor.poll(queries) 

1537 if (ads := query.nextAdsNonBlocking())} 

1538 if results: 

1539 schedd_name = next(iter(results)) 

1540 schedd_ad = schedd_ads[schedd_name] 

1541 cluster_id = results[schedd_name]["ClusterId"] 

1542 elif id_type == WmsIdType.PATH: 

1543 try: 

1544 job_info = read_dag_info(wms_id) 

1545 except (FileNotFoundError, PermissionError, IOError): 

1546 pass 

1547 else: 

1548 schedd_name = next(iter(job_info)) 

1549 job_id = next(iter(job_info[schedd_name])) 

1550 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name) 

1551 cluster_id = int(float(job_id)) 

1552 else: 

1553 pass 

1554 return schedd_ad, cluster_id, id_type 
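
# Minimal usage sketch (hypothetical id, not part of the source file): resolve
# a local ClusterId to the Schedd managing it. Requires a working HTCondor
# configuration so the Collector can be contacted.
#
#     schedd_ad, cluster_id, id_type = _wms_id_to_cluster("1234")
#     # schedd_ad  -> ClassAd of the local Schedd
#     # cluster_id -> 1234
#     # id_type    -> WmsIdType.LOCAL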

1555 

1556 

1557def _create_periodic_release_expr(memory, multiplier, limit): 

1558 """Construct an HTCondor ClassAd expression for releasing held jobs. 

1559 

1560 The expression instructs HTCondor to release back to the job queue any job 

1561 that was put on hold due to exceeding its memory requirements, provided 

1562 the job satisfies all of the conditions below: 

1563 

1564 * the number of run attempts has not reached the allowable number of retries, 

1565 * the memory requirements in the last failed run attempt did not reach 

1566 the specified memory limit. 

1567 

1568 Parameters 

1569 ---------- 

1570 memory : `int` 

1571 Requested memory in MB. 

1572 multiplier : `float` 

1573 Memory growth rate between retries. 

1574 limit : `int` 

1575 Memory limit in MB. 

1576 

1577 Returns 

1578 ------- 

1579 expr : `str` 

1580 A string representing an HTCondor ClassAd expression for releasing jobs 

1581 which have been held due to exceeding the memory requirements. 

1582 """ 

1583 is_retry_allowed = "NumJobStarts <= JobMaxRetries" 

1584 was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}" 

1585 

1586 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1587 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1588 # The special comparison operators ensure that all comparisons below will 

1589 # evaluate to FALSE in this case. 

1590 # 

1591 # Note: 

1592 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1593 # the entire expression should evaluate to FALSE when the job is not HELD. 

1594 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1595 # but better safe than sorry. 

1596 was_mem_exceeded = "JobStatus == 5 " \ 

1597 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " \ 

1598 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1599 

1600 expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}" 

1601 return expr 
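
# Illustrative result (hypothetical values, not part of the source file): for
# memory=2048, multiplier=2.0 and limit=8192 the function returns the string
#
#     JobStatus == 5
#     && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#         || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#     && NumJobStarts <= JobMaxRetries
#     && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) < 8192
#
# (wrapped here for readability; the actual value is a single line).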

1602 

1603 

1604def _create_periodic_remove_expr(memory, multiplier, limit): 

1605 """Construct an HTCondor ClassAd expression for removing jobs from the queue. 

1606 

1607 The expression instructs HTCondor to remove from the job queue any job 

1608 that was put on hold due to exceeding its memory requirements, provided 

1609 the job satisfies any of the conditions below: 

1610 

1611 * the allowable number of retries was reached, 

1612 * the memory requirements during the last failed run attempt reached 

1613 the specified memory limit. 

1614 

1615 Parameters 

1616 ---------- 

1617 memory : `int` 

1618 Requested memory in MB. 

1619 multiplier : `float` 

1620 Memory growth rate between retries. 

1621 limit : `int` 

1622 Memory limit in MB. 

1623 

1624 Returns 

1625 ------- 

1626 expr : `str` 

1627 A string representing an HTCondor ClassAd expression for removing jobs 

1628 which were run at the maximal allowable memory and still exceeded 

1629 the memory requirements. 

1630 """ 

1631 is_retry_disallowed = "NumJobStarts > JobMaxRetries" 

1632 was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}" 

1633 

1634 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1635 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1636 # The special comparison operators ensure that all comparisons below will 

1637 # evaluate to FALSE in this case. 

1638 # 

1639 # Note: 

1640 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1641 # the entire expression should evaluate to FALSE when the job is not HELD. 

1642 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1643 # but better safe than sorry. 

1644 was_mem_exceeded = "JobStatus == 5 " \ 

1645 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " \ 

1646 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1647 

1648 expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})" 

1649 return expr 
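
# Illustrative result (hypothetical values, not part of the source file): for
# memory=2048, multiplier=2.0 and limit=8192 the function returns the string
#
#     JobStatus == 5
#     && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#         || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#     && (NumJobStarts > JobMaxRetries
#         || min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) == 8192)
#
# (wrapped here for readability; the actual value is a single line).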

1650 

1651 

1652def _create_request_memory_expr(memory, multiplier, limit): 

1653 """Construct an HTCondor ClassAd expression for safe memory scaling. 

1654 

1655 Parameters 

1656 ---------- 

1657 memory : `int` 

1658 Requested memory in MB. 

1659 multiplier : `float` 

1660 Memory growth rate between retries. 

1661 limit : `int` 

1662 Memory limit in MB. 

1663 

1664 Returns 

1665 ------- 

1666 expr : `str` 

1667 A string representing an HTCondor ClassAd expression enabling safe 

1668 memory scaling between job retries. 

1669 """ 

1670 # The check whether the job was held due to exceeding memory requirements 

1671 # is made *after* the job has been released back to the job queue (i.e. it 

1672 # is in the IDLE state), hence the need to use `Last*` job ClassAds instead 

1673 # of the ones describing the job's current state. 

1674 # 

1675 # Also, 'Last*' job ClassAds attributes are UNDEFINED when a job is 

1676 # initially put in the job queue. The special comparison operators ensure 

1677 # that all comparisons below will evaluate to FALSE in this case. 

1678 was_mem_exceeded = "LastJobStatus =?= 5 " \ 

1679 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " \ 

1680 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)" 

1681 

1682 # If the job is running for the first time or was held for reasons other 

1683 # than exceeding the memory, set the required memory to the requested value 

1684 # or to the memory usage measured by HTCondor (MemoryUsage), whichever 

1685 # is greater. 

1686 expr = f"({was_mem_exceeded}) " \ 

1687 f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) " \ 

1688 f": max({{{memory}, MemoryUsage ?: 0}})" 

1689 return expr 
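
# Illustrative result (hypothetical values, not part of the source file): for
# memory=2048, multiplier=2.0 and limit=8192 the function returns the string
#
#     (LastJobStatus =?= 5
#      && (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0
#          || LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34))
#     ? min({int(2048 * pow(2.0, NumJobStarts)), 8192})
#     : max({2048, MemoryUsage ?: 0})
#
# (wrapped for readability), i.e. double the request on every retry caused by
# running out of memory, capped at the limit, and otherwise use the larger of
# the requested memory and the last measured MemoryUsage.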

1690 

1691 

1692def _locate_schedds(locate_all=False): 

1693 """Locate Scheduler daemons in an HTCondor pool. 

1694 

1695 Parameters 

1696 ---------- 

1697 locate_all : `bool`, optional 

1698 If True, all available Schedulers in the HTCondor pool will be located. 

1699 False by default, which means the search is limited to the Scheduler 

1700 running on the local host. 

1701 

1702 Returns 

1703 ------- 

1704 schedds : `dict` [`str`, `htcondor.Schedd`] 

1705 A mapping between Scheduler names and the Python objects used to 

1706 interact with them. 

1707 """ 

1708 coll = htcondor.Collector() 

1709 

1710 schedd_ads = [] 

1711 if locate_all: 

1712 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

1713 else: 

1714 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

1715 return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 
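
# Minimal usage sketch (not part of the source file); requires a reachable
# HTCondor Collector.
#
#     local_only = _locate_schedds()                 # {"<local schedd name>": Schedd}
#     everything = _locate_schedds(locate_all=True)  # all Schedds in the pool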

1716 

1717 

1718def _gather_site_values(config, compute_site): 

1719 """Gather values specific to the given site. 

1720 

1721 Parameters 

1722 ---------- 

1723 config : `lsst.ctrl.bps.BpsConfig` 

1724 BPS configuration that includes necessary submit/runtime 

1725 information. 

1726 compute_site : `str` 

1727 Compute site name. 

1728 

1729 Returns 

1730 ------- 

1731 site_values : `dict` [`str`, `Any`] 

1732 Values specific to the given site. 

1733 """ 

1734 site_values = {"attrs": {}, "profile": {}} 

1735 search_opts = {} 

1736 if compute_site: 

1737 search_opts["curvals"] = {"curr_site": compute_site} 

1738 

1739 # Determine the hard limit for the memory requirement. 

1740 found, limit = config.search("memoryLimit", opt=search_opts) 

1741 if not found: 

1742 search_opts["default"] = DEFAULT_HTC_EXEC_PATT 

1743 _, patt = config.search("executeMachinesPattern", opt=search_opts) 

1744 del search_opts["default"] 

1745 

1746 # To reduce the amount of data, ignore dynamic slots (if any) as, 

1747 # by definition, they cannot have more memory than 

1748 # the partitionable slot they are part of. 

1749 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)' 

1750 pool_info = condor_status(constraint=constraint) 

1751 try: 

1752 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values()) 

1753 except ValueError: 

1754 _LOG.debug("No execute machine in the pool matches %s", patt) 

1755 if limit: 

1756 config[".bps_defined.memory_limit"] = limit 

1757 

1758 _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False}) 

1759 site_values["memoryLimit"] = limit 

1760 

1761 key = f".site.{compute_site}.profile.condor" 

1762 if key in config: 

1763 for key, val in config[key].items(): 

1764 if key.startswith("+"): 

1765 site_values["attrs"][key[1:]] = val 

1766 else: 

1767 site_values["profile"][key] = val 

1768 

1769 return site_values
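
# Illustrative outcome (hypothetical config values, not part of the source
# file): given a BPS config with a site section like
#
#     site:
#       mysite:
#         profile:
#           condor:
#             +JOB_NODE_SET: "astro"
#             requirements: '(ALLOCATED_NODE_SET == "astro")'
#
# calling _gather_site_values(config, "mysite") returns a mapping whose
# "attrs" entry is {"JOB_NODE_SET": "astro"} (the leading "+" marks a job
# attribute) and whose "profile" entry is
# {"requirements": '(ALLOCATED_NODE_SET == "astro")'}; the "memoryLimit" and
# "bpsUseShared" entries are filled from the config searches above.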