Coverage for python/lsst/ctrl/bps/wms/htcondor/htcondor_service.py: 1%


599 statements  

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Interface between generic workflow to HTCondor workflow system. 

23""" 

24 

25__all__ = ["HTCondorService", "HTCondorWorkflow"] 

26 

27 

28import os 

29import re 

30import logging 

31from enum import IntEnum, auto 

32from pathlib import Path 

33from collections import defaultdict 

34 

35import htcondor 

36 

37from lsst.utils.timer import time_this 

38from ... import ( 

39 BaseWmsWorkflow, 

40 BaseWmsService, 

41 GenericWorkflow, 

42 GenericWorkflowJob, 

43 WmsRunReport, 

44 WmsJobReport, 

45 WmsStates 

46) 

47from ...bps_utils import ( 

48 chdir, 

49 create_count_summary 

50) 

51from .lssthtc import ( 

52 HTCDag, 

53 HTCJob, 

54 MISSING_ID, 

55 JobStatus, 

56 NodeStatus, 

57 htc_check_dagman_output, 

58 htc_escape, 

59 htc_submit_dag, 

60 read_dag_info, 

61 read_dag_log, 

62 read_dag_status, 

63 read_node_status, 

64 condor_q, 

65 condor_search, 

66 condor_status, 

67 pegasus_name_to_label, 

68 summary_from_dag, 

69) 

70 

71 

72class WmsIdType(IntEnum): 

73 """Type of valid WMS ids. 

74 """ 

75 

76 UNKNOWN = auto() 

77 """The type of id cannot be determined. 

78 """ 

79 

80 LOCAL = auto() 

81 """The id is HTCondor job's ClusterId (with optional '.ProcId'). 

82 """ 

83 

84 GLOBAL = auto() 

85 """Id is a HTCondor's global job id. 

86 """ 

87 

88 PATH = auto() 

89 """Id is a submission path. 

90 """ 

91 

92 

93DEFAULT_HTC_EXEC_PATT = ".*worker.*" 

94"""Default pattern for searching execute machines in an HTCondor pool. 

95""" 

96 

97_LOG = logging.getLogger(__name__) 

98 

99 

100class HTCondorService(BaseWmsService): 

101 """HTCondor version of WMS service. 

102 """ 

103 def prepare(self, config, generic_workflow, out_prefix=None): 

104 """Convert generic workflow to an HTCondor DAG ready for submission. 

105 

106 Parameters 

107 ---------- 

108 config : `lsst.ctrl.bps.BpsConfig` 

109 BPS configuration that includes necessary submit/runtime 

110 information. 

111 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

112 The generic workflow (e.g., has executable name and arguments). 

113 out_prefix : `str` 

114 The root directory into which all WMS-specific files are written. 

115 

116 Returns 

117 ------- 

118 workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow` 

119 HTCondor workflow ready to be run. 

120 """ 

121 _LOG.debug("out_prefix = '%s'", out_prefix) 

122 with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed HTCondor workflow creation"): 

123 workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix, 

124 f"{self.__class__.__module__}." 

125 f"{self.__class__.__name__}") 

126 

127 with time_this(log=_LOG, level=logging.INFO, prefix=None, 

128 msg="Completed writing out HTCondor workflow"): 

129 workflow.write(out_prefix) 

130 return workflow 

131 

132 def submit(self, workflow): 

133 """Submit a single HTCondor workflow. 

134 

135 Parameters 

136 ---------- 

137 workflow : `lsst.ctrl.bps.BaseWmsWorkflow` 

138 A single HTCondor workflow to submit. run_id is updated after 

139 successful submission to WMS. 

140 """ 

141 # For workflow portability, internal paths are all relative. Hence 

142 # the DAG needs to be submitted to HTCondor from inside the submit 

143 # directory. 

144 with chdir(workflow.submit_path): 

145 _LOG.info("Submitting from directory: %s", os.getcwd()) 

146 htc_submit_dag(workflow.dag, {}) 

147 workflow.run_id = workflow.dag.run_id 

148 

149 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False): 

150 """Query WMS for list of submitted WMS workflows/jobs. 

151 

152 This should be a quick lookup function to create list of jobs for 

153 other functions. 

154 

155 Parameters 

156 ---------- 

157 wms_id : `int` or `str`, optional 

158 Id or path that can be used by WMS service to look up job. 

159 user : `str`, optional 

160 User whose submitted jobs should be listed. 

161 require_bps : `bool`, optional 

162 Whether to require jobs returned in list to be bps-submitted jobs. 

163 pass_thru : `str`, optional 

164 Information to pass through to WMS. 

165 is_global : `bool`, optional 

166 If set, all job queues (and their histories) will be queried for 

167 job information. Defaults to False which means that only the local 

168 job queue will be queried. 

169 

170 Returns 

171 ------- 

172 job_ids : `list` [`Any`] 

173 Only job ids to be used by cancel and other functions. Typically 

174 this means top-level jobs (i.e., not child jobs). 

175 """ 

176 _LOG.debug("list_submitted_jobs params: " 

177 "wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s", 

178 wms_id, user, require_bps, pass_thru, is_global) 

179 

180 # Determine which Schedds will be queried for job information. 

181 coll = htcondor.Collector() 

182 

183 schedd_ads = [] 

184 if is_global: 

185 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

186 else: 

187 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

188 

189 # Construct appropriate constraint expression using provided arguments. 

190 constraint = "False" 

191 if wms_id is None: 

192 if user is not None: 

193 constraint = f'(Owner == "{user}")' 

194 else: 

195 schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id) 

196 if cluster_id is not None: 

197 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})" 

198 

199 # If provided id is either a submission path or a global id, 

200 # make sure the right Schedd will be queried regardless of 

201 # 'is_global' value. 

202 if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}: 

203 schedd_ads = [schedd_ad] 

204 if require_bps: 

205 constraint += ' && (bps_isjob == "True")' 

206 if pass_thru: 

207 if "-forcex" in pass_thru: 

208 pass_thru_2 = pass_thru.replace("-forcex", "") 

209 if pass_thru_2 and not pass_thru_2.isspace(): 

210 constraint += f" && ({pass_thru_2})" 

211 else: 

212 constraint += f" && ({pass_thru})" 

213 

214 # Create a list of scheduler daemons which need to be queried. 

215 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

216 

217 _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds)) 

218 results = condor_q(constraint=constraint, schedds=schedds) 

219 

220 # Prune child jobs where DAG job is in queue (i.e., aren't orphans). 

221 job_ids = [] 

222 for schedd_name, job_info in results.items(): 

223 for job_id, job_ad in job_info.items(): 

224 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None")) 

225 if "DAGManJobId" not in job_ad: 

226 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

227 else: 

228 _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0") 

229 _LOG.debug("\tin jobs.keys() = %s", job_info.keys()) 

230 if f"{job_ad['DAGManJobId']}.0" not in job_info: # orphaned job 

231 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

232 

233 _LOG.debug("job_ids = %s", job_ids) 

234 return job_ids 

235 

236 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False): 

237 """Return run information based upon given constraints. 

238 

239 Parameters 

240 ---------- 

241 wms_workflow_id : `str`, optional 

242 Limit to specific run based on id. 

243 user : `str`, optional 

244 Limit results to runs for this user. 

245 hist : `float`, optional 

246 Limit history search to this many days. Defaults to 0. 

247 pass_thru : `str`, optional 

248 Constraints to pass through to HTCondor. 

249 is_global : `bool`, optional 

250 If set, all job queues (and their histories) will be queried for 

251 job information. Defaults to False which means that only the local 

252 job queue will be queried. 

253 

254 Returns 

255 ------- 

256 runs : `list` [`lsst.ctrl.bps.WmsRunReport`] 

257 Information about runs from given job information. 

258 message : `str` 

259 Extra message for report command to print. This could be pointers 

260 to documentation or to WMS specific commands. 

261 """ 

262 if wms_workflow_id: 

263 id_type = _wms_id_type(wms_workflow_id) 

264 if id_type == WmsIdType.LOCAL: 

265 schedulers = _locate_schedds(locate_all=is_global) 

266 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

267 elif id_type == WmsIdType.GLOBAL: 

268 schedulers = _locate_schedds(locate_all=True) 

269 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

270 elif id_type == WmsIdType.PATH: 

271 run_reports, message = _report_from_path(wms_workflow_id) 

272 else: 

273 run_reports, message = {}, 'Invalid job id' 

274 else: 

275 schedulers = _locate_schedds(locate_all=is_global) 

276 run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers) 

277 _LOG.debug("report: %s, %s", run_reports, message) 

278 

279 return list(run_reports.values()), message 

280 

281 def cancel(self, wms_id, pass_thru=None): 

282 """Cancel submitted workflows/jobs. 

283 

284 Parameters 

285 ---------- 

286 wms_id : `str` 

287 Id or path of job that should be canceled. 

288 pass_thru : `str`, optional 

289 Information to pass through to WMS. 

290 

291 Returns 

292 ------- 

293 deleted : `bool` 

294 Whether the deletion was successful. Currently, if there is any doubt 

295 or any individual job was not deleted, `False` is returned. 

296 message : `str` 

297 Any message from WMS (e.g., error details). 

298 """ 

299 _LOG.debug("Canceling wms_id = %s", wms_id) 

300 

301 schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id) 

302 

303 if cluster_id is None: 

304 deleted = False 

305 message = "invalid id" 

306 else: 

307 _LOG.debug("Canceling job managed by schedd_name = %s with cluster_id = %s", 

308 cluster_id, schedd_ad["Name"]) 

309 schedd = htcondor.Schedd(schedd_ad) 

310 

311 constraint = f"ClusterId == {cluster_id}" 

312 if pass_thru is not None and "-forcex" in pass_thru: 

313 pass_thru_2 = pass_thru.replace("-forcex", "") 

314 if pass_thru_2 and not pass_thru_2.isspace(): 

315 constraint += f"&& ({pass_thru_2})" 

316 _LOG.debug("JobAction.RemoveX constraint = %s", constraint) 

317 results = schedd.act(htcondor.JobAction.RemoveX, constraint) 

318 else: 

319 if pass_thru: 

320 constraint += f"&& ({pass_thru})" 

321 _LOG.debug("JobAction.Remove constraint = %s", constraint) 

322 results = schedd.act(htcondor.JobAction.Remove, constraint) 

323 _LOG.debug("Remove results: %s", results) 

324 

325 if results["TotalSuccess"] > 0 and results["TotalError"] == 0: 

326 deleted = True 

327 message = "" 

328 else: 

329 deleted = False 

330 if results["TotalSuccess"] == 0 and results["TotalError"] == 0: 

331 message = "no such bps job in batch queue" 

332 else: 

333 message = f"unknown problems deleting: {results}" 

334 

335 _LOG.debug("deleted: %s; message = %s", deleted, message) 

336 return deleted, message 

337 

338 

339class HTCondorWorkflow(BaseWmsWorkflow): 

340 """Single HTCondor workflow. 

341 

342 Parameters 

343 ---------- 

344 name : `str` 

345 Unique name for Workflow used when naming files. 

346 config : `lsst.ctrl.bps.BpsConfig` 

347 BPS configuration that includes necessary submit/runtime information. 

348 """ 

349 def __init__(self, name, config=None): 

350 super().__init__(name, config) 

351 self.dag = None 

352 

353 @classmethod 

354 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class): 

355 # Docstring inherited 

356 htc_workflow = cls(generic_workflow.name, config) 

357 htc_workflow.dag = HTCDag(name=generic_workflow.name) 

358 

359 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs) 

360 htc_workflow.dag.add_attribs(generic_workflow.run_attrs) 

361 htc_workflow.dag.add_attribs({"bps_wms_service": service_class, 

362 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}", 

363 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts), 

364 "bps_job_summary": create_count_summary(generic_workflow.job_counts)}) 

365 

366 _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""}) 

367 if isinstance(tmp_template, str): 

368 subdir_template = defaultdict(lambda: tmp_template) 

369 else: 

370 subdir_template = tmp_template 

371 

372 # Create all DAG jobs 

373 site_values = {} # cache compute site specific values to reduce config lookups 

374 for job_name in generic_workflow: 

375 gwjob = generic_workflow.get_job(job_name) 

376 if gwjob.compute_site not in site_values: 

377 site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site) 

378 htc_job = _create_job(subdir_template[gwjob.label], site_values[gwjob.compute_site], 

379 generic_workflow, gwjob, out_prefix) 

380 htc_workflow.dag.add_job(htc_job) 

381 

382 # Add job dependencies to the DAG 

383 for job_name in generic_workflow: 

384 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name)) 

385 

386 # If final job exists in generic workflow, create DAG final job 

387 final = generic_workflow.get_final() 

388 if final and isinstance(final, GenericWorkflowJob): 

389 if final.compute_site and final.compute_site not in site_values: 

390 site_values[final.compute_site] = _gather_site_values(config, final.compute_site) 

391 final_htjob = _create_job(subdir_template[final.label], site_values[final.compute_site], 

392 generic_workflow, final, out_prefix) 

393 if "post" not in final_htjob.dagcmds: 

394 final_htjob.dagcmds["post"] = f"{os.path.dirname(__file__)}/final_post.sh" \ 

395 f" {final.name} $DAG_STATUS $RETURN" 

396 htc_workflow.dag.add_final_job(final_htjob) 

397 elif final and isinstance(final, GenericWorkflow): 

398 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job") 

399 elif final: 

400 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})") 

401 

402 return htc_workflow 

403 

404 def write(self, out_prefix): 

405 """Output HTCondor DAGMan files needed for workflow submission. 

406 

407 Parameters 

408 ---------- 

409 out_prefix : `str` 

410 Directory prefix for HTCondor files. 

411 """ 

412 self.submit_path = out_prefix 

413 os.makedirs(out_prefix, exist_ok=True) 

414 

415 # Write down the workflow in HTCondor format. 

416 self.dag.write(out_prefix, "jobs/{self.label}") 

417 

418 

419def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix): 

420 """Convert GenericWorkflow job nodes to DAG jobs. 

421 

422 Parameters 

423 ---------- 

424 subdir_template : `str` 

425 Template for making subdirs. 

426 site_values : `dict` 

427 Site specific values 

428 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

429 Generic workflow that is being converted. 

430 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

431 The generic job to convert to a HTCondor job. 

432 out_prefix : `str` 

433 Directory prefix for HTCondor files. 

434 

435 Returns 

436 ------- 

437 htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob` 

438 The HTCondor job equivalent to the given generic job. 

439 """ 

440 htc_job = HTCJob(gwjob.name, label=gwjob.label) 

441 

442 curvals = defaultdict(str) 

443 curvals["label"] = gwjob.label 

444 if gwjob.tags: 

445 curvals.update(gwjob.tags) 

446 

447 subdir = subdir_template.format_map(curvals) 

448 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub" 

449 

450 htc_job_cmds = { 

451 "universe": "vanilla", 

452 "should_transfer_files": "YES", 

453 "when_to_transfer_output": "ON_EXIT_OR_EVICT", 

454 "transfer_output_files": '""', # Set to empty string to disable 

455 "transfer_executable": "False", 

456 "getenv": "True", 

457 

458 # Exceeding memory sometimes triggers a SIGBUS error. Tell HTCondor 

459 # to put SIGBUS jobs on hold. 

460 "on_exit_hold": "(ExitBySignal == true) && (ExitSignal == 7)", 

461 "on_exit_hold_reason": '"Job raised a signal 7. Usually means job has gone over memory limit."', 

462 "on_exit_hold_subcode": "34" 

463 } 

464 

465 htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob)) 

466 

467 # job stdout, stderr, htcondor user log. 

468 for key in ("output", "error", "log"): 

469 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}") 

470 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key]) 

471 

472 htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], 

473 out_prefix)) 

474 

475 # Add the job cmds dict to the job object. 

476 htc_job.add_job_cmds(htc_job_cmds) 

477 

478 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob)) 

479 

480 # Add job attributes to job. 

481 _LOG.debug("gwjob.attrs = %s", gwjob.attrs) 

482 htc_job.add_job_attrs(gwjob.attrs) 

483 htc_job.add_job_attrs(site_values["attrs"]) 

484 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)}) 

485 htc_job.add_job_attrs({"bps_job_name": gwjob.name, 

486 "bps_job_label": gwjob.label}) 

487 

488 return htc_job 

489 

490 

491def _translate_job_cmds(cached_vals, generic_workflow, gwjob): 

492 """Translate the job data that are one to one mapping 

493 

494 Parameters 

495 ---------- 

496 cached_vals : `dict` [`str`, `Any`] 

497 Config values common to jobs with same label. 

498 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

499 Generic workflow that contains job to being converted. 

500 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

501 Generic workflow job to be converted. 

502 

503 Returns 

504 ------- 

505 htc_job_commands : `dict` [`str`, `Any`] 

506 Contains commands which can appear in the HTCondor submit description 

507 file. 

508 """ 

509 # Values in the job script that are just name mappings. 

510 job_translation = {"mail_to": "notify_user", 

511 "when_to_mail": "notification", 

512 "request_cpus": "request_cpus", 

513 "priority": "priority", 

514 "category": "category"} 

515 

516 jobcmds = {} 

517 for gwkey, htckey in job_translation.items(): 

518 jobcmds[htckey] = getattr(gwjob, gwkey, None) 

519 

520 # job commands that need modification 

521 if gwjob.number_of_retries: 

522 jobcmds["max_retries"] = f"{gwjob.number_of_retries}" 

523 

524 if gwjob.retry_unless_exit: 

525 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}" 

526 

527 if gwjob.request_disk: 

528 jobcmds["request_disk"] = f"{gwjob.request_disk}MB" 

529 

530 if gwjob.request_memory: 

531 jobcmds["request_memory"] = f"{gwjob.request_memory}" 

532 

533 if gwjob.memory_multiplier: 

534 # Do not use try-except! At the moment, BpsConfig returns an empty 

535 # string if it does not contain the key. 

536 memory_limit = cached_vals["memoryLimit"] 

537 if not memory_limit: 

538 raise RuntimeError("Memory autoscaling enabled, but automatic detection of the memory limit " 

539 "failed; setting it explicitly with 'memoryLimit' or changing worker node " 

540 "search pattern 'executeMachinesPattern' might help.") 

541 

542 # Set maximal amount of memory job can ask for. 

543 # 

544 # The check below assumes that 'memory_limit' was set to a value which 

545 # realistically reflects actual physical limitations of a given compute 

546 # resource. 

547 memory_max = memory_limit 

548 if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit: 

549 memory_max = gwjob.request_memory_max 

550 

551 # Make job ask for more memory each time it failed due to insufficient 

552 # memory requirements. 

553 jobcmds["request_memory"] = \ 

554 _create_request_memory_expr(gwjob.request_memory, gwjob.memory_multiplier, memory_max) 

555 

556 # Periodically release jobs which are being held due to exceeding 

557 # memory. Stop doing that (by removing the job from the HTCondor queue) 

558 # after the maximal number of retries has been reached or the job was 

559 # already run at maximal allowed memory. 

560 jobcmds["periodic_release"] = \ 

561 _create_periodic_release_expr(gwjob.request_memory, gwjob.memory_multiplier, memory_max) 

562 jobcmds["periodic_remove"] = \ 

563 _create_periodic_remove_expr(gwjob.request_memory, gwjob.memory_multiplier, memory_max) 

564 

565 # Assume concurrency_limit implemented using HTCondor concurrency limits. 

566 # May need to move to special site-specific implementation if sites use 

567 # other mechanisms. 

568 if gwjob.concurrency_limit: 

569 jobcmds["concurrency_limit"] = gwjob.concurrency_limit 

570 

571 # Handle command line 

572 if gwjob.executable.transfer_executable: 

573 jobcmds["transfer_executable"] = "True" 

574 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri) 

575 else: 

576 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri) 

577 

578 if gwjob.arguments: 

579 arguments = gwjob.arguments 

580 arguments = _replace_cmd_vars(arguments, gwjob) 

581 arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob) 

582 arguments = _fix_env_var_syntax(arguments) 

583 jobcmds["arguments"] = arguments 

584 

585 # Add extra "pass-thru" job commands 

586 if gwjob.profile: 

587 for key, val in gwjob.profile.items(): 

588 jobcmds[key] = htc_escape(val) 

589 for key, val in cached_vals["profile"]: 

590 jobcmds[key] = htc_escape(val) 

591 

592 return jobcmds 

593 

594 

595def _translate_dag_cmds(gwjob): 

596 """Translate job values into DAGMan commands. 

597 

598 Parameters 

599 ---------- 

600 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

601 Job containing values to be translated. 

602 

603 Returns 

604 ------- 

605 dagcmds : `dict` [`str`, `Any`] 

606 DAGMan commands for the job. 

607 """ 

608 # Values in the dag script that just are name mappings. 

609 dag_translation = {"abort_on_value": "abort_dag_on", 

610 "abort_return_value": "abort_exit"} 

611 

612 dagcmds = {} 

613 for gwkey, htckey in dag_translation.items(): 

614 dagcmds[htckey] = getattr(gwjob, gwkey, None) 

615 

616 # Still to be coded: vars "pre_cmdline", "post_cmdline" 

617 return dagcmds 

618 

619 

620def _fix_env_var_syntax(oldstr): 

621 """Change ENV place holders to HTCondor Env var syntax. 

622 

623 Parameters 

624 ---------- 

625 oldstr : `str` 

626 String in which environment variable syntax is to be fixed. 

627 

628 Returns 

629 ------- 

630 newstr : `str` 

631 Given string with environment variable syntax fixed. 

632 """ 

633 newstr = oldstr 

634 for key in re.findall(r"<ENV:([^>]+)>", oldstr): 

635 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})") 

636 return newstr 

637 

638 
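# Illustrative sketch (not part of the original module): with _fix_env_var_syntax
# above in scope, a made-up command line using <ENV:...> placeholders is rewritten
# to HTCondor's $ENV() syntax, e.g. in a REPL:
#
#     >>> _fix_env_var_syntax("<ENV:HOME>/bin/pipetask run -b <ENV:REPO>/butler.yaml")
#     '$ENV(HOME)/bin/pipetask run -b $ENV(REPO)/butler.yaml'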

639def _replace_file_vars(use_shared, arguments, workflow, gwjob): 

640 """Replace file placeholders in command line arguments with correct 

641 physical file names. 

642 

643 Parameters 

644 ---------- 

645 use_shared : `bool` 

646 Whether HTCondor can assume shared filesystem. 

647 arguments : `str` 

648 Arguments string in which to replace file placeholders. 

649 workflow : `lsst.ctrl.bps.GenericWorkflow` 

650 Generic workflow that contains file information. 

651 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

652 The job corresponding to the arguments. 

653 

654 Returns 

655 ------- 

656 arguments : `str` 

657 Given arguments string with file placeholders replaced. 

658 """ 

659 # Replace input file placeholders with paths. 

660 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False): 

661 if not gwfile.wms_transfer: 

662 # Must assume full URI if in command line and told WMS is not 

663 # responsible for transferring file. 

664 uri = gwfile.src_uri 

665 elif use_shared: 

666 if gwfile.job_shared: 

667 # Have shared filesystems and jobs can share file. 

668 uri = gwfile.src_uri 

669 else: 

670 # Taking advantage of inside knowledge. Not future-proof. 

671 # Temporary fix until there is a job wrapper that pulls files 

672 # within the job. 

673 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml": 

674 uri = "butler.yaml" 

675 else: 

676 uri = os.path.basename(gwfile.src_uri) 

677 else: # Using push transfer 

678 uri = os.path.basename(gwfile.src_uri) 

679 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

680 

681 # Replace output file placeholders with paths. 

682 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False): 

683 if not gwfile.wms_transfer: 

684 # Must assume full URI if in command line and told WMS is not 

685 # responsible for transferring file. 

686 uri = gwfile.src_uri 

687 elif use_shared: 

688 if gwfile.job_shared: 

689 # Have shared filesystems and jobs can share file. 

690 uri = gwfile.src_uri 

691 else: 

692 uri = os.path.basename(gwfile.src_uri) 

693 else: # Using push transfer 

694 uri = os.path.basename(gwfile.src_uri) 

695 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

696 return arguments 

697 

698 

699def _replace_cmd_vars(arguments, gwjob): 

700 """Replace format-style placeholders in arguments. 

701 

702 Parameters 

703 ---------- 

704 arguments : `str` 

705 Arguments string in which to replace placeholders. 

706 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

707 Job containing values to be used to replace placeholders 

708 (in particular gwjob.cmdvals). 

709 

710 Returns 

711 ------- 

712 arguments : `str` 

713 Given arguments string with placeholders replaced. 

714 """ 

715 try: 

716 arguments = arguments.format(**gwjob.cmdvals) 

717 except (KeyError, TypeError): # TypeError in case None instead of {} 

718 _LOG.error("Could not replace command variables:\n" 

719 "arguments: %s\n" 

720 "cmdvals: %s", arguments, gwjob.cmdvals) 

721 raise 

722 return arguments 

723 

724 
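# Illustrative sketch (not part of the original module): _replace_cmd_vars above
# relies on str.format with gwjob.cmdvals. _FakeJob and its cmdvals below are
# hypothetical stand-ins for a GenericWorkflowJob:
#
#     >>> class _FakeJob:
#     ...     cmdvals = {"butlerConfig": "butler.yaml", "qgraphFile": "job1.qgraph"}
#     >>> _replace_cmd_vars("run -b {butlerConfig} -g {qgraphFile}", _FakeJob())
#     'run -b butler.yaml -g job1.qgraph'
#
# A placeholder missing from cmdvals raises KeyError after the error is logged.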

725def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str): 

726 """Add job input files from generic workflow to job. 

727 

728 Parameters 

729 ---------- 

730 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

731 The generic workflow (e.g., has executable name and arguments). 

732 job_name : `str` 

733 Unique name for the job. 

734 use_shared : `bool` 

735 Whether job has access to files via shared filesystem. 

736 out_prefix : `str` 

737 The root directory into which all WMS-specific files are written. 

738 

739 Returns 

740 ------- 

741 htc_commands : `dict` [`str`, `str`] 

742 HTCondor commands for the job submission script. 

743 """ 

744 htc_commands = {} 

745 inputs = [] 

746 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True): 

747 _LOG.debug("src_uri=%s", gwf_file.src_uri) 

748 

749 uri = Path(gwf_file.src_uri) 

750 

751 # Note if use_shared and job_shared, don't need to transfer file. 

752 

753 if not use_shared: # Copy file using push to job 

754 inputs.append(str(uri.relative_to(out_prefix))) 

755 elif not gwf_file.job_shared: # Jobs require own copy 

756 

757 # If using a shared filesystem but still needing a copy in the job, 

758 # use HTCondor's curl plugin for a local copy. 

759 

760 # Execution butler is represented as a directory which the 

761 # curl plugin does not handle. Taking advantage of inside 

762 # knowledge as a temporary fix until there is a job wrapper that 

763 # pulls files within the job. 

764 if gwf_file.name == "butlerConfig": 

765 # The execution butler directory doesn't normally exist until 

766 # the submit phase so checking for suffix instead of using 

767 # is_dir(). If another non-yaml file existed, it would have a 

768 # different gwf_file.name. 

769 if uri.suffix == ".yaml": # Single file, so just copy. 

770 inputs.append(f"file://{uri}") 

771 else: 

772 inputs.append(f"file://{uri / 'butler.yaml'}") 

773 inputs.append(f"file://{uri / 'gen3.sqlite3'}") 

774 elif uri.is_dir(): 

775 raise RuntimeError("HTCondor plugin cannot transfer directories locally within job " 

776 f"{gwf_file.src_uri}") 

777 else: 

778 inputs.append(f"file://{uri}") 

779 

780 if inputs: 

781 htc_commands["transfer_input_files"] = ",".join(inputs) 

782 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"]) 

783 return htc_commands 

784 

785 

786def _report_from_path(wms_path): 

787 """Gather run information from a given run directory. 

788 

789 Parameters 

790 ---------- 

791 wms_path : `str` 

792 The directory containing the submit side files (e.g., HTCondor files). 

793 

794 Returns 

795 ------- 

796 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

797 Run information for the detailed report. The key is the HTCondor id 

798 and the value is a collection of report information for that run. 

799 message : `str` 

800 Message to be printed with the summary report. 

801 """ 

802 wms_workflow_id, jobs, message = _get_info_from_path(wms_path) 

803 if wms_workflow_id == MISSING_ID: 

804 run_reports = {} 

805 else: 

806 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs) 

807 return run_reports, message 

808 

809 

810def _report_from_id(wms_workflow_id, hist, schedds=None): 

811 """Gather run information using workflow id. 

812 

813 Parameters 

814 ---------- 

815 wms_workflow_id : `str` 

816 Limit to specific run based on id. 

817 hist : `float` 

818 Limit history search to this many days. 

819 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional 

820 HTCondor schedulers which to query for job information. If None 

821 (default), all queries will be run against the local scheduler only. 

822 

823 Returns 

824 ------- 

825 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

826 Run information for the detailed report. The key is the HTCondor id 

827 and the value is a collection of report information for that run. 

828 message : `str` 

829 Message to be printed with the summary report. 

830 """ 

831 dag_constraint = 'regexp("dagman$", Cmd)' 

832 try: 

833 cluster_id = int(float(wms_workflow_id)) 

834 except ValueError: 

835 dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"' 

836 else: 

837 dag_constraint += f" && ClusterId == {cluster_id}" 

838 

839 # With the current implementation of the condor_* functions the query will 

840 # always return only one match per Scheduler. 

841 # 

842 # Even in the highly unlikely situation where HTCondor history (which 

843 # condor_search queries too) is long enough to have jobs from before the 

844 # cluster ids were rolled over (and as a result there is more then one job 

845 # with the same cluster id) they will not show up in the results. 

846 schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds) 

847 if len(schedd_dag_info) == 0: 

848 run_reports = {} 

849 message = "" 

850 elif len(schedd_dag_info) == 1: 

851 _, dag_info = schedd_dag_info.popitem() 

852 dag_id, dag_ad = dag_info.popitem() 

853 

854 # Create a mapping between jobs and their classads. The keys will be 

855 # of format 'ClusterId.ProcId'. 

856 job_info = {dag_id: dag_ad} 

857 

858 # Find jobs (nodes) belonging to that DAGMan job. 

859 job_constraint = f"DAGManJobId == {int(float(dag_id))}" 

860 schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds) 

861 _, node_info = schedd_job_info.popitem() 

862 job_info.update(node_info) 

863 

864 # Collect additional pieces of information about jobs using HTCondor 

865 # files in the submission directory. 

866 _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"]) 

867 _update_jobs(job_info, path_jobs) 

868 

869 run_reports = _create_detailed_report_from_jobs(dag_id, job_info) 

870 else: 

871 ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()] 

872 run_reports = {} 

873 message = f"More than one job matches id '{wms_workflow_id}', " \ 

874 f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids" 

875 return run_reports, message 

876 

877 

878def _get_info_from_path(wms_path): 

879 """Gather run information from a given run directory. 

880 

881 Parameters 

882 ---------- 

883 wms_path : `str` 

884 Directory containing HTCondor files. 

885 

886 Returns 

887 ------- 

888 wms_workflow_id : `str` 

889 The run id which is a DAGman job id. 

890 jobs : `dict` [`str`, `dict` [`str`, `Any`]] 

891 Information about jobs read from files in the given directory. 

892 The key is the HTCondor id and the value is a dictionary of HTCondor 

893 keys and values. 

894 message : `str` 

895 Message to be printed with the summary report. 

896 """ 

897 messages = [] 

898 try: 

899 wms_workflow_id, jobs = read_dag_log(wms_path) 

900 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs) 

901 _update_jobs(jobs, read_node_status(wms_path)) 

902 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs) 

903 

904 # Add more info for DAGman job 

905 job = jobs[wms_workflow_id] 

906 job.update(read_dag_status(wms_path)) 

907 

908 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs) 

909 if "bps_run" not in job: 

910 _add_run_info(wms_path, job) 

911 

912 message = htc_check_dagman_output(wms_path) 

913 if message: 

914 messages.append(message) 

915 _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id, 

916 jobs[wms_workflow_id]["total_jobs"]) 

917 

918 # Add extra pieces of information which cannot be found in HTCondor 

919 # generated files like 'GlobalJobId'. 

920 # 

921 # Do not treat absence of this file as a serious error. Neither runs 

922 # submitted with earlier versions of the plugin nor the runs submitted 

923 # with Pegasus plugin will have it at the moment. However, once enough 

924 # time passes and Pegasus plugin will have its own report() method 

925 # (instead of sneakily using HTCondor's one), the lack of that file 

926 # should be treated as seriously as lack of any other file. 

927 try: 

928 job_info = read_dag_info(wms_path) 

929 except FileNotFoundError as exc: 

930 message = f"Warn: Some information may not be available: {exc}" 

931 messages.append(message) 

932 else: 

933 schedd_name = next(iter(job_info)) 

934 job_ad = next(iter(job_info[schedd_name].values())) 

935 job.update(job_ad) 

936 except FileNotFoundError: 

937 message = f"Could not find HTCondor files in '{wms_path}'" 

938 _LOG.warning(message) 

939 messages.append(message) 

940 wms_workflow_id = MISSING_ID 

941 jobs = {} 

942 

943 message = '\n'.join([msg for msg in messages if msg]) 

944 return wms_workflow_id, jobs, message 

945 

946 

947def _create_detailed_report_from_jobs(wms_workflow_id, jobs): 

948 """Gather run information to be used in generating summary reports. 

949 

950 Parameters 

951 ---------- 

952 wms_workflow_id : `str` 

953 The run id to create the report for. 

954 jobs : `dict` [`str`, `dict` [`str`, Any]] 

955 Mapping HTCondor job id to job information. 

956 

957 Returns 

958 ------- 

959 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

960 Run information for the detailed report. The key is the given HTCondor 

961 id and the value is a collection of report information for that run. 

962 """ 

963 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id]) 

964 dag_job = jobs[wms_workflow_id] 

965 report = WmsRunReport(wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}", 

966 global_wms_id=dag_job.get("GlobalJobId", "MISS"), 

967 path=dag_job["Iwd"], 

968 label=dag_job.get("bps_job_label", "MISS"), 

969 run=dag_job.get("bps_run", "MISS"), 

970 project=dag_job.get("bps_project", "MISS"), 

971 campaign=dag_job.get("bps_campaign", "MISS"), 

972 payload=dag_job.get("bps_payload", "MISS"), 

973 operator=_get_owner(dag_job), 

974 run_summary=_get_run_summary(dag_job), 

975 state=_htc_status_to_wms_state(dag_job), 

976 jobs=[], 

977 total_number_jobs=dag_job["total_jobs"], 

978 job_state_counts=dag_job["state_counts"]) 

979 

980 for job_id, job_info in jobs.items(): 

981 try: 

982 if job_info["ClusterId"] != int(float(wms_workflow_id)): 

983 job_report = WmsJobReport(wms_id=job_id, 

984 name=job_info.get("DAGNodeName", job_id), 

985 label=job_info.get("bps_job_label", 

986 pegasus_name_to_label(job_info["DAGNodeName"])), 

987 state=_htc_status_to_wms_state(job_info)) 

988 if job_report.label == "init": 

989 job_report.label = "pipetaskInit" 

990 report.jobs.append(job_report) 

991 except KeyError as ex: 

992 _LOG.error("Job missing key '%s': %s", str(ex), job_info) 

993 raise 

994 

995 run_reports = {report.wms_id: report} 

996 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports) 

997 return run_reports 

998 

999 

1000def _summary_report(user, hist, pass_thru, schedds=None): 

1001 """Gather run information to be used in generating summary reports. 

1002 

1003 Parameters 

1004 ---------- 

1005 user : `str` 

1006 Run lookup restricted to given user. 

1007 hist : `float` 

1008 How many previous days to search for run information. 

1009 pass_thru : `str` 

1010 Advanced users can define the HTCondor constraint to be used 

1011 when searching queue and history. 

1012 

1013 Returns 

1014 ------- 

1015 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1016 Run information for the summary report. The keys are HTCondor ids and 

1017 the values are collections of report information for each run. 

1018 message : `str` 

1019 Message to be printed with the summary report. 

1020 """ 

1021 # Only doing a summary report, so only look for DAGMan jobs. 

1022 if pass_thru: 

1023 constraint = pass_thru 

1024 else: 

1025 # Notes: 

1026 # * bps_isjob == 'True' isn't getting set for DAG jobs that are 

1027 # manually restarted. 

1028 # * Any job with DAGManJobID isn't a DAG job 

1029 constraint = 'bps_isjob == "True" && JobUniverse == 7' 

1030 if user: 

1031 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")' 

1032 

1033 job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds) 

1034 

1035 # Have list of DAGMan jobs, need to get run_report info. 

1036 run_reports = {} 

1037 for jobs in job_info.values(): 

1038 for job_id, job in jobs.items(): 

1039 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1040 # If the information wasn't available from the queue (e.g., Kerberos 

1041 # bug), try reading from file. 

1042 if total_jobs == 0: 

1043 try: 

1044 job.update(read_dag_status(job["Iwd"])) 

1045 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1046 except StopIteration: 

1047 pass # don't kill the report if the HTCondor files can't be found 

1048 

1049 if "bps_run" not in job: 

1050 _add_run_info(job["Iwd"], job) 

1051 report = WmsRunReport(wms_id=job_id, 

1052 global_wms_id=job["GlobalJobId"], 

1053 path=job["Iwd"], 

1054 label=job.get("bps_job_label", "MISS"), 

1055 run=job.get("bps_run", "MISS"), 

1056 project=job.get("bps_project", "MISS"), 

1057 campaign=job.get("bps_campaign", "MISS"), 

1058 payload=job.get("bps_payload", "MISS"), 

1059 operator=_get_owner(job), 

1060 run_summary=_get_run_summary(job), 

1061 state=_htc_status_to_wms_state(job), 

1062 jobs=[], 

1063 total_number_jobs=total_jobs, 

1064 job_state_counts=state_counts) 

1065 run_reports[report.global_wms_id] = report 

1066 

1067 return run_reports, "" 

1068 

1069 

1070def _add_run_info(wms_path, job): 

1071 """Find BPS run information elsewhere for runs without bps attributes. 

1072 

1073 Parameters 

1074 ---------- 

1075 wms_path : `str` 

1076 Path to submit files for the run. 

1077 job : `dict` [`str`, `Any`] 

1078 HTCondor dag job information. 

1079 

1080 Raises 

1081 ------ 

1082 StopIteration 

1083 If the file it is looking for cannot be found. Permission errors are 

1084 caught and the job's run is marked with an error. 

1085 """ 

1086 path = Path(wms_path) / "jobs" 

1087 try: 

1088 subfile = next(path.glob("**/*.sub")) 

1089 except (StopIteration, PermissionError): 

1090 job["bps_run"] = "Unavailable" 

1091 else: 

1092 _LOG.debug("_add_run_info: subfile = %s", subfile) 

1093 try: 

1094 with open(subfile, "r", encoding='utf-8') as fh: 

1095 for line in fh: 

1096 if line.startswith("+bps_"): 

1097 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line) 

1098 if m: 

1099 _LOG.debug("Matching line: %s", line) 

1100 job[m.group(1)] = m.group(2).replace('"', "") 

1101 else: 

1102 _LOG.debug("Could not parse attribute: %s", line) 

1103 except PermissionError: 

1104 job["bps_run"] = "PermissionError" 

1105 _LOG.debug("After adding job = %s", job) 

1106 

1107 

1108def _get_owner(job): 

1109 """Get the owner of a dag job. 

1110 

1111 Parameters 

1112 ---------- 

1113 job : `dict` [`str`, `Any`] 

1114 HTCondor dag job information. 

1115 

1116 Returns 

1117 ------- 

1118 owner : `str` 

1119 Owner of the dag job. 

1120 """ 

1121 owner = job.get("bps_operator", None) 

1122 if not owner: 

1123 owner = job.get("Owner", None) 

1124 if not owner: 

1125 _LOG.warning("Could not get Owner from htcondor job: %s", job) 

1126 owner = "MISS" 

1127 return owner 

1128 

1129 
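# Illustrative sketch (not part of the original module): the bps_operator
# attribute takes precedence over HTCondor's Owner when both are present
# (the names below are made up):
#
#     >>> _get_owner({"bps_operator": "jdoe", "Owner": "pilot"})
#     'jdoe'
#     >>> _get_owner({"Owner": "pilot"})
#     'pilot'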

1130def _get_run_summary(job): 

1131 """Get the run summary for a job. 

1132 

1133 Parameters 

1134 ---------- 

1135 job : `dict` [`str`, `Any`] 

1136 HTCondor dag job information. 

1137 

1138 Returns 

1139 ------- 

1140 summary : `str` 

1141 Number of jobs per PipelineTask label in approximate pipeline order. 

1142 Format: <label>:<count>[;<label>:<count>]+ 

1143 """ 

1144 summary = job.get("bps_job_summary", job.get("bps_run_summary", None)) 

1145 if not summary: 

1146 summary, _ = summary_from_dag(job["Iwd"]) 

1147 if not summary: 

1148 _LOG.warning("Could not get run summary for htcondor job: %s", job) 

1149 _LOG.debug("_get_run_summary: summary=%s", summary) 

1150 

1151 # Workaround sometimes using init vs pipetaskInit 

1152 summary = summary.replace("init:", "pipetaskInit:") 

1153 

1154 if "pegasus_version" in job and "pegasus" not in summary: 

1155 summary += ";pegasus:0" 

1156 

1157 return summary 

1158 

1159 
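# Illustrative sketch (not part of the original module): when the DAG job ad
# already carries a bps_job_summary attribute (made-up counts below), that
# summary is used as-is apart from the init -> pipetaskInit rename:
#
#     >>> _get_run_summary({"bps_job_summary": "init:1;isr:10;calibrate:10"})
#     'pipetaskInit:1;isr:10;calibrate:10'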

1160def _get_state_counts_from_jobs(wms_workflow_id, jobs): 

1161 """Count number of jobs per WMS state. 

1162 

1163 Parameters 

1164 ---------- 

1165 wms_workflow_id : `str` 

1166 HTCondor job id. 

1167 jobs : `dict` [`str`, `Any`] 

1168 HTCondor dag job information. 

1169 

1170 Returns 

1171 ------- 

1172 total_count : `int` 

1173 Total number of dag nodes. 

1174 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1175 Keys are the different WMS states and values are counts of jobs 

1176 that are in that WMS state. 

1177 """ 

1178 state_counts = dict.fromkeys(WmsStates, 0) 

1179 

1180 for jid, jinfo in jobs.items(): 

1181 if jid != wms_workflow_id: 

1182 state_counts[_htc_status_to_wms_state(jinfo)] += 1 

1183 

1184 total_counted = sum(state_counts.values()) 

1185 if "NodesTotal" in jobs[wms_workflow_id]: 

1186 total_count = jobs[wms_workflow_id]["NodesTotal"] 

1187 else: 

1188 total_count = total_counted 

1189 

1190 state_counts[WmsStates.UNREADY] += total_count - total_counted 

1191 

1192 return total_count, state_counts 

1193 

1194 

1195def _get_state_counts_from_dag_job(job): 

1196 """Count number of jobs per WMS state. 

1197 

1198 Parameters 

1199 ---------- 

1200 job : `dict` [`str`, `Any`] 

1201 HTCondor dag job information. 

1202 

1203 Returns 

1204 ------- 

1205 total_count : `int` 

1206 Total number of dag nodes. 

1207 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1208 Keys are the different WMS states and values are counts of jobs 

1209 that are in that WMS state. 

1210 """ 

1211 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job)) 

1212 state_counts = dict.fromkeys(WmsStates, 0) 

1213 if "DAG_NodesReady" in job: 

1214 state_counts = { 

1215 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0), 

1216 WmsStates.READY: job.get("DAG_NodesReady", 0), 

1217 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1218 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0), 

1219 WmsStates.FAILED: job.get("DAG_NodesFailed", 0), 

1220 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)} 

1221 total_jobs = job.get("DAG_NodesTotal") 

1222 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs) 

1223 elif "NodesFailed" in job: 

1224 state_counts = { 

1225 WmsStates.UNREADY: job.get("NodesUnready", 0), 

1226 WmsStates.READY: job.get("NodesReady", 0), 

1227 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1228 WmsStates.SUCCEEDED: job.get("NodesDone", 0), 

1229 WmsStates.FAILED: job.get("NodesFailed", 0), 

1230 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)} 

1231 try: 

1232 total_jobs = job.get("NodesTotal") 

1233 except KeyError as ex: 

1234 _LOG.error("Job missing %s. job = %s", str(ex), job) 

1235 raise 

1236 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs) 

1237 else: 

1238 # With Kerberos job auth and the Kerberos bug, a warning here would be 

1239 # printed for every DAG, so log at debug level instead. 

1240 _LOG.debug("Can't get job state counts %s", job["Iwd"]) 

1241 total_jobs = 0 

1242 

1243 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts) 

1244 return total_jobs, state_counts 

1245 

1246 

1247def _htc_status_to_wms_state(job): 

1248 """Convert HTCondor job status to generic wms state. 

1249 

1250 Parameters 

1251 ---------- 

1252 job : `dict` [`str`, `Any`] 

1253 HTCondor job information. 

1254 

1255 Returns 

1256 ------- 

1257 wms_state : `WmsStates` 

1258 The equivalent WmsState to given job's status. 

1259 """ 

1260 wms_state = WmsStates.MISFIT 

1261 if "JobStatus" in job: 

1262 wms_state = _htc_job_status_to_wms_state(job) 

1263 elif "NodeStatus" in job: 

1264 wms_state = _htc_node_status_to_wms_state(job) 

1265 return wms_state 

1266 

1267 

1268def _htc_job_status_to_wms_state(job): 

1269 """Convert HTCondor job status to generic wms state. 

1270 

1271 Parameters 

1272 ---------- 

1273 job : `dict` [`str`, `Any`] 

1274 HTCondor job information. 

1275 

1276 Returns 

1277 ------- 

1278 wms_state : `lsst.ctrl.bps.WmsStates` 

1279 The equivalent WmsState to given job's status. 

1280 """ 

1281 _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], 

1282 type(job["JobStatus"])) 

1283 job_status = int(job["JobStatus"]) 

1284 wms_state = WmsStates.MISFIT 

1285 

1286 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status) 

1287 if job_status == JobStatus.IDLE: 

1288 wms_state = WmsStates.PENDING 

1289 elif job_status == JobStatus.RUNNING: 

1290 wms_state = WmsStates.RUNNING 

1291 elif job_status == JobStatus.REMOVED: 

1292 wms_state = WmsStates.DELETED 

1293 elif job_status == JobStatus.COMPLETED: 

1294 if job.get("ExitBySignal", False) or job.get("ExitCode", 0) or \ 

1295 job.get("ExitSignal", 0) or job.get("DAG_Status", 0) or \ 

1296 job.get("ReturnValue", 0): 

1297 wms_state = WmsStates.FAILED 

1298 else: 

1299 wms_state = WmsStates.SUCCEEDED 

1300 elif job_status == JobStatus.HELD: 

1301 wms_state = WmsStates.HELD 

1302 

1303 return wms_state 

1304 

1305 

1306def _htc_node_status_to_wms_state(job): 

1307 """Convert HTCondor status to generic wms state. 

1308 

1309 Parameters 

1310 ---------- 

1311 job : `dict` [`str`, `Any`] 

1312 HTCondor job information. 

1313 

1314 Returns 

1315 ------- 

1316 wms_state : `lsst.ctrl.bps.WmsStates` 

1317 The equivalent WmsState to given node's status. 

1318 """ 

1319 wms_state = WmsStates.MISFIT 

1320 

1321 status = job["NodeStatus"] 

1322 if status == NodeStatus.NOT_READY: 

1323 wms_state = WmsStates.UNREADY 

1324 elif status == NodeStatus.READY: 

1325 wms_state = WmsStates.READY 

1326 elif status == NodeStatus.PRERUN: 

1327 wms_state = WmsStates.MISFIT 

1328 elif status == NodeStatus.SUBMITTED: 

1329 if job["JobProcsHeld"]: 

1330 wms_state = WmsStates.HELD 

1331 elif job["StatusDetails"] == "not_idle": 

1332 wms_state = WmsStates.RUNNING 

1333 elif job["JobProcsQueued"]: 

1334 wms_state = WmsStates.PENDING 

1335 elif status == NodeStatus.POSTRUN: 

1336 wms_state = WmsStates.MISFIT 

1337 elif status == NodeStatus.DONE: 

1338 wms_state = WmsStates.SUCCEEDED 

1339 elif status == NodeStatus.ERROR: 

1340 # Use the job's exit status instead of the post script's exit status 

1341 if "DAGMAN error 0" in job["StatusDetails"]: 

1342 wms_state = WmsStates.SUCCEEDED 

1343 else: 

1344 wms_state = WmsStates.FAILED 

1345 

1346 return wms_state 

1347 

1348 

1349def _update_jobs(jobs1, jobs2): 

1350 """Update jobs1 with info in jobs2. 

1351 

1352 (Basically an update for nested dictionaries.) 

1353 

1354 Parameters 

1355 ---------- 

1356 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]] 

1357 HTCondor job information to be updated. 

1358 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]] 

1359 Additional HTCondor job information. 

1360 """ 

1361 for jid, jinfo in jobs2.items(): 

1362 if jid in jobs1: 

1363 jobs1[jid].update(jinfo) 

1364 else: 

1365 jobs1[jid] = jinfo 

1366 

1367 
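# Illustrative sketch (not part of the original module): a nested-dictionary
# update with made-up classad fragments; ads for existing jobs are merged and
# new jobs are added:
#
#     >>> jobs = {"1.0": {"JobStatus": 2}}
#     >>> _update_jobs(jobs, {"1.0": {"ExitCode": 0}, "2.0": {"JobStatus": 1}})
#     >>> jobs
#     {'1.0': {'JobStatus': 2, 'ExitCode': 0}, '2.0': {'JobStatus': 1}}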

1368def _wms_id_type(wms_id): 

1369 """Determine the type of the WMS id. 

1370 

1371 Parameters 

1372 ---------- 

1373 wms_id : `str` 

1374 WMS id identifying a job. 

1375 

1376 Returns 

1377 ------- 

1378 id_type : `lsst.ctrl.bps.htcondor.WmsIdType` 

1379 Type of WMS id. 

1380 """ 

1381 try: 

1382 int(float(wms_id)) 

1383 except ValueError: 

1384 wms_path = Path(wms_id) 

1385 if wms_path.exists(): 

1386 id_type = WmsIdType.PATH 

1387 else: 

1388 id_type = WmsIdType.GLOBAL 

1389 except TypeError: 

1390 id_type = WmsIdType.UNKNOWN 

1391 else: 

1392 id_type = WmsIdType.LOCAL 

1393 return id_type 

1394 

1395 
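# Illustrative sketch (not part of the original module): how the different id
# flavors are classified (the global id and the path below are made-up examples;
# the path is assumed to exist on disk):
#
#     >>> _wms_id_type("1234.0")
#     <WmsIdType.LOCAL: 2>
#     >>> _wms_id_type("sched1.example.com#1234.0#1700000000")
#     <WmsIdType.GLOBAL: 3>
#     >>> _wms_id_type("/path/to/existing/submit/dir")
#     <WmsIdType.PATH: 4>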

1396def _wms_id_to_cluster(wms_id): 

1397 """Convert WMS id to cluster id. 

1398 

1399 Parameters 

1400 ---------- 

1401 wms_id : `int` or `float` or `str` 

1402 HTCondor job id or path. 

1403 

1404 Returns 

1405 ------- 

1406 schedd_ad : `classad.ClassAd` 

1407 ClassAd describing the scheduler managing the job with the given id. 

1408 cluster_id : `int` 

1409 HTCondor cluster id. 

1410 id_type : `lsst.ctrl.bps.wms.htcondor.IdType` 

1411 The type of the provided id. 

1412 """ 

1413 coll = htcondor.Collector() 

1414 

1415 schedd_ad = None 

1416 cluster_id = None 

1417 id_type = _wms_id_type(wms_id) 

1418 if id_type == WmsIdType.LOCAL: 

1419 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1420 cluster_id = int(float(wms_id)) 

1421 elif id_type == WmsIdType.GLOBAL: 

1422 constraint = f'GlobalJobId == "{wms_id}"' 

1423 schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)} 

1424 schedds = [htcondor.Schedd(ad) for ad in schedd_ads.values()] 

1425 queries = [schedd.xquery(requirements=constraint, projection=["ClusterId"]) for schedd in schedds] 

1426 results = {query.tag(): dict(ads[0]) for query in htcondor.poll(queries) 

1427 if (ads := query.nextAdsNonBlocking())} 

1428 if results: 

1429 schedd_name = next(iter(results)) 

1430 schedd_ad = schedd_ads[schedd_name] 

1431 cluster_id = results[schedd_name]["ClusterId"] 

1432 elif id_type == WmsIdType.PATH: 

1433 try: 

1434 job_info = read_dag_info(wms_id) 

1435 except (FileNotFoundError, PermissionError, IOError): 

1436 pass 

1437 else: 

1438 schedd_name = next(iter(job_info)) 

1439 job_id = next(iter(job_info[schedd_name])) 

1440 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name) 

1441 cluster_id = int(float(job_id)) 

1442 else: 

1443 pass 

1444 return schedd_ad, cluster_id, id_type 

1445 

1446 

1447def _create_periodic_release_expr(memory, multiplier, limit): 

1448 """Construct an HTCondorAd expression for releasing held jobs. 

1449 

1450 The expression instructs HTCondor to release any job which was put on hold 

1451 due to exceeding memory requirements back to the job queue provided it 

1452 satisfies all of the conditions below: 

1453 

1454 * number of run attempts did not reach allowable number of retries, 

1455 * the memory requirements in the last failed run attempt did not reach 

1456 the specified memory limit. 

1457 

1458 Parameters 

1459 ---------- 

1460 memory : `int` 

1461 Requested memory in MB. 

1462 multiplier : `float` 

1463 Memory growth rate between retries. 

1464 limit : `int` 

1465 Memory limit. 

1466 

1467 Returns 

1468 ------- 

1469 expr : `str` 

1470 A string representing an HTCondor ClassAd expression for releasing jobs 

1471 which have been held due to exceeding the memory requirements. 

1472 """ 

1473 is_retry_allowed = "NumJobStarts <= JobMaxRetries" 

1474 was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}" 

1475 

1476 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1477 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1478 # The special comparison operators ensure that all comparisons below will 

1479 # evaluate to FALSE in this case. 

1480 # 

1481 # Note: 

1482 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1483 # the entire expression should evaluate to FALSE when the job is not HELD. 

1484 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1485 # but better safe than sorry. 

1486 was_mem_exceeded = "JobStatus == 5 " \ 

1487 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " \ 

1488 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1489 

1490 expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}" 

1491 return expr 

1492 

1493 
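# Illustrative sketch (not part of the original module): for a hypothetical job
# requesting 2048 MB with multiplier 2.0 and an 8192 MB limit,
# _create_periodic_release_expr(2048, 2.0, 8192) yields (whitespace added for
# readability):
#
#     JobStatus == 5
#     && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#         || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#     && NumJobStarts <= JobMaxRetries
#     && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) < 8192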

1494def _create_periodic_remove_expr(memory, multiplier, limit): 

1495 """Construct an HTCondorAd expression for removing jobs from the queue. 

1496 

1497 The expression instructs HTCondor to remove any job which was put on hold 

1498 due to exceeding memory requirements from the job queue provided it 

1499 satisfies any of the conditions below: 

1500 

1501 * allowable number of retries was reached, 

1502 * the memory requirements during the last failed run attempt reached 

1503 the specified memory limit. 

1504 

1505 Parameters 

1506 ---------- 

1507 memory : `int` 

1508 Requested memory in MB. 

1509 multiplier : `float` 

1510 Memory growth rate between retries. 

1511 limit : `int` 

1512 Memory limit. 

1513 

1514 Returns 

1515 ------- 

1516 expr : `str` 

1517 A string representing an HTCondor ClassAd expression for removing jobs 

1518 which were run at the maximal allowable memory and still exceeded 

1519 the memory requirements. 

1520 """ 

1521 is_retry_disallowed = "NumJobStarts > JobMaxRetries" 

1522 was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}" 

1523 

1524 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1525 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1526 # The special comparison operators ensure that all comparisons below will 

1527 # evaluate to FALSE in this case. 

1528 # 

1529 # Note: 

1530 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1531 # the entire expression should evaluate to FALSE when the job is not HELD. 

1532 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1533 # but better safe than sorry. 

1534 was_mem_exceeded = "JobStatus == 5 " \ 

1535 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " \ 

1536 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1537 

1538 expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})" 

1539 return expr 

1540 

1541 

1542def _create_request_memory_expr(memory, multiplier, limit): 

1543 """Construct an HTCondor ClassAd expression for safe memory scaling. 

1544 

1545 Parameters 

1546 ---------- 

1547 memory : `int` 

1548 Requested memory in MB. 

1549 multiplier : `float` 

1550 Memory growth rate between retries. 

1551 limit : `int` 

1552 Memory limit. 

1553 

1554 Returns 

1555 ------- 

1556 expr : `str` 

1557 A string representing an HTCondor ClassAd expression enabling safe 

1558 memory scaling between job retries. 

1559 """ 

1560 # The check if the job was held due to exceeding memory requirements 

1561 # will be made *after* job was released back to the job queue (is in 

1562 # the IDLE state), hence the need to use `Last*` job ClassAds instead of 

1563 # the ones describing job's current state. 

1564 # 

1565 # Also, 'Last*' job ClassAds attributes are UNDEFINED when a job is 

1566 # initially put in the job queue. The special comparison operators ensure 

1567 # that all comparisons below will evaluate to FALSE in this case. 

1568 was_mem_exceeded = "LastJobStatus =?= 5 " \ 

1569 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " \ 

1570 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)" 

1571 

1572 # If job runs the first time or was held for reasons other than exceeding 

1573 # the memory, set the required memory to the requested value or use 

1574 # the memory value measured by HTCondor (MemoryUsage) depending on 

1575 # whichever is greater. 

1576 expr = f"({was_mem_exceeded}) " \ 

1577 f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) " \ 

1578 f": max({{{memory}, MemoryUsage ?: 0}})" 

1579 return expr 

1580 

1581 
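# Illustrative sketch (not part of the original module): with a hypothetical
# 2048 MB request, multiplier 2.0 and an 8192 MB limit,
# _create_request_memory_expr(2048, 2.0, 8192) yields (whitespace added for
# readability):
#
#     (LastJobStatus =?= 5
#      && (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0
#          || LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34))
#     ? min({int(2048 * pow(2.0, NumJobStarts)), 8192})
#     : max({2048, MemoryUsage ?: 0})
#
# i.e. after each memory-related hold the request doubles (capped at the limit);
# otherwise the job asks for the larger of the request and the measured
# MemoryUsage.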

1582def _locate_schedds(locate_all=False): 

1583 """Find out Scheduler daemons in an HTCondor pool. 

1584 

1585 Parameters 

1586 ---------- 

1587 locate_all : `bool`, optional 

1588 If True, all available schedulers in the HTCondor pool will be located. 

1589 False by default which means that the search will be limited to looking 

1590 for the Scheduler running on the local host. 

1591 

1592 Returns 

1593 ------- 

1594 schedds : `dict` [`str`, `htcondor.Schedd`] 

1595 A mapping between Scheduler names and Python objects allowing for 

1596 interacting with them. 

1597 """ 

1598 coll = htcondor.Collector() 

1599 

1600 schedd_ads = [] 

1601 if locate_all: 

1602 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

1603 else: 

1604 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

1605 return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

1606 

1607 

1608def _gather_site_values(config, compute_site): 

1609 """Gather values specific to given site. 

1610 

1611 Parameters 

1612 ---------- 

1613 config : `lsst.ctrl.bps.BpsConfig` 

1614 BPS configuration that includes necessary submit/runtime 

1615 information. 

1616 compute_site : `str` 

1617 Compute site name. 

1618 

1619 Returns 

1620 ------- 

1621 site_values : `dict` [`str`, `Any`] 

1622 Values specific to the given site. 

1623 """ 

1624 site_values = {"attrs": {}, "profile": {}} 

1625 search_opts = {} 

1626 if compute_site: 

1627 search_opts["curvals"] = {"curr_site": compute_site} 

1628 

1629 # Determine the hard limit for the memory requirement. 

1630 found, limit = config.search('memoryLimit', opt=search_opts) 

1631 if not found: 

1632 search_opts["default"] = DEFAULT_HTC_EXEC_PATT 

1633 _, patt = config.search("executeMachinesPattern", opt=search_opts) 

1634 del search_opts["default"] 

1635 

1636 # To reduce the amount of data, ignore dynamic slots (if any) as, 

1637 # by definition, they cannot have more memory than 

1638 # the partitionable slot they are part of. 

1639 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)' 

1640 pool_info = condor_status(constraint=constraint) 

1641 try: 

1642 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values()) 

1643 except ValueError: 

1644 _LOG.debug("No execute machine in the pool matches %s", patt) 

1645 if limit: 

1646 config[".bps_defined.memory_limit"] = limit 

1647 

1648 _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False}) 

1649 site_values["memoryLimit"] = limit 

1650 

1651 key = f".site.{compute_site}.profile.condor" 

1652 if key in config: 

1653 for key, val in config[key].items(): 

1654 if key.startswith("+"): 

1655 site_values["attrs"][key[1:]] = val 

1656 else: 

1657 site_values["profile"][key] = val 

1658 

1659 return site_values