Coverage for python/lsst/ctrl/bps/wms/htcondor/htcondor_service.py: 1%

520 statements  


1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Interface between generic workflow to HTCondor workflow system. 

23""" 

24 

25__all__ = ["HTCondorService", "HTCondorWorkflow"] 

26 

27 

28import os 

29import re 

30import logging 

31from datetime import datetime, timedelta 

32from pathlib import Path 

33from collections import defaultdict 

34 

35import htcondor 

36 

37from lsst.daf.butler.core.utils import time_this 

38from ... import ( 

39 BaseWmsWorkflow, 

40 BaseWmsService, 

41 GenericWorkflow, 

42 GenericWorkflowJob, 

43 WmsRunReport, 

44 WmsJobReport, 

45 WmsStates 

46) 

47from ...bps_utils import ( 

48 chdir, 

49 create_count_summary 

50) 

51from .lssthtc import ( 

52 HTCDag, 

53 HTCJob, 

54 MISSING_ID, 

55 JobStatus, 

56 NodeStatus, 

57 htc_check_dagman_output, 

58 htc_escape, 

59 htc_submit_dag, 

60 read_dag_log, 

61 read_dag_status, 

62 read_node_status, 

63 condor_history, 

64 condor_q, 

65 condor_status, 

66 pegasus_name_to_label, 

67 summary_from_dag, 

68) 

69 

70 

71DEFAULT_HTC_EXEC_PATT = ".*worker.*" 

72"""Default pattern for searching execute machines in an HTCondor pool. 

73""" 

74 

75_LOG = logging.getLogger(__name__) 

76 

77 

78class HTCondorService(BaseWmsService): 

79 """HTCondor version of WMS service. 

80 """ 

81 def prepare(self, config, generic_workflow, out_prefix=None): 

82 """Convert generic workflow to an HTCondor DAG ready for submission. 

83 

84 Parameters 

85 ---------- 

86 config : `lsst.ctrl.bps.BpsConfig` 

87 BPS configuration that includes necessary submit/runtime 

88 information. 

89 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

90 The generic workflow (e.g., has executable name and arguments). 

91 out_prefix : `str` 

92 The root directory into which all WMS-specific files are written. 

93 

94 Returns 

95 ------- 

96 workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow` 

97 HTCondor workflow ready to be run. 

98 """ 

99 _LOG.debug("out_prefix = '%s'", out_prefix) 

100 with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed HTCondor workflow creation"): 

101 workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix, 

102 f"{self.__class__.__module__}." 

103 f"{self.__class__.__name__}") 

104 

105 with time_this(log=_LOG, level=logging.INFO, prefix=None, 

106 msg="Completed writing out HTCondor workflow"): 

107 workflow.write(out_prefix) 

108 return workflow 

109 

110 def submit(self, workflow): 

111 """Submit a single HTCondor workflow. 

112 

113 Parameters 

114 ---------- 

115 workflow : `lsst.ctrl.bps.BaseWorkflow` 

116 A single HTCondor workflow to submit. run_id is updated after 

117 successful submission to WMS. 

118 """ 

119 # For workflow portability, internal paths are all relative. Hence 

120 # the DAG needs to be submitted to HTCondor from inside the submit 

121 # directory. 

122 with chdir(workflow.submit_path): 

123 _LOG.info("Submitting from directory: %s", os.getcwd()) 

124 htc_submit_dag(workflow.dag, {}) 

125 workflow.run_id = workflow.dag.run_id 

126 
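    # A minimal usage sketch (illustrative, not part of the module): assuming a
    # BpsConfig and GenericWorkflow are already in hand, the two methods above
    # are normally chained by the caller (e.g., the bps command line tooling):
    #
    #     service = HTCondorService(config)
    #     workflow = service.prepare(config, generic_workflow, out_prefix="submit/my_run")
    #     service.submit(workflow)
    #     print(workflow.run_id)  # id of the DAGMan job after a successful submit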

127 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None): 

128 """Query WMS for list of submitted WMS workflows/jobs. 

129 

130 This should be a quick lookup function to create list of jobs for 

131 other functions. 

132 

133 Parameters 

134 ---------- 

135 wms_id : `int` or `str`, optional 

136 Id or path that can be used by WMS service to look up job. 

137 user : `str`, optional 

138 User whose submitted jobs should be listed. 

139 require_bps : `bool`, optional 

140 Whether to require jobs returned in list to be bps-submitted jobs. 

141 pass_thru : `str`, optional 

142 Information to pass through to WMS. 

143 

144 Returns 

145 ------- 

146 job_ids : `list` [`Any`] 

147 Only job ids to be used by cancel and other functions. Typically 

148 this means top-level jobs (i.e., not children jobs). 

149 """ 

150 _LOG.debug("list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s", 

151 wms_id, user, require_bps, pass_thru) 

152 constraint = "" 

153 

154 if wms_id is None: 

155 if user is not None: 

156 constraint = f'(Owner == "{user}")' 

157 else: 

158 cluster_id = _wms_id_to_cluster(wms_id) 

159 if cluster_id != 0: 

160 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})" 

161 

162 if require_bps: 

163 constraint += ' && (bps_isjob == "True")' 

164 

165 if pass_thru: 

166 if "-forcex" in pass_thru: 

167 pass_thru_2 = pass_thru.replace("-forcex", "") 

168 if pass_thru_2 and not pass_thru_2.isspace(): 

169 constraint += f"&& ({pass_thru_2})" 

170 else: 

171 constraint += f" && ({pass_thru})" 

172 

173 _LOG.debug("constraint = %s", constraint) 

174 jobs = condor_q(constraint) 

175 

176 # Prune child jobs where DAG job is in queue (i.e., aren't orphans). 

177 job_ids = [] 

178 for job_id, job_info in jobs.items(): 

179 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_info.get("DAGManJobId", "None")) 

180 if "DAGManJobId" not in job_info: # orphaned job 

181 job_ids.append(job_id) 

182 else: 

183 _LOG.debug("Looking for %s", f"{job_info['DAGManJobId']}.0") 

184 _LOG.debug("\tin jobs.keys() = %s", jobs.keys()) 

185 if f"{job_info['DAGManJobId']}.0" not in jobs: 

186 job_ids.append(job_id) 

187 

188 _LOG.debug("job_ids = %s", job_ids) 

189 return job_ids 

190 
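    # Illustrative example (made-up id, not from the original source): for
    # wms_id="1234.0" with require_bps=True and no pass_thru, the constraint
    # assembled above is roughly
    #
    #     (DAGManJobId == 1234 || ClusterId == 1234) && (bps_isjob == "True")
    #
    # condor_q is queried with that constraint, and child jobs whose parent
    # DAGMan job is still in the queue are pruned from the returned ids.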

191 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None): 

192 """Return run information based upon given constraints. 

193 

194 Parameters 

195 ---------- 

196 wms_workflow_id : `str` 

197 Limit to specific run based on id. 

198 user : `str` 

199 Limit results to runs for this user. 

200 hist : `float` 

201 Limit history search to this many days. 

202 pass_thru : `str` 

203 Constraints to pass through to HTCondor. 

204 

205 Returns 

206 ------- 

207 runs : `list` [`lsst.ctrl.bps.WmsRunReport`] 

208 Information about runs from given job information. 

209 message : `str` 

210 Extra message for report command to print. This could be pointers 

211 to documentation or to WMS specific commands. 

212 """ 

213 message = "" 

214 

215 if wms_workflow_id: 

216 # Explicitly checking if wms_workflow_id can be converted to a 

217 # float instead of using try/except to avoid catching a different 

218 # ValueError from _report_from_id 

219 try: 

220 float(wms_workflow_id) 

221 is_float = True 

222 except ValueError: # Don't need TypeError here as None goes to else branch. 

223 is_float = False 

224 

225 if is_float: 

226 run_reports, message = _report_from_id(float(wms_workflow_id), hist) 

227 else: 

228 run_reports, message = _report_from_path(wms_workflow_id) 

229 else: 

230 run_reports, message = _summary_report(user, hist, pass_thru) 

231 _LOG.debug("report: %s, %s", run_reports, message) 

232 

233 return list(run_reports.values()), message 

234 

235 def cancel(self, wms_id, pass_thru=None): 

236 """Cancel submitted workflows/jobs. 

237 

238 Parameters 

239 ---------- 

240 wms_id : `str` 

241 Id or path of job that should be canceled. 

242 pass_thru : `str`, optional 

243 Information to pass through to WMS. 

244 

245 Returns 

246 ------- 

247 deleted : `bool` 

248 Whether the deletion was successful. Currently, if there is any doubt 

249 or any individual job was not deleted, returns False. 

250 message : `str` 

251 Any message from WMS (e.g., error details). 

252 """ 

253 _LOG.debug("Canceling wms_id = %s", wms_id) 

254 

255 cluster_id = _wms_id_to_cluster(wms_id) 

256 if cluster_id == 0: 

257 deleted = False 

258 message = "Invalid id" 

259 else: 

260 _LOG.debug("Canceling cluster_id = %s", cluster_id) 

261 schedd = htcondor.Schedd() 

262 constraint = f"ClusterId == {cluster_id}" 

263 if pass_thru is not None and "-forcex" in pass_thru: 

264 pass_thru_2 = pass_thru.replace("-forcex", "") 

265 if pass_thru_2 and not pass_thru_2.isspace(): 

266 constraint += f"&& ({pass_thru_2})" 

267 _LOG.debug("JobAction.RemoveX constraint = %s", constraint) 

268 results = schedd.act(htcondor.JobAction.RemoveX, constraint) 

269 else: 

270 if pass_thru: 

271 constraint += f"&& ({pass_thru})" 

272 _LOG.debug("JobAction.Remove constraint = %s", constraint) 

273 results = schedd.act(htcondor.JobAction.Remove, constraint) 

274 _LOG.debug("Remove results: %s", results) 

275 

276 if results["TotalSuccess"] > 0 and results["TotalError"] == 0: 

277 deleted = True 

278 message = "" 

279 else: 

280 deleted = False 

281 if results["TotalSuccess"] == 0 and results["TotalError"] == 0: 

282 message = "no such bps job in batch queue" 

283 else: 

284 message = f"unknown problems deleting: {results}" 

285 

286 _LOG.debug("deleted: %s; message = %s", deleted, message) 

287 return deleted, message 

288 
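    # Illustrative example (made-up id): cancel("1234") builds the constraint
    #
    #     ClusterId == 1234
    #
    # and passes it to schedd.act(htcondor.JobAction.Remove, ...); when
    # pass_thru contains "-forcex", JobAction.RemoveX is used instead to force
    # removal of jobs already in the removed (X) state.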

289 

290class HTCondorWorkflow(BaseWmsWorkflow): 

291 """Single HTCondor workflow. 

292 

293 Parameters 

294 ---------- 

295 name : `str` 

296 Unique name for Workflow used when naming files. 

297 config : `lsst.ctrl.bps.BpsConfig` 

298 BPS configuration that includes necessary submit/runtime information. 

299 """ 

300 def __init__(self, name, config=None): 

301 super().__init__(name, config) 

302 self.dag = None 

303 

304 @classmethod 

305 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class): 

306 # Docstring inherited 

307 htc_workflow = cls(generic_workflow.name, config) 

308 htc_workflow.dag = HTCDag(name=generic_workflow.name) 

309 

310 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs) 

311 htc_workflow.dag.add_attribs(generic_workflow.run_attrs) 

312 htc_workflow.dag.add_attribs({"bps_wms_service": service_class, 

313 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}", 

314 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts), 

315 "bps_job_summary": create_count_summary(generic_workflow.job_counts)}) 

316 

317 _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""}) 

318 if isinstance(tmp_template, str): 

319 subdir_template = defaultdict(lambda: tmp_template) 

320 else: 

321 subdir_template = tmp_template 

322 

323 # Create all DAG jobs 

324 site_values = {} # cache compute site specific values to reduce config lookups 

325 for job_name in generic_workflow: 

326 gwjob = generic_workflow.get_job(job_name) 

327 if gwjob.compute_site not in site_values: 

328 site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site) 

329 htc_job = _create_job(subdir_template[gwjob.label], site_values[gwjob.compute_site], 

330 generic_workflow, gwjob, out_prefix) 

331 htc_workflow.dag.add_job(htc_job) 

332 

333 # Add job dependencies to the DAG 

334 for job_name in generic_workflow: 

335 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name)) 

336 

337 # If final job exists in generic workflow, create DAG final job 

338 final = generic_workflow.get_final() 

339 if final and isinstance(final, GenericWorkflowJob): 

340 if final.compute_site and final.compute_site not in site_values: 

341 site_values[final.compute_site] = _gather_site_values(config, final.compute_site) 

342 final_htjob = _create_job(subdir_template[final.label], site_values[final.compute_site], 

343 generic_workflow, final, out_prefix) 

344 if "post" not in final_htjob.dagcmds: 

345 final_htjob.dagcmds["post"] = f"{os.path.dirname(__file__)}/final_post.sh" \ 

346 f" {final.name} $DAG_STATUS $RETURN" 

347 htc_workflow.dag.add_final_job(final_htjob) 

348 elif final and isinstance(final, GenericWorkflow): 

349 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job") 

350 elif final: 

351 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})") 

352 

353 return htc_workflow 

354 

355 def write(self, out_prefix): 

356 """Output HTCondor DAGMan files needed for workflow submission. 

357 

358 Parameters 

359 ---------- 

360 out_prefix : `str` 

361 Directory prefix for HTCondor files. 

362 """ 

363 self.submit_path = out_prefix 

364 os.makedirs(out_prefix, exist_ok=True) 

365 

366 # Write down the workflow in HTCondor format. 

367 self.dag.write(out_prefix, "jobs/{self.label}") 

368 

369 

370def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix): 

371 """Convert GenericWorkflow job nodes to DAG jobs. 

372 

373 Parameters 

374 ---------- 

375 subdir_template : `str` 

376 Template for making subdirs. 

377 site_values : `dict` 

378 Site specific values 

379 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

380 Generic workflow that is being converted. 

381 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

382 The generic job to convert to a HTCondor job. 

383 out_prefix : `str` 

384 Directory prefix for HTCondor files. 

385 

386 Returns 

387 ------- 

388 htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob` 

389 The HTCondor job equivalent to the given generic job. 

390 """ 

391 htc_job = HTCJob(gwjob.name, label=gwjob.label) 

392 

393 curvals = defaultdict(str) 

394 curvals["label"] = gwjob.label 

395 if gwjob.tags: 

396 curvals.update(gwjob.tags) 

397 

398 subdir = subdir_template.format_map(curvals) 

399 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub" 

400 

401 htc_job_cmds = { 

402 "universe": "vanilla", 

403 "should_transfer_files": "YES", 

404 "when_to_transfer_output": "ON_EXIT_OR_EVICT", 

405 "transfer_output_files": '""', # Set to empty string to disable 

406 "transfer_executable": "False", 

407 "getenv": "True", 

408 

409 # Exceeding memory sometimes triggers a SIGBUS error. Tell HTCondor 

410 # to put SIGBUS jobs on hold. 

411 "on_exit_hold": "(ExitBySignal == true) && (ExitSignal == 7)", 

412 "on_exit_hold_reason": '"Job raised a signal 7. Usually means job has gone over memory limit."', 

413 "on_exit_hold_subcode": "34" 

414 } 

415 

416 htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob)) 

417 

418 # job stdout, stderr, htcondor user log. 

419 for key in ("output", "error", "log"): 

420 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}") 

421 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key]) 

422 

423 htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], 

424 out_prefix)) 

425 

426 # Add the job cmds dict to the job object. 

427 htc_job.add_job_cmds(htc_job_cmds) 

428 

429 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob)) 

430 

431 # Add job attributes to job. 

432 _LOG.debug("gwjob.attrs = %s", gwjob.attrs) 

433 htc_job.add_job_attrs(gwjob.attrs) 

434 htc_job.add_job_attrs(site_values["attrs"]) 

435 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)}) 

436 htc_job.add_job_attrs({"bps_job_name": gwjob.name, 

437 "bps_job_label": gwjob.label}) 

438 

439 return htc_job 

440 
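# Rough sketch (assumed job name, label, and subDirTemplate="{label}"; not
# output captured from a real run) of the submit commands a job named
# "calibrate_903342_10" with label "calibrate" could end up with:
#
#     universe                = vanilla
#     should_transfer_files   = YES
#     when_to_transfer_output = ON_EXIT_OR_EVICT
#     transfer_executable     = False
#     getenv                  = True
#     output                  = jobs/calibrate/calibrate_903342_10.$(Cluster).out
#     error                   = jobs/calibrate/calibrate_903342_10.$(Cluster).err
#     log                     = jobs/calibrate/calibrate_903342_10.$(Cluster).log
#
# plus whatever _translate_job_cmds() and _handle_job_inputs() add
# (request_memory, arguments, transfer_input_files, etc.).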

441 

442def _translate_job_cmds(cached_vals, generic_workflow, gwjob): 

443 """Translate the job data that are one to one mapping 

444 

445 Parameters 

446 ---------- 

447 cached_vals : `dict` [`str`, `Any`] 

448 Config values common to jobs with same label. 

449 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

450 Generic workflow that contains the job being converted. 

451 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

452 Generic workflow job to be converted. 

453 

454 Returns 

455 ------- 

456 htc_job_commands : `dict` [`str`, `Any`] 

457 Contains commands which can appear in the HTCondor submit description 

458 file. 

459 """ 

460 # Values in the job script that are just name mappings. 

461 job_translation = {"mail_to": "notify_user", 

462 "when_to_mail": "notification", 

463 "request_cpus": "request_cpus", 

464 "priority": "priority", 

465 "category": "category"} 

466 

467 jobcmds = {} 

468 for gwkey, htckey in job_translation.items(): 

469 jobcmds[htckey] = getattr(gwjob, gwkey, None) 

470 

471 # job commands that need modification 

472 if gwjob.number_of_retries: 

473 jobcmds["max_retries"] = f"{gwjob.number_of_retries}" 

474 

475 if gwjob.retry_unless_exit: 

476 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}" 

477 

478 if gwjob.request_disk: 

479 jobcmds["request_disk"] = f"{gwjob.request_disk}MB" 

480 

481 if gwjob.request_memory: 

482 jobcmds["request_memory"] = f"{gwjob.request_memory}" 

483 

484 if gwjob.memory_multiplier: 

485 # Do not use try-except! At the moment, BpsConfig returns an empty 

486 # string if it does not contain the key. 

487 memory_limit = cached_vals["memoryLimit"] 

488 if not memory_limit: 

489 raise RuntimeError("Memory autoscaling enabled, but automatic detection of the memory limit " 

490 "failed; setting it explicitly with 'memoryLimit' or changing worker node " 

491 "search pattern 'executeMachinesPattern' might help.") 

492 jobcmds["request_memory"] = _create_request_memory_expr(gwjob.request_memory, gwjob.memory_multiplier) 

493 

494 # Periodically release jobs which are being held due to exceeding 

495 # memory. Stop doing that (by removing the job from the HTCondor queue) 

496 # after the maximal number of retries has been reached or the memory 

497 # requirements cannot be satisfied. 

498 jobcmds["periodic_release"] = \ 

499 "NumJobStarts <= JobMaxRetries && (HoldReasonCode == 34 || HoldReasonSubCode == 34)" 

500 jobcmds["periodic_remove"] = \ 

501 f"JobStatus == 1 && RequestMemory > {memory_limit} || " \ 

502 f"JobStatus == 5 && NumJobStarts > JobMaxRetries" 

503 

504 # Assume concurrency_limit implemented using HTCondor concurrency limits. 

505 # May need to move to special site-specific implementation if sites use 

506 # other mechanisms. 

507 if gwjob.concurrency_limit: 

508 jobcmds["concurrency_limit"] = gwjob.concurrency_limit 

509 

510 # Handle command line 

511 if gwjob.executable.transfer_executable: 

512 jobcmds["transfer_executable"] = "True" 

513 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri) 

514 else: 

515 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri) 

516 

517 if gwjob.arguments: 

518 arguments = gwjob.arguments 

519 arguments = _replace_cmd_vars(arguments, gwjob) 

520 arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob) 

521 arguments = _fix_env_var_syntax(arguments) 

522 jobcmds["arguments"] = arguments 

523 

524 # Add extra "pass-thru" job commands 

525 if gwjob.profile: 

526 for key, val in gwjob.profile.items(): 

527 jobcmds[key] = htc_escape(val) 

528 for key, val in cached_vals["profile"].items(): 

529 jobcmds[key] = htc_escape(val) 

530 

531 return jobcmds 

532 

533 

534def _translate_dag_cmds(gwjob): 

535 """Translate job values into DAGMan commands. 

536 

537 Parameters 

538 ---------- 

539 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

540 Job containing values to be translated. 

541 

542 Returns 

543 ------- 

544 dagcmds : `dict` [`str`, `Any`] 

545 DAGMan commands for the job. 

546 """ 

547 # Values in the dag script that are just name mappings. 

548 dag_translation = {"abort_on_value": "abort_dag_on", 

549 "abort_return_value": "abort_exit"} 

550 

551 dagcmds = {} 

552 for gwkey, htckey in dag_translation.items(): 

553 dagcmds[htckey] = getattr(gwjob, gwkey, None) 

554 

555 # Still to be coded: vars "pre_cmdline", "post_cmdline" 

556 return dagcmds 

557 

558 

559def _fix_env_var_syntax(oldstr): 

560 """Change ENV place holders to HTCondor Env var syntax. 

561 

562 Parameters 

563 ---------- 

564 oldstr : `str` 

565 String in which environment variable syntax is to be fixed. 

566 

567 Returns 

568 ------- 

569 newstr : `str` 

570 Given string with environment variable syntax fixed. 

571 """ 

572 newstr = oldstr 

573 for key in re.findall(r"<ENV:([^>]+)>", oldstr): 

574 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})") 

575 return newstr 

576 
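# Example (illustrative): the regex above rewrites BPS-style <ENV:...>
# placeholders into HTCondor's $ENV() syntax, e.g.
#
#     _fix_env_var_syntax("<ENV:HOME>/repo/butler.yaml")
#     # -> "$ENV(HOME)/repo/butler.yaml"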

577 

578def _replace_file_vars(use_shared, arguments, workflow, gwjob): 

579 """Replace file placeholders in command line arguments with correct 

580 physical file names. 

581 

582 Parameters 

583 ---------- 

584 use_shared : `bool` 

585 Whether HTCondor can assume shared filesystem. 

586 arguments : `str` 

587 Arguments string in which to replace file placeholders. 

588 workflow : `lsst.ctrl.bps.GenericWorkflow` 

589 Generic workflow that contains file information. 

590 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

591 The job corresponding to the arguments. 

592 

593 Returns 

594 ------- 

595 arguments : `str` 

596 Given arguments string with file placeholders replaced. 

597 """ 

598 # Replace input file placeholders with paths. 

599 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False): 

600 if not gwfile.wms_transfer: 

601 # Must assume full URI if in command line and told WMS is not 

602 # responsible for transferring file. 

603 uri = gwfile.src_uri 

604 elif use_shared: 

605 if gwfile.job_shared: 

606 # Have shared filesystems and jobs can share file. 

607 uri = gwfile.src_uri 

608 else: 

609 # Taking advantage of inside knowledge. Not future-proof. 

610 # Temporary fix until there is a job wrapper that pulls files 

611 # within the job. 

612 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml": 

613 uri = "butler.yaml" 

614 else: 

615 uri = os.path.basename(gwfile.src_uri) 

616 else: # Using push transfer 

617 uri = os.path.basename(gwfile.src_uri) 

618 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

619 

620 # Replace output file placeholders with paths. 

621 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False): 

622 if not gwfile.wms_transfer: 

623 # Must assume full URI if in command line and told WMS is not 

624 # responsible for transferring file. 

625 uri = gwfile.src_uri 

626 elif use_shared: 

627 if gwfile.job_shared: 

628 # Have shared filesystems and jobs can share file. 

629 uri = gwfile.src_uri 

630 else: 

631 uri = os.path.basename(gwfile.src_uri) 

632 else: # Using push transfer 

633 uri = os.path.basename(gwfile.src_uri) 

634 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

635 return arguments 

636 

637 

638def _replace_cmd_vars(arguments, gwjob): 

639 """Replace format-style placeholders in arguments. 

640 

641 Parameters 

642 ---------- 

643 arguments : `str` 

644 Arguments string in which to replace placeholders. 

645 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

646 Job containing values to be used to replace placeholders 

647 (in particular gwjob.cmdvals). 

648 

649 Returns 

650 ------- 

651 arguments : `str` 

652 Given arguments string with placeholders replaced. 

653 """ 

654 try: 

655 arguments = arguments.format(**gwjob.cmdvals) 

656 except (KeyError, TypeError): # TypeError in case None instead of {} 

657 _LOG.error("Could not replace command variables:\n" 

658 "arguments: %s\n" 

659 "cmdvals: %s", arguments, gwjob.cmdvals) 

660 raise 

661 return arguments 

662 
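# Example (illustrative, made-up cmdvals): the placeholders are plain
# str.format() fields, so
#
#     gwjob.cmdvals = {"butlerConfig": "butler.yaml", "qgraphFile": "job.qgraph"}
#     _replace_cmd_vars("-b {butlerConfig} -g {qgraphFile}", gwjob)
#     # -> "-b butler.yaml -g job.qgraph"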

663 

664def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str): 

665 """Add job input files from generic workflow to job. 

666 

667 Parameters 

668 ---------- 

669 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

670 The generic workflow (e.g., has executable name and arguments). 

671 job_name : `str` 

672 Unique name for the job. 

673 use_shared : `bool` 

674 Whether job has access to files via shared filesystem. 

675 out_prefix : `str` 

676 The root directory into which all WMS-specific files are written. 

677 

678 Returns 

679 ------- 

680 htc_commands : `dict` [`str`, `str`] 

681 HTCondor commands for the job submission script. 

682 """ 

683 htc_commands = {} 

684 inputs = [] 

685 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True): 

686 _LOG.debug("src_uri=%s", gwf_file.src_uri) 

687 

688 uri = Path(gwf_file.src_uri) 

689 

690 # Note if use_shared and job_shared, don't need to transfer file. 

691 

692 if not use_shared: # Copy file using push to job 

693 inputs.append(str(uri.relative_to(out_prefix))) 

694 elif not gwf_file.job_shared: # Jobs require own copy 

695 

696 # if using shared filesystem, but still need copy in job. Use 

697 # HTCondor's curl plugin for a local copy. 

698 

699 # Execution butler is represented as a directory which the 

700 # curl plugin does not handle. Taking advantage of inside 

701 # knowledge as a temporary fix until there is a job wrapper that pulls 

702 # files within the job. 

703 if gwf_file.name == "butlerConfig": 

704 # The execution butler directory doesn't normally exist until 

705 # the submit phase so checking for suffix instead of using 

706 # is_dir(). If another non-yaml file exists, it would have a 

707 # different gwf_file.name. 

708 if uri.suffix == ".yaml": # Single file, so just copy. 

709 inputs.append(f"file://{uri}") 

710 else: 

711 inputs.append(f"file://{uri / 'butler.yaml'}") 

712 inputs.append(f"file://{uri / 'gen3.sqlite3'}") 

713 elif uri.is_dir(): 

714 raise RuntimeError("HTCondor plugin cannot transfer directories locally within job " 

715 f"{gwf_file.src_uri}") 

716 else: 

717 inputs.append(f"file://{uri}") 

718 

719 if inputs: 

720 htc_commands["transfer_input_files"] = ",".join(inputs) 

721 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"]) 

722 return htc_commands 

723 
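# Example (illustrative paths, not from a real workflow): with use_shared=False
# every transfer-only input ends up listed relative to the submit directory,
# for instance
#
#     transfer_input_files = inputs/quantum.qgraph,butler.yaml
#
# while with a shared filesystem only files a job needs its own copy of
# (job_shared=False) are handed to HTCondor's file transfer as file:// URIs.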

724 

725def _report_from_path(wms_path): 

726 """Gather run information from a given run directory. 

727 

728 Parameters 

729 ---------- 

730 wms_path : `str` 

731 The directory containing the submit side files (e.g., HTCondor files). 

732 

733 Returns 

734 ------- 

735 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

736 Run information for the detailed report. The key is the HTCondor id 

737 and the value is a collection of report information for that run. 

738 message : `str` 

739 Message to be printed with the summary report. 

740 """ 

741 wms_workflow_id, jobs, message = _get_info_from_path(wms_path) 

742 if wms_workflow_id == MISSING_ID: 

743 run_reports = {} 

744 else: 

745 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs) 

746 return run_reports, message 

747 

748 

749def _report_from_id(wms_workflow_id, hist): 

750 """Gather run information from a given run directory. 

751 

752 Parameters 

753 ---------- 

754 wms_workflow_id : `int` or `str` 

755 Limit to specific run based on id. 

756 hist : `float` 

757 Limit history search to this many days. 

758 

759 Returns 

760 ------- 

761 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

762 Run information for the detailed report. The key is the HTCondor id 

763 and the value is a collection of report information for that run. 

764 message : `str` 

765 Message to be printed with the summary report. 

766 """ 

767 constraint = f"(DAGManJobId == {int(float(wms_workflow_id))} || ClusterId == " \ 

768 f"{int(float(wms_workflow_id))})" 

769 jobs = condor_q(constraint) 

770 if hist: 

771 epoch = (datetime.now() - timedelta(days=hist)).timestamp() 

772 constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})" 

773 hist_jobs = condor_history(constraint) 

774 _update_jobs(jobs, hist_jobs) 

775 

776 # keys in dictionary will be strings of format "ClusterId.ProcId" 

777 wms_workflow_id = str(wms_workflow_id) 

778 if not wms_workflow_id.endswith(".0"): 

779 wms_workflow_id += ".0" 

780 

781 if wms_workflow_id in jobs: 

782 _, path_jobs, message = _get_info_from_path(jobs[wms_workflow_id]["Iwd"]) 

783 _update_jobs(jobs, path_jobs) 

784 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs) 

785 else: 

786 run_reports = {} 

787 message = f"Found 0 records for run id {wms_workflow_id}" 

788 return run_reports, message 

789 

790 

791def _get_info_from_path(wms_path): 

792 """Gather run information from a given run directory. 

793 

794 Parameters 

795 ---------- 

796 wms_path : `str` 

797 Directory containing HTCondor files. 

798 

799 Returns 

800 ------- 

801 wms_workflow_id : `str` 

802 The run id which is a DAGman job id. 

803 jobs : `dict` [`str`, `dict` [`str`, `Any`]] 

804 Information about jobs read from files in the given directory. 

805 The key is the HTCondor id and the value is a dictionary of HTCondor 

806 keys and values. 

807 message : `str` 

808 Message to be printed with the summary report. 

809 """ 

810 try: 

811 wms_workflow_id, jobs = read_dag_log(wms_path) 

812 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs) 

813 _update_jobs(jobs, read_node_status(wms_path)) 

814 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs) 

815 

816 # Add more info for DAGman job 

817 job = jobs[wms_workflow_id] 

818 job.update(read_dag_status(wms_path)) 

819 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs) 

820 if "bps_run" not in job: 

821 _add_run_info(wms_path, job) 

822 

823 message = htc_check_dagman_output(wms_path) 

824 _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id, 

825 jobs[wms_workflow_id]["total_jobs"]) 

826 except StopIteration: 

827 message = f"Could not find HTCondor files in {wms_path}" 

828 _LOG.warning(message) 

829 wms_workflow_id = MISSING_ID 

830 jobs = {} 

831 

832 return wms_workflow_id, jobs, message 

833 

834 

835def _create_detailed_report_from_jobs(wms_workflow_id, jobs): 

836 """Gather run information to be used in generating summary reports. 

837 

838 Parameters 

839 ---------- 

840 wms_workflow_id : `str` 

841 The run id for which to create the report. 

842 jobs : `dict` [`str`, `dict` [`str`, `Any`]] 

843 Mapping of HTCondor job id to job information. 

844 

845 Returns 

846 ------- 

847 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

848 Run information for the detailed report. The key is the given HTCondor 

849 id and the value is a collection of report information for that run. 

850 """ 

851 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id]) 

852 dag_job = jobs[wms_workflow_id] 

853 if "total_jobs" not in dag_job or "DAGNodeName" in dag_job: 

854 _LOG.error("Job ID %s is not a DAG job.", wms_workflow_id) 

855 return {} 

856 report = WmsRunReport(wms_id=wms_workflow_id, 

857 path=dag_job["Iwd"], 

858 label=dag_job.get("bps_job_label", "MISS"), 

859 run=dag_job.get("bps_run", "MISS"), 

860 project=dag_job.get("bps_project", "MISS"), 

861 campaign=dag_job.get("bps_campaign", "MISS"), 

862 payload=dag_job.get("bps_payload", "MISS"), 

863 operator=_get_owner(dag_job), 

864 run_summary=_get_run_summary(dag_job), 

865 state=_htc_status_to_wms_state(dag_job), 

866 jobs=[], 

867 total_number_jobs=dag_job["total_jobs"], 

868 job_state_counts=dag_job["state_counts"]) 

869 

870 try: 

871 for job in jobs.values(): 

872 if job["ClusterId"] != int(float(wms_workflow_id)): 

873 job_report = WmsJobReport(wms_id=job["ClusterId"], 

874 name=job.get("DAGNodeName", str(job["ClusterId"])), 

875 label=job.get("bps_job_label", 

876 pegasus_name_to_label(job["DAGNodeName"])), 

877 state=_htc_status_to_wms_state(job)) 

878 if job_report.label == "init": 

879 job_report.label = "pipetaskInit" 

880 report.jobs.append(job_report) 

881 except KeyError as ex: 

882 _LOG.error("Job missing key '%s': %s", str(ex), job) 

883 raise 

884 

885 run_reports = {report.wms_id: report} 

886 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports) 

887 return run_reports 

888 

889 

890def _summary_report(user, hist, pass_thru): 

891 """Gather run information to be used in generating summary reports. 

892 

893 Parameters 

894 ---------- 

895 user : `str` 

896 Run lookup restricted to given user. 

897 hist : `float` 

898 How many previous days to search for run information. 

899 pass_thru : `str` 

900 Advanced users can define the HTCondor constraint to be used 

901 when searching queue and history. 

902 

903 Returns 

904 ------- 

905 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

906 Run information for the summary report. The keys are HTCondor ids and 

907 the values are collections of report information for each run. 

908 message : `str` 

909 Message to be printed with the summary report. 

910 """ 

911 # only doing summary report so only look for dagman jobs 

912 if pass_thru: 

913 constraint = pass_thru 

914 else: 

915 # Notes: 

916 # * bps_isjob == 'True' isn't getting set for DAG jobs that are 

917 # manually restarted. 

918 # * Any job with DAGManJobID isn't a DAG job 

919 constraint = 'bps_isjob == "True" && JobUniverse == 7' 

920 if user: 

921 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")' 

922 

923 # Check runs in queue. 

924 jobs = condor_q(constraint) 

925 

926 if hist: 

927 epoch = (datetime.now() - timedelta(days=hist)).timestamp() 

928 constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})" 

929 hist_jobs = condor_history(constraint) 

930 _update_jobs(jobs, hist_jobs) 

931 

932 _LOG.debug("Job ids from queue and history %s", jobs.keys()) 

933 

934 # Have list of DAGMan jobs, need to get run_report info. 

935 run_reports = {} 

936 for job in jobs.values(): 

937 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

938 # If didn't get from queue information (e.g., Kerberos bug), 

939 # try reading from file. 

940 if total_jobs == 0: 

941 try: 

942 job.update(read_dag_status(job["Iwd"])) 

943 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

944 except StopIteration: 

945 pass # don't kill the report if the HTCondor files can't be found 

946 

947 if "bps_run" not in job: 

948 _add_run_info(job["Iwd"], job) 

949 report = WmsRunReport(wms_id=str(job.get("ClusterId", MISSING_ID)), 

950 path=job["Iwd"], 

951 label=job.get("bps_job_label", "MISS"), 

952 run=job.get("bps_run", "MISS"), 

953 project=job.get("bps_project", "MISS"), 

954 campaign=job.get("bps_campaign", "MISS"), 

955 payload=job.get("bps_payload", "MISS"), 

956 operator=_get_owner(job), 

957 run_summary=_get_run_summary(job), 

958 state=_htc_status_to_wms_state(job), 

959 jobs=[], 

960 total_number_jobs=total_jobs, 

961 job_state_counts=state_counts) 

962 

963 run_reports[report.wms_id] = report 

964 

965 return run_reports, "" 

966 

967 

968def _add_run_info(wms_path, job): 

969 """Find BPS run information elsewhere for runs without bps attributes. 

970 

971 Parameters 

972 ---------- 

973 wms_path : `str` 

974 Path to submit files for the run. 

975 job : `dict` [`str`, `Any`] 

976 HTCondor dag job information. 

977 

978 Raises 

979 ------ 

980 StopIteration 

981 If the file being looked for cannot be found. Permission errors are 

982 caught and the job's run is marked with an error. 

983 """ 

984 path = Path(wms_path) / "jobs" 

985 try: 

986 subfile = next(path.glob("**/*.sub")) 

987 except (StopIteration, PermissionError): 

988 job["bps_run"] = "Unavailable" 

989 else: 

990 _LOG.debug("_add_run_info: subfile = %s", subfile) 

991 try: 

992 with open(subfile, "r", encoding='utf-8') as fh: 

993 for line in fh: 

994 if line.startswith("+bps_"): 

995 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line) 

996 if m: 

997 _LOG.debug("Matching line: %s", line) 

998 job[m.group(1)] = m.group(2).replace('"', "") 

999 else: 

1000 _LOG.debug("Could not parse attribute: %s", line) 

1001 except PermissionError: 

1002 job["bps_run"] = "PermissionError" 

1003 _LOG.debug("After adding job = %s", job) 

1004 

1005 

1006def _get_owner(job): 

1007 """Get the owner of a dag job. 

1008 

1009 Parameters 

1010 ---------- 

1011 job : `dict` [`str`, `Any`] 

1012 HTCondor dag job information. 

1013 

1014 Returns 

1015 ------- 

1016 owner : `str` 

1017 Owner of the dag job. 

1018 """ 

1019 owner = job.get("bps_operator", None) 

1020 if not owner: 

1021 owner = job.get("Owner", None) 

1022 if not owner: 

1023 _LOG.warning("Could not get Owner from htcondor job: %s", job) 

1024 owner = "MISS" 

1025 return owner 

1026 

1027 

1028def _get_run_summary(job): 

1029 """Get the run summary for a job. 

1030 

1031 Parameters 

1032 ---------- 

1033 job : `dict` [`str`, `Any`] 

1034 HTCondor dag job information. 

1035 

1036 Returns 

1037 ------- 

1038 summary : `str` 

1039 Number of jobs per PipelineTask label in approximate pipeline order. 

1040 Format: <label>:<count>[;<label>:<count>]+ 

1041 """ 

1042 summary = job.get("bps_job_summary", job.get("bps_run_summary", None)) 

1043 if not summary: 

1044 summary, _ = summary_from_dag(job["Iwd"]) 

1045 if not summary: 

1046 _LOG.warning("Could not get run summary for htcondor job: %s", job) 

1047 _LOG.debug("_get_run_summary: summary=%s", summary) 

1048 

1049 # Workaround sometimes using init vs pipetaskInit 

1050 summary = summary.replace("init:", "pipetaskInit:") 

1051 

1052 if "pegasus_version" in job and "pegasus" not in summary: 

1053 summary += ";pegasus:0" 

1054 

1055 return summary 

1056 

1057 

1058def _get_state_counts_from_jobs(wms_workflow_id, jobs): 

1059 """Count number of jobs per WMS state. 

1060 

1061 Parameters 

1062 ---------- 

1063 wms_workflow_id : `str` 

1064 HTCondor job id. 

1065 jobs : `dict` [`str`, `Any`] 

1066 HTCondor dag job information. 

1067 

1068 Returns 

1069 ------- 

1070 total_count : `int` 

1071 Total number of dag nodes. 

1072 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1073 Keys are the different WMS states and values are counts of jobs 

1074 that are in that WMS state. 

1075 """ 

1076 state_counts = dict.fromkeys(WmsStates, 0) 

1077 

1078 for jid, jinfo in jobs.items(): 

1079 if jid != wms_workflow_id: 

1080 state_counts[_htc_status_to_wms_state(jinfo)] += 1 

1081 

1082 total_counted = sum(state_counts.values()) 

1083 if "NodesTotal" in jobs[wms_workflow_id]: 

1084 total_count = jobs[wms_workflow_id]["NodesTotal"] 

1085 else: 

1086 total_count = total_counted 

1087 

1088 state_counts[WmsStates.UNREADY] += total_count - total_counted 

1089 

1090 return total_count, state_counts 

1091 
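# Worked example (made-up numbers): if the DAG reports NodesTotal = 10 but only
# 6 child jobs appear in `jobs` (say 4 SUCCEEDED, 1 RUNNING, 1 FAILED), the
# remaining 10 - 6 = 4 nodes are added to the UNREADY count so the per-state
# counts always sum to the total number of DAG nodes.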

1092 

1093def _get_state_counts_from_dag_job(job): 

1094 """Count number of jobs per WMS state. 

1095 

1096 Parameters 

1097 ---------- 

1098 job : `dict` [`str`, `Any`] 

1099 HTCondor dag job information. 

1100 

1101 Returns 

1102 ------- 

1103 total_count : `int` 

1104 Total number of dag nodes. 

1105 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1106 Keys are the different WMS states and values are counts of jobs 

1107 that are in that WMS state. 

1108 """ 

1109 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job)) 

1110 state_counts = dict.fromkeys(WmsStates, 0) 

1111 if "DAG_NodesReady" in job: 

1112 state_counts = { 

1113 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0), 

1114 WmsStates.READY: job.get("DAG_NodesReady", 0), 

1115 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1116 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0), 

1117 WmsStates.FAILED: job.get("DAG_NodesFailed", 0), 

1118 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)} 

1119 total_jobs = job.get("DAG_NodesTotal") 

1120 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs) 

1121 elif "NodesFailed" in job: 

1122 state_counts = { 

1123 WmsStates.UNREADY: job.get("NodesUnready", 0), 

1124 WmsStates.READY: job.get("NodesReady", 0), 

1125 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1126 WmsStates.SUCCEEDED: job.get("NodesDone", 0), 

1127 WmsStates.FAILED: job.get("NodesFailed", 0), 

1128 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)} 

1129 try: 

1130 total_jobs = job.get("NodesTotal") 

1131 except KeyError as ex: 

1132 _LOG.error("Job missing %s. job = %s", str(ex), job) 

1133 raise 

1134 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs) 

1135 else: 

1136 # With Kerberos job auth and the Kerberos bug, a warning here would be 

1137 # printed for every DAG, so only log at debug level. 

1138 _LOG.debug("Can't get job state counts %s", job["Iwd"]) 

1139 total_jobs = 0 

1140 

1141 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts) 

1142 return total_jobs, state_counts 

1143 

1144 

1145def _htc_status_to_wms_state(job): 

1146 """Convert HTCondor job status to generic wms state. 

1147 

1148 Parameters 

1149 ---------- 

1150 job : `dict` [`str`, `Any`] 

1151 HTCondor job information. 

1152 

1153 Returns 

1154 ------- 

1155 wms_state : `WmsStates` 

1156 The equivalent WmsState to given job's status. 

1157 """ 

1158 wms_state = WmsStates.MISFIT 

1159 if "JobStatus" in job: 

1160 wms_state = _htc_job_status_to_wms_state(job) 

1161 elif "NodeStatus" in job: 

1162 wms_state = _htc_node_status_to_wms_state(job) 

1163 return wms_state 

1164 

1165 

1166def _htc_job_status_to_wms_state(job): 

1167 """Convert HTCondor job status to generic wms state. 

1168 

1169 Parameters 

1170 ---------- 

1171 job : `dict` [`str`, `Any`] 

1172 HTCondor job information. 

1173 

1174 Returns 

1175 ------- 

1176 wms_state : `lsst.ctrl.bps.WmsStates` 

1177 The equivalent WmsState to given job's status. 

1178 """ 

1179 _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], 

1180 type(job["JobStatus"])) 

1181 job_status = int(job["JobStatus"]) 

1182 wms_state = WmsStates.MISFIT 

1183 

1184 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status) 

1185 if job_status == JobStatus.IDLE: 

1186 wms_state = WmsStates.PENDING 

1187 elif job_status == JobStatus.RUNNING: 

1188 wms_state = WmsStates.RUNNING 

1189 elif job_status == JobStatus.REMOVED: 

1190 wms_state = WmsStates.DELETED 

1191 elif job_status == JobStatus.COMPLETED: 

1192 if job.get("ExitBySignal", False) or job.get("ExitCode", 0) or \ 

1193 job.get("ExitSignal", 0) or job.get("DAG_Status", 0) or \ 

1194 job.get("ReturnValue", 0): 

1195 wms_state = WmsStates.FAILED 

1196 else: 

1197 wms_state = WmsStates.SUCCEEDED 

1198 elif job_status == JobStatus.HELD: 

1199 wms_state = WmsStates.HELD 

1200 

1201 return wms_state 

1202 

1203 

1204def _htc_node_status_to_wms_state(job): 

1205 """Convert HTCondor status to generic wms state. 

1206 

1207 Parameters 

1208 ---------- 

1209 job : `dict` [`str`, `Any`] 

1210 HTCondor job information. 

1211 

1212 Returns 

1213 ------- 

1214 wms_state : `lsst.ctrl.bps.WmsStates` 

1215 The equivalent WmsState to given node's status. 

1216 """ 

1217 wms_state = WmsStates.MISFIT 

1218 

1219 status = job["NodeStatus"] 

1220 if status == NodeStatus.NOT_READY: 

1221 wms_state = WmsStates.UNREADY 

1222 elif status == NodeStatus.READY: 

1223 wms_state = WmsStates.READY 

1224 elif status == NodeStatus.PRERUN: 

1225 wms_state = WmsStates.MISFIT 

1226 elif status == NodeStatus.SUBMITTED: 

1227 if job["JobProcsHeld"]: 

1228 wms_state = WmsStates.HELD 

1229 elif job["StatusDetails"] == "not_idle": 

1230 wms_state = WmsStates.RUNNING 

1231 elif job["JobProcsQueued"]: 

1232 wms_state = WmsStates.PENDING 

1233 elif status == NodeStatus.POSTRUN: 

1234 wms_state = WmsStates.MISFIT 

1235 elif status == NodeStatus.DONE: 

1236 wms_state = WmsStates.SUCCEEDED 

1237 elif status == NodeStatus.ERROR: 

1238 # Use the job exit status instead of the post script exit status 

1239 if "DAGMAN error 0" in job["StatusDetails"]: 

1240 wms_state = WmsStates.SUCCEEDED 

1241 else: 

1242 wms_state = WmsStates.FAILED 

1243 

1244 return wms_state 

1245 

1246 

1247def _update_jobs(jobs1, jobs2): 

1248 """Update jobs1 with info in jobs2. 

1249 

1250 (Basically an update for nested dictionaries.) 

1251 

1252 Parameters 

1253 ---------- 

1254 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]] 

1255 HTCondor job information to be updated. 

1256 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]] 

1257 Additional HTCondor job information. 

1258 """ 

1259 for jid, jinfo in jobs2.items(): 

1260 if jid in jobs1: 

1261 jobs1[jid].update(jinfo) 

1262 else: 

1263 jobs1[jid] = jinfo 

1264 

1265 

1266def _wms_id_to_cluster(wms_id): 

1267 """Convert WMS ID to cluster ID. 

1268 

1269 Parameters 

1270 ---------- 

1271 wms_id : `int` or `float` or `str` 

1272 HTCondor job id or path. 

1273 

1274 Returns 

1275 ------- 

1276 cluster_id : `int` 

1277 HTCondor cluster id. 

1278 """ 

1279 # If wms_id represents path, get numeric id. 

1280 try: 

1281 cluster_id = int(float(wms_id)) 

1282 except ValueError: 

1283 wms_path = Path(wms_id) 

1284 if wms_path.exists(): 

1285 try: 

1286 cluster_id, _ = read_dag_log(wms_id) 

1287 cluster_id = int(float(cluster_id)) 

1288 except StopIteration: 

1289 cluster_id = 0 

1290 else: 

1291 cluster_id = 0 

1292 return cluster_id 

1293 

1294 

1295def _create_request_memory_expr(memory, multiplier): 

1296 """Construct an HTCondor ClassAd expression for safe memory scaling. 

1297 

1298 Parameters 

1299 ---------- 

1300 memory : `int` 

1301 Requested memory in MB. 

1302 multiplier : `float` 

1303 Memory growth rate between retries. 

1304 

1305 Returns 

1306 ------- 

1307 ad : `str` 

1308 A string representing an HTCondor ClassAd expression enabling safe 

1309 memory scaling between job retries. 

1310 """ 

1311 # ClassAds 'Last*' are UNDEFINED when a job is put in the job queue. 

1312 # The special comparison operators ensure that all comparisons below will 

1313 # evaluate to FALSE in this case. 

1314 was_mem_exceeded = "LastJobStatus =?= 5 " \ 

1315 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " \ 

1316 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)" 

1317 

1318 # If the job runs for the first time or was held for reasons other than 

1319 # exceeding the memory, set the required memory to the requested value or 

1320 # to the memory value measured by HTCondor (MemoryUsage), whichever is 

1321 # greater. 

1322 ad = f"({was_mem_exceeded}) " \ 

1323 f"? int({memory} * pow({multiplier}, NumJobStarts)) " \ 

1324 f": max({{{memory}, MemoryUsage ?: 0}}))" 

1325 return ad 

1326 
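# Worked example (assumed values): for memory=2048 and multiplier=2.0 the
# returned ClassAd expression reads roughly
#
#     (LastJobStatus =?= 5 && (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0
#         || LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34))
#     ? int(2048 * pow(2.0, NumJobStarts))
#     : max({2048, MemoryUsage ?: 0})
#
# so a job held for exceeding memory asks for 4096 MB on its first retry and
# 8192 MB on the second, while other jobs keep the larger of the requested
# memory and the usage HTCondor has measured so far.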

1327 

1328def _gather_site_values(config, compute_site): 

1329 """Gather values specific to given site. 

1330 

1331 Parameters 

1332 ---------- 

1333 config : `lsst.ctrl.bps.BpsConfig` 

1334 BPS configuration that includes necessary submit/runtime 

1335 information. 

1336 compute_site : `str` 

1337 Compute site name. 

1338 

1339 Returns 

1340 ------- 

1341 site_values : `dict` [`str`, `Any`] 

1342 Values specific to the given site. 

1343 """ 

1344 site_values = {"attrs": {}, "profile": {}} 

1345 search_opts = {} 

1346 if compute_site: 

1347 search_opts["curvals"] = {"curr_site": compute_site} 

1348 

1349 # Determine the hard limit for the memory requirement. 

1350 found, limit = config.search('memoryLimit', opt=search_opts) 

1351 if not found: 

1352 search_opts["default"] = DEFAULT_HTC_EXEC_PATT 

1353 _, patt = config.search("executeMachinesPattern", opt=search_opts) 

1354 del search_opts["default"] 

1355 

1356 # To reduce the amount of data, ignore dynamic slots (if any) as, 

1357 # by definition, they cannot have more memory than 

1358 # the partitionable slot they are part of. 

1359 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)' 

1360 pool_info = condor_status(constraint=constraint) 

1361 try: 

1362 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values()) 

1363 except ValueError: 

1364 _LOG.debug("No execute machine in the pool matches %s", patt) 

1365 if limit: 

1366 config[".bps_defined.memory_limit"] = limit 

1367 

1368 _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False}) 

1369 site_values["memoryLimit"] = limit 

1370 

1371 key = f".site.{compute_site}.profile.condor" 

1372 if key in config: 

1373 for key, val in config[key].items(): 

1374 if key.startswith("+"): 

1375 site_values["attrs"][key[1:]] = val 

1376 else: 

1377 site_values["profile"][key] = val 

1378 

1379 return site_values
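# Illustrative example (hypothetical site and values) of the shape of the
# dictionary returned by _gather_site_values():
#
#     {
#         "attrs": {"MY_ATTR": "some_value"},      # from "+"-prefixed condor profile keys
#         "profile": {"requirements": "..."},      # remaining condor profile keys
#         "bpsUseShared": False,
#         "memoryLimit": 122880,
#     }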