# This file is part of ctrl_bps.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

"""Interface between generic workflow and HTCondor workflow system.
"""

__all__ = ["HTCondorService", "HTCondorWorkflow"]


import dataclasses
import os
import re
import logging
from datetime import datetime, timedelta
from pathlib import Path

import htcondor

from ... import (
    BaseWmsWorkflow,
    BaseWmsService,
    GenericWorkflow,
    GenericWorkflowJob,
    WmsRunReport,
    WmsJobReport,
    WmsStates
)
from ...bps_utils import chdir
from .lssthtc import (
    HTCDag,
    HTCJob,
    MISSING_ID,
    JobStatus,
    NodeStatus,
    htc_check_dagman_output,
    htc_escape,
    htc_submit_dag,
    read_dag_log,
    read_dag_status,
    read_node_status,
    condor_history,
    condor_q,
    condor_status,
    pegasus_name_to_label,
    summary_from_dag,
)


DEFAULT_HTC_EXEC_PATT = ".*worker.*"
"""Default pattern for searching execute machines in an HTCondor pool.
"""

_LOG = logging.getLogger(__name__)

class HTCondorService(BaseWmsService):
    """HTCondor version of WMS service.
    """
    def prepare(self, config, generic_workflow, out_prefix=None):
        """Convert generic workflow to an HTCondor DAG ready for submission.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The generic workflow (e.g., has executable name and arguments).
        out_prefix : `str`
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow`
            HTCondor workflow ready to be run.
        """
        _LOG.debug("out_prefix = '%s'", out_prefix)
        workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix,
                                                          f"{self.__class__.__module__}."
                                                          f"{self.__class__.__name__}")
        workflow.write(out_prefix)
        return workflow

    def submit(self, workflow):
        """Submit a single HTCondor workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWorkflow`
            A single HTCondor workflow to submit.  run_id is updated after
            successful submission to WMS.
        """
        # For workflow portability, internal paths are all relative. Hence
        # the DAG needs to be submitted to HTCondor from inside the submit
        # directory.
        with chdir(workflow.submit_path):
            _LOG.info("Submitting from directory: %s", os.getcwd())
            htc_submit_dag(workflow.dag, dict())
            workflow.run_id = workflow.dag.run_id

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions.  Typically
            this means top-level jobs (i.e., not children jobs).
        """
        _LOG.debug("list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s",
                   wms_id, user, require_bps, pass_thru)
        constraint = ""

        if wms_id is None:
            if user is not None:
                constraint = f'(Owner == "{user}")'
        else:
            cluster_id = _wms_id_to_cluster(wms_id)
            if cluster_id != 0:
                constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"

        if require_bps:
            constraint += ' && (bps_isjob == "True")'

        if pass_thru:
            if "-forcex" in pass_thru:
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f" && ({pass_thru_2})"
            else:
                constraint += f" && ({pass_thru})"

        _LOG.debug("constraint = %s", constraint)
        jobs = condor_q(constraint)

        # Prune child jobs whose parent DAG job is in the queue (i.e., they
        # aren't orphans).
        job_ids = []
        for job_id, job_info in jobs.items():
            _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_info.get("DAGManJobId", "None"))
            if "DAGManJobId" not in job_info:  # orphaned job
                job_ids.append(job_id)
            else:
                _LOG.debug("Looking for %s", f"{job_info['DAGManJobId']}.0")
                _LOG.debug("\tin jobs.keys() = %s", jobs.keys())
                if f"{job_info['DAGManJobId']}.0" not in jobs:
                    job_ids.append(job_id)

        _LOG.debug("job_ids = %s", job_ids)
        return job_ids


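    # Illustrative sketch only (the id value is hypothetical): for wms_id 1234
    # with require_bps left at its default, the constraint built in
    # list_submitted_jobs() above would look like
    #
    #     (DAGManJobId == 1234 || ClusterId == 1234) && (bps_isjob == "True")
    #
    # and condor_q() would then be asked for jobs matching it.
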
    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None):
        """Return run information based upon given constraints.

        Parameters
        ----------
        wms_workflow_id : `str`
            Limit to specific run based on id.
        user : `str`
            Limit results to runs for this user.
        hist : `float`
            Limit history search to this many days.
        pass_thru : `str`
            Constraints to pass through to HTCondor.

        Returns
        -------
        runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
            Information about runs from given job information.
        message : `str`
            Extra message for report command to print.  This could be pointers
            to documentation or to WMS specific commands.
        """
        message = ""

        if wms_workflow_id:
            # Explicitly checking if wms_workflow_id can be converted to a
            # float instead of using try/except to avoid catching a different
            # ValueError from _report_from_id
            try:
                float(wms_workflow_id)
                is_float = True
            except ValueError:  # Don't need TypeError here as None goes to else branch.
                is_float = False

            if is_float:
                run_reports, message = _report_from_id(float(wms_workflow_id), hist)
            else:
                run_reports, message = _report_from_path(wms_workflow_id)
        else:
            run_reports, message = _summary_report(user, hist, pass_thru)
        _LOG.debug("report: %s, %s", run_reports, message)

        return list(run_reports.values()), message

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            ID or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether the deletion was successful.  Currently returns False if
            there is any doubt or if any individual job was not deleted.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        _LOG.debug("Canceling wms_id = %s", wms_id)

        cluster_id = _wms_id_to_cluster(wms_id)
        if cluster_id == 0:
            deleted = False
            message = "Invalid id"
        else:
            _LOG.debug("Canceling cluster_id = %s", cluster_id)
            schedd = htcondor.Schedd()
            constraint = f"ClusterId == {cluster_id}"
            if pass_thru is not None and "-forcex" in pass_thru:
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f" && ({pass_thru_2})"
                _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.RemoveX, constraint)
            else:
                if pass_thru:
                    constraint += f" && ({pass_thru})"
                _LOG.debug("JobAction.Remove constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.Remove, constraint)
            _LOG.debug("Remove results: %s", results)

            if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
                deleted = True
                message = ""
            else:
                deleted = False
                if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
                    message = "no such bps job in batch queue"
                else:
                    message = f"unknown problems deleting: {results}"

        _LOG.debug("deleted: %s; message = %s", deleted, message)
        return deleted, message



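# A minimal usage sketch, not part of the module: roughly how the service
# above is driven.  The config, generic workflow, and submit directory are
# hypothetical placeholders, and the constructor call reflects the assumption
# that BaseWmsService takes the BPS configuration.
#
#     service = HTCondorService(config)
#     workflow = service.prepare(config, generic_workflow, out_prefix="submit/my_run")
#     service.submit(workflow)
#     print(workflow.run_id)   # set by submit() after a successful DAG submission
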
class HTCondorWorkflow(BaseWmsWorkflow):
    """Single HTCondor workflow.

    Parameters
    ----------
    name : `str`
        Unique name for Workflow used when naming files.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """
    def __init__(self, name, config=None):
        super().__init__(name, config)
        self.dag = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited
        htc_workflow = cls(generic_workflow.name, config)
        htc_workflow.dag = HTCDag(name=generic_workflow.name)

        _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs({"bps_wms_service": service_class,
                                      "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}"})

        # Determine the hard limit for the memory requirement.
        found, limit = config.search('memoryLimit')
        if not found:
            search_opts = {"default": DEFAULT_HTC_EXEC_PATT}
            _, site = config.search("computeSite")
            if site:
                search_opts["curvals"] = {"curr_site": site}
            _, patt = config.search("executeMachinesPattern", opt=search_opts)

            # To reduce the amount of data, ignore dynamic slots (if any) as,
            # by definition, they cannot have more memory than the
            # partitionable slot they are part of.
            constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)'
            pool_info = condor_status(constraint=constraint)
            try:
                limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values())
            except ValueError:
                _LOG.debug("No execute machine in the pool matches %s", patt)
        if limit:
            config[".bps_defined.memory_limit"] = limit

        # Create all DAG jobs
        for job_name in generic_workflow:
            gwjob = generic_workflow.get_job(job_name)
            htc_job = HTCondorWorkflow._create_job(config, generic_workflow, gwjob, out_prefix)
            htc_workflow.dag.add_job(htc_job)

        # Add job dependencies to the DAG
        for job_name in generic_workflow:
            htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))

        # If final job exists in generic workflow, create DAG final job
        final = generic_workflow.get_final()
        if final and isinstance(final, GenericWorkflowJob):
            final_htjob = HTCondorWorkflow._create_job(config, generic_workflow, final, out_prefix)
            if "post" not in final_htjob.dagcmds:
                final_htjob.dagcmds["post"] = f"{os.path.dirname(__file__)}/final_post.sh" \
                                              f" {final.name} $DAG_STATUS $RETURN"
            htc_workflow.dag.add_final_job(final_htjob)
        elif final and isinstance(final, GenericWorkflow):
            raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
        elif final:
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

        return htc_workflow

    @staticmethod
    def _create_job(config, generic_workflow, gwjob, out_prefix):
        """Convert GenericWorkflow job nodes to DAG jobs.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow that is being converted.
        gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
            The generic job to convert to a HTCondor job.
        out_prefix : `str`
            Directory prefix for HTCondor files.

        Returns
        -------
        htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
            The HTCondor job equivalent to the given generic job.
        """
        htc_job = HTCJob(gwjob.name, label=gwjob.label)

        curvals = dataclasses.asdict(gwjob)
        if gwjob.tags:
            curvals.update(gwjob.tags)
        found, subdir = config.search("subDirTemplate", opt={'curvals': curvals})
        if not found:
            subdir = "jobs"
        htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"

        htc_job_cmds = {
            "universe": "vanilla",
            "should_transfer_files": "YES",
            "when_to_transfer_output": "ON_EXIT_OR_EVICT",
            "transfer_executable": "False",
            "getenv": "True",

            # Exceeding memory sometimes triggers a SIGBUS error.  Tell
            # HTCondor to put SIGBUS jobs on hold.
            "on_exit_hold": "(ExitBySignal == true) && (ExitSignal == 7)",
            "on_exit_hold_reason": '"Job raised a signal 7. Usually means job has gone over memory limit."',
            "on_exit_hold_subcode": "34"
        }

        htc_job_cmds.update(_translate_job_cmds(config, generic_workflow, gwjob))

        # job stdout, stderr, htcondor user log.
        for key in ("output", "error", "log"):
            htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
            _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])

        _, use_shared = config.search("bpsUseShared", opt={"default": False})
        htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwjob.name, use_shared, out_prefix))

        # Add the job cmds dict to the job object.
        htc_job.add_job_cmds(htc_job_cmds)

        htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))

        # Add run level attributes to job.
        htc_job.add_job_attrs(generic_workflow.run_attrs)

        # Add job attributes to job.
        _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
        htc_job.add_job_attrs(gwjob.attrs)
        if gwjob.tags:
            htc_job.add_job_attrs({"bps_job_quanta": gwjob.tags.get("quanta_summary", "")})
        htc_job.add_job_attrs({"bps_job_name": gwjob.name,
                               "bps_job_label": gwjob.label})

        return htc_job


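    # Rough sketch, for illustration only, of the kind of per-job submit
    # description _create_job() above produces (ignoring commands added by
    # _translate_job_cmds() and _handle_job_inputs(); the subdirectory and job
    # name are placeholders):
    #
    #     universe = vanilla
    #     should_transfer_files = YES
    #     when_to_transfer_output = ON_EXIT_OR_EVICT
    #     transfer_executable = False
    #     getenv = True
    #     on_exit_hold = (ExitBySignal == true) && (ExitSignal == 7)
    #     output = jobs/<subdir>/<job name>.$(Cluster).out
    #     error = jobs/<subdir>/<job name>.$(Cluster).err
    #     log = jobs/<subdir>/<job name>.$(Cluster).log
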
    def write(self, out_prefix):
        """Output HTCondor DAGMan files needed for workflow submission.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for HTCondor files.
        """
        self.submit_path = out_prefix
        os.makedirs(out_prefix, exist_ok=True)

        # Write down the workflow in HTCondor format.
        self.dag.write(out_prefix, "jobs/{self.label}")


def _translate_job_cmds(config, generic_workflow, gwjob):
    """Translate the job data that have a one-to-one mapping to HTCondor
    submit commands.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains the job being converted.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Generic workflow job to be converted.

    Returns
    -------
    htc_job_commands : `dict` [`str`, `Any`]
        Contains commands which can appear in the HTCondor submit description
        file.
    """
    # Values in the job script that are just name mappings.
    job_translation = {"mail_to": "notify_user",
                       "when_to_mail": "notification",
                       "request_cpus": "request_cpus",
                       "priority": "priority",
                       "category": "category"}

    jobcmds = {}
    for gwkey, htckey in job_translation.items():
        jobcmds[htckey] = getattr(gwjob, gwkey, None)

    # job commands that need modification
    if gwjob.number_of_retries:
        jobcmds["max_retries"] = f"{gwjob.number_of_retries}"

    if gwjob.retry_unless_exit:
        jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"

    if gwjob.request_disk:
        jobcmds["request_disk"] = f"{gwjob.request_disk}MB"

    if gwjob.request_memory:
        jobcmds["request_memory"] = f"{gwjob.request_memory}"

    if gwjob.memory_multiplier:
        # Do not use try-except! At the moment, BpsConfig returns an empty
        # string if it does not contain the key.
        memory_limit = config[".bps_defined.memory_limit"]
        if not memory_limit:
            raise RuntimeError("Memory autoscaling enabled, but automatic detection of the memory limit "
                               "failed; setting it explicitly with 'memoryLimit' or changing worker node "
                               "search pattern 'executeMachinesPattern' might help.")
        jobcmds["request_memory"] = _create_request_memory_expr(gwjob.request_memory, gwjob.memory_multiplier)

        # Periodically release jobs which are being held due to exceeding
        # memory. Stop doing that (by removing the job from the HTCondor
        # queue) after the maximal number of retries has been reached or the
        # memory requirements cannot be satisfied.
        jobcmds["periodic_release"] = \
            "NumJobStarts <= JobMaxRetries && (HoldReasonCode == 34 || HoldReasonSubCode == 34)"
        jobcmds["periodic_remove"] = \
            f"JobStatus == 1 && RequestMemory > {memory_limit} || " \
            f"JobStatus == 5 && NumJobStarts > JobMaxRetries"

    # Assume concurrency_limit implemented using HTCondor concurrency limits.
    # May need to move to special site-specific implementation if sites use
    # other mechanisms.
    if gwjob.concurrency_limit:
        jobcmds["concurrency_limit"] = ",".join(gwjob.concurrency_limit)

    # Handle command line
    if gwjob.executable.transfer_executable:
        jobcmds["transfer_executable"] = "True"
        jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
    else:
        jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)

    if gwjob.arguments:
        arguments = gwjob.arguments
        arguments = _replace_cmd_vars(arguments, gwjob)
        arguments = _replace_file_vars(config, arguments, generic_workflow, gwjob)
        arguments = _fix_env_var_syntax(arguments)
        jobcmds["arguments"] = arguments

    # Add extra "pass-thru" job commands
    if gwjob.profile:
        for key, val in gwjob.profile.items():
            jobcmds[key] = htc_escape(val)

    return jobcmds


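# Illustrative sketch only: for a hypothetical job with memory_multiplier set
# and a detected memory_limit of 16384, the commands built in
# _translate_job_cmds() above would include
#
#     periodic_release = NumJobStarts <= JobMaxRetries && (HoldReasonCode == 34 || HoldReasonSubCode == 34)
#     periodic_remove = JobStatus == 1 && RequestMemory > 16384 || JobStatus == 5 && NumJobStarts > JobMaxRetries
#
# with request_memory set to the ClassAd expression returned by
# _create_request_memory_expr() (see the example at the end of this module).
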
def _translate_dag_cmds(gwjob):
    """Translate job values into DAGMan commands.

    Parameters
    ----------
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job containing values to be translated.

    Returns
    -------
    dagcmds : `dict` [`str`, `Any`]
        DAGMan commands for the job.
    """
    # Values in the dag script that are just name mappings.
    dag_translation = {"abort_on_value": "abort_dag_on",
                       "abort_return_value": "abort_exit"}

    dagcmds = {}
    for gwkey, htckey in dag_translation.items():
        dagcmds[htckey] = getattr(gwjob, gwkey, None)

    # Still to be coded: vars "pre_cmdline", "post_cmdline"
    return dagcmds


def _fix_env_var_syntax(oldstr):
    """Change ENV placeholders to HTCondor Env var syntax.

    Parameters
    ----------
    oldstr : `str`
        String in which environment variable syntax is to be fixed.

    Returns
    -------
    newstr : `str`
        Given string with environment variable syntax fixed.
    """
    newstr = oldstr
    for key in re.findall(r"<ENV:([^>]+)>", oldstr):
        newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
    return newstr



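# A small illustration of _fix_env_var_syntax() above (the variable name is
# made up):
#
#     _fix_env_var_syntax("<ENV:LSST_RUN_DIR>/butler.yaml")
#
# returns "$ENV(LSST_RUN_DIR)/butler.yaml".
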
def _replace_file_vars(config, arguments, workflow, gwjob):
    """Replace file placeholders in command line arguments with correct
    physical file names.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information.
    arguments : `str`
        Arguments string in which to replace file placeholders.
    workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains file information.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        The job corresponding to the arguments.

    Returns
    -------
    arguments : `str`
        Given arguments string with file placeholders replaced.
    """
    _, use_shared = config.search("bpsUseShared", opt={"default": False})

    # Replace input file placeholders with paths.
    for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
        if gwfile.wms_transfer and not use_shared or not gwfile.job_shared:
            uri = os.path.basename(gwfile.src_uri)
        else:
            uri = gwfile.src_uri
        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)

    # Replace output file placeholders with paths.
    for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
        if gwfile.wms_transfer and not use_shared or not gwfile.job_shared:
            uri = os.path.basename(gwfile.src_uri)
        else:
            uri = gwfile.src_uri
        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
    return arguments


def _replace_cmd_vars(arguments, gwjob):
    """Replace format-style placeholders in arguments.

    Parameters
    ----------
    arguments : `str`
        Arguments string in which to replace placeholders.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job containing values to be used to replace placeholders
        (in particular gwjob.cmdvals).

    Returns
    -------
    arguments : `str`
        Given arguments string with placeholders replaced.
    """
    try:
        arguments = arguments.format(**gwjob.cmdvals)
    except (KeyError, TypeError):  # TypeError in case None instead of {}
        _LOG.error("Could not replace command variables:\n"
                   "arguments: %s\n"
                   "cmdvals: %s", arguments, gwjob.cmdvals)
        raise
    return arguments


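# A small illustration of _replace_cmd_vars() above; the placeholder names and
# values are hypothetical.  Given arguments "-j {numProc} {qgraphFile}" and
# gwjob.cmdvals == {"numProc": 8, "qgraphFile": "run.qgraph"}, str.format()
# produces "-j 8 run.qgraph".
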
def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str):
    """Add job input files from generic workflow to job.

    Parameters
    ----------
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        The generic workflow (e.g., has executable name and arguments).
    job_name : `str`
        Unique name for the job.
    use_shared : `bool`
        Whether job has access to files via shared filesystem.
    out_prefix : `str`
        The root directory into which all WMS-specific files are written.

    Returns
    -------
    htc_commands : `dict` [`str`, `str`]
        HTCondor commands for the job submission script.
    """
    htc_commands = {}
    inputs = []
    for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
        _LOG.debug("src_uri=%s", gwf_file.src_uri)
        if not use_shared or not gwf_file.job_shared:
            inputs.append(os.path.relpath(gwf_file.src_uri, out_prefix))

    if inputs:
        htc_commands["transfer_input_files"] = ",".join(inputs)
        _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
    return htc_commands


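# Illustrative result of _handle_job_inputs() above for a hypothetical job with
# two transferable inputs and no shared filesystem (paths are relative to
# out_prefix, as produced by os.path.relpath()):
#
#     {"transfer_input_files": "inputs/butler.yaml,inputs/job_0.qgraph"}
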
def _report_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        The directory containing the submit side files (e.g., HTCondor files).

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report.  The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
    if wms_workflow_id == MISSING_ID:
        run_reports = {}
    else:
        run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
    return run_reports, message


def _report_from_id(wms_workflow_id, hist):
    """Gather run information for a given run id.

    Parameters
    ----------
    wms_workflow_id : `int` or `str`
        Limit to specific run based on id.
    hist : `float`
        Limit history search to this many days.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report.  The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    constraint = f"(DAGManJobId == {int(float(wms_workflow_id))} || ClusterId == " \
                 f"{int(float(wms_workflow_id))})"
    jobs = condor_q(constraint)
    if hist:
        epoch = (datetime.now() - timedelta(days=hist)).timestamp()
        constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
        hist_jobs = condor_history(constraint)
        _update_jobs(jobs, hist_jobs)

    # keys in dictionary will be strings of format "ClusterId.ProcId"
    wms_workflow_id = str(wms_workflow_id)
    if not wms_workflow_id.endswith(".0"):
        wms_workflow_id += ".0"

    if wms_workflow_id in jobs:
        _, path_jobs, message = _get_info_from_path(jobs[wms_workflow_id]["Iwd"])
        _update_jobs(jobs, path_jobs)
        run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
    else:
        run_reports = {}
        message = f"Found 0 records for run id {wms_workflow_id}"
    return run_reports, message


def _get_info_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        Directory containing HTCondor files.

    Returns
    -------
    wms_workflow_id : `str`
        The run id, which is the DAGMan job id.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Information about jobs read from files in the given directory.
        The key is the HTCondor id and the value is a dictionary of HTCondor
        keys and values.
    message : `str`
        Message to be printed with the summary report.
    """
    try:
        wms_workflow_id, jobs = read_dag_log(wms_path)
        _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
        _update_jobs(jobs, read_node_status(wms_path))
        _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)

        # Add more info for DAGman job
        job = jobs[wms_workflow_id]
        job.update(read_dag_status(wms_path))
        job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
        if "bps_run" not in job:
            _add_run_info(wms_path, job)

        message = htc_check_dagman_output(wms_path)
        _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id,
                   jobs[wms_workflow_id]["total_jobs"])
    except StopIteration:
        message = f"Could not find HTCondor files in {wms_path}"
        _LOG.warning(message)
        wms_workflow_id = MISSING_ID
        jobs = {}

    return wms_workflow_id, jobs, message


def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
    """Gather run information to be used in generating the detailed report.

    Parameters
    ----------
    wms_workflow_id : `str`
        The HTCondor id of the DAGMan job for the run.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Mapping of HTCondor id to job information for the run.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report.  The key is the given HTCondor
        id and the value is a collection of report information for that run.
    """
    _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
    dag_job = jobs[wms_workflow_id]
    if "total_jobs" not in dag_job or "DAGNodeName" in dag_job:
        _LOG.error("Job ID %s is not a DAG job.", wms_workflow_id)
        return {}
    report = WmsRunReport(wms_id=wms_workflow_id,
                          path=dag_job["Iwd"],
                          label=dag_job.get("bps_job_label", "MISS"),
                          run=dag_job.get("bps_run", "MISS"),
                          project=dag_job.get("bps_project", "MISS"),
                          campaign=dag_job.get("bps_campaign", "MISS"),
                          payload=dag_job.get("bps_payload", "MISS"),
                          operator=_get_owner(dag_job),
                          run_summary=_get_run_summary(dag_job),
                          state=_htc_status_to_wms_state(dag_job),
                          jobs=[],
                          total_number_jobs=dag_job["total_jobs"],
                          job_state_counts=dag_job["state_counts"])

    try:
        for job in jobs.values():
            if job["ClusterId"] != int(float(wms_workflow_id)):
                job_report = WmsJobReport(wms_id=job["ClusterId"],
                                          name=job.get("DAGNodeName", str(job["ClusterId"])),
                                          label=job.get("bps_job_label",
                                                        pegasus_name_to_label(job["DAGNodeName"])),
                                          state=_htc_status_to_wms_state(job))
                if job_report.label == "init":
                    job_report.label = "pipetaskInit"
                report.jobs.append(job_report)
    except KeyError as ex:
        _LOG.error("Job missing key '%s': %s", str(ex), job)
        raise

    run_reports = {report.wms_id: report}
    _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
    return run_reports


def _summary_report(user, hist, pass_thru):
    """Gather run information to be used in generating summary reports.

    Parameters
    ----------
    user : `str`
        Run lookup restricted to given user.
    hist : `float`
        How many previous days to search for run information.
    pass_thru : `str`
        Advanced users can define the HTCondor constraint to be used
        when searching queue and history.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the summary report.  The keys are HTCondor ids and
        the values are collections of report information for each run.
    message : `str`
        Message to be printed with the summary report.
    """
    # only doing summary report so only look for dagman jobs
    if pass_thru:
        constraint = pass_thru
    else:
        # Notes:
        # * bps_isjob == 'True' isn't getting set for DAG jobs that are
        #   manually restarted.
        # * Any job with DAGManJobID isn't a DAG job
        constraint = 'bps_isjob == "True" && JobUniverse == 7'
        if user:
            constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'

    # Check runs in queue.
    jobs = condor_q(constraint)

    if hist:
        epoch = (datetime.now() - timedelta(days=hist)).timestamp()
        constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
        hist_jobs = condor_history(constraint)
        _update_jobs(jobs, hist_jobs)

    _LOG.debug("Job ids from queue and history %s", jobs.keys())

    # Have list of DAGMan jobs, need to get run_report info.
    run_reports = {}
    for job in jobs.values():
        total_jobs, state_counts = _get_state_counts_from_dag_job(job)
        # If the counts didn't come from the queue information (e.g.,
        # Kerberos bug), try reading them from file.
        if total_jobs == 0:
            try:
                job.update(read_dag_status(job["Iwd"]))
                total_jobs, state_counts = _get_state_counts_from_dag_job(job)
            except StopIteration:
                pass  # Don't fail the whole report if HTCondor files can't be found.

        if "bps_run" not in job:
            _add_run_info(job["Iwd"], job)
        report = WmsRunReport(wms_id=str(job.get("ClusterId", MISSING_ID)),
                              path=job["Iwd"],
                              label=job.get("bps_job_label", "MISS"),
                              run=job.get("bps_run", "MISS"),
                              project=job.get("bps_project", "MISS"),
                              campaign=job.get("bps_campaign", "MISS"),
                              payload=job.get("bps_payload", "MISS"),
                              operator=_get_owner(job),
                              run_summary=_get_run_summary(job),
                              state=_htc_status_to_wms_state(job),
                              jobs=[],
                              total_number_jobs=total_jobs,
                              job_state_counts=state_counts)

        run_reports[report.wms_id] = report

    return run_reports, ""


def _add_run_info(wms_path, job):
    """Find BPS run information elsewhere for runs without bps attributes.

    Parameters
    ----------
    wms_path : `str`
        Path to submit files for the run.
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Raises
    ------
    StopIteration
        If the file it is looking for cannot be found.  Permission errors are
        caught and the job's run is marked with an error.
    """
    path = Path(wms_path) / "jobs"
    try:
        subfile = next(path.glob("**/*.sub"))
    except (StopIteration, PermissionError):
        job["bps_run"] = "Unavailable"
    else:
        _LOG.debug("_add_run_info: subfile = %s", subfile)
        try:
            with open(subfile, "r") as fh:
                for line in fh:
                    if line.startswith("+bps_"):
                        m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
                        if m:
                            _LOG.debug("Matching line: %s", line)
                            job[m.group(1)] = m.group(2).replace('"', "")
                        else:
                            _LOG.debug("Could not parse attribute: %s", line)
        except PermissionError:
            job["bps_run"] = "PermissionError"
    _LOG.debug("After adding job = %s", job)


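# Illustrative example of a submit-file attribute line matched by the regular
# expression in _add_run_info() above (the value is made up):
#
#     +bps_run = "u_someone_20210101T000000Z"
#
# would result in job["bps_run"] == "u_someone_20210101T000000Z" (the double
# quotes are stripped).
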
def _get_owner(job):
    """Get the owner of a dag job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    owner : `str`
        Owner of the dag job.
    """
    owner = job.get("bps_operator", None)
    if not owner:
        owner = job.get("Owner", None)
        if not owner:
            _LOG.warning("Could not get Owner from htcondor job: %s", job)
            owner = "MISS"
    return owner


def _get_run_summary(job):
    """Get the run summary for a job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    summary : `str`
        Number of jobs per PipelineTask label in approximate pipeline order.
        Format: <label>:<count>[;<label>:<count>]+
    """
    summary = job.get("bps_run_summary", None)
    if not summary:
        summary, _ = summary_from_dag(job["Iwd"])
        if not summary:
            _LOG.warning("Could not get run summary for htcondor job: %s", job)
    _LOG.debug("_get_run_summary: summary=%s", summary)

    # Work around some runs using "init" instead of "pipetaskInit".
    summary = summary.replace("init:", "pipetaskInit:")

    if "pegasus_version" in job and "pegasus" not in summary:
        summary += ";pegasus:0"

    return summary


def _get_state_counts_from_jobs(wms_workflow_id, jobs):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    wms_workflow_id : `str`
        HTCondor job id.
    jobs : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    state_counts = dict.fromkeys(WmsStates, 0)

    for jid, jinfo in jobs.items():
        if jid != wms_workflow_id:
            state_counts[_htc_status_to_wms_state(jinfo)] += 1

    total_counted = sum(state_counts.values())
    if "NodesTotal" in jobs[wms_workflow_id]:
        total_count = jobs[wms_workflow_id]["NodesTotal"]
    else:
        total_count = total_counted

    state_counts[WmsStates.UNREADY] += total_count - total_counted

    return total_count, state_counts


def _get_state_counts_from_dag_job(job):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
    state_counts = dict.fromkeys(WmsStates, 0)
    if "DAG_NodesReady" in job:
        state_counts = {
            WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
            WmsStates.READY: job.get("DAG_NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
            WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
            WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)}
        total_jobs = job.get("DAG_NodesTotal")
        _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
    elif "NodesFailed" in job:
        state_counts = {
            WmsStates.UNREADY: job.get("NodesUnready", 0),
            WmsStates.READY: job.get("NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("NodesDone", 0),
            WmsStates.FAILED: job.get("NodesFailed", 0),
            WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)}
        try:
            total_jobs = job["NodesTotal"]
        except KeyError as ex:
            _LOG.error("Job missing %s. job = %s", str(ex), job)
            raise
        _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
    else:
        # With Kerberos job auth and the Kerberos bug, a warning here would be
        # printed for every DAG, so log at debug level instead.
        _LOG.debug("Can't get job state counts %s", job["Iwd"])
        total_jobs = 0

    _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
    return total_jobs, state_counts


def _htc_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `WmsStates`
        The equivalent WmsState to given job's status.
    """
    wms_state = WmsStates.MISFIT
    if "JobStatus" in job:
        wms_state = _htc_job_status_to_wms_state(job)
    elif "NodeStatus" in job:
        wms_state = _htc_node_status_to_wms_state(job)
    return wms_state


def _htc_job_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given job's status.
    """
    _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"],
               type(job["JobStatus"]))
    job_status = int(job["JobStatus"])
    wms_state = WmsStates.MISFIT

    _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
    if job_status == JobStatus.IDLE:
        wms_state = WmsStates.PENDING
    elif job_status == JobStatus.RUNNING:
        wms_state = WmsStates.RUNNING
    elif job_status == JobStatus.REMOVED:
        wms_state = WmsStates.DELETED
    elif job_status == JobStatus.COMPLETED:
        if job.get("ExitBySignal", False) or job.get("ExitCode", 0) or \
                job.get("ExitSignal", 0) or job.get("DAG_Status", 0) or \
                job.get("ReturnValue", 0):
            wms_state = WmsStates.FAILED
        else:
            wms_state = WmsStates.SUCCEEDED
    elif job_status == JobStatus.HELD:
        wms_state = WmsStates.HELD

    return wms_state


def _htc_node_status_to_wms_state(job):
    """Convert HTCondor status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given node's status.
    """
    wms_state = WmsStates.MISFIT

    status = job["NodeStatus"]
    if status == NodeStatus.NOT_READY:
        wms_state = WmsStates.UNREADY
    elif status == NodeStatus.READY:
        wms_state = WmsStates.READY
    elif status == NodeStatus.PRERUN:
        wms_state = WmsStates.MISFIT
    elif status == NodeStatus.SUBMITTED:
        if job["JobProcsHeld"]:
            wms_state = WmsStates.HELD
        elif job["StatusDetails"] == "not_idle":
            wms_state = WmsStates.RUNNING
        elif job["JobProcsQueued"]:
            wms_state = WmsStates.PENDING
    elif status == NodeStatus.POSTRUN:
        wms_state = WmsStates.MISFIT
    elif status == NodeStatus.DONE:
        wms_state = WmsStates.SUCCEEDED
    elif status == NodeStatus.ERROR:
        wms_state = WmsStates.FAILED

    return wms_state


def _update_jobs(jobs1, jobs2):
    """Update jobs1 with info in jobs2.

    (Basically an update for nested dictionaries.)

    Parameters
    ----------
    jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
        HTCondor job information to be updated.
    jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
        Additional HTCondor job information.
    """
    for jid, jinfo in jobs2.items():
        if jid in jobs1:
            jobs1[jid].update(jinfo)
        else:
            jobs1[jid] = jinfo


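# Tiny illustration of _update_jobs() above with made-up ids:
#
#     jobs1 = {"1.0": {"JobStatus": 2}}
#     jobs2 = {"1.0": {"Iwd": "/submit/run1"}, "2.0": {"JobStatus": 5}}
#     _update_jobs(jobs1, jobs2)
#
# leaves jobs1 == {"1.0": {"JobStatus": 2, "Iwd": "/submit/run1"},
#                  "2.0": {"JobStatus": 5}}.
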
def _wms_id_to_cluster(wms_id):
    """Convert WMS ID to cluster ID.

    Parameters
    ----------
    wms_id : `int` or `float` or `str`
        HTCondor job id or path.

    Returns
    -------
    cluster_id : `int`
        HTCondor cluster id.
    """
    # If wms_id represents path, get numeric id.
    try:
        cluster_id = int(float(wms_id))
    except ValueError:
        wms_path = Path(wms_id)
        if wms_path.exists():
            try:
                cluster_id, _ = read_dag_log(wms_id)
                cluster_id = int(float(cluster_id))
            except StopIteration:
                cluster_id = 0
        else:
            cluster_id = 0
    return cluster_id


def _create_request_memory_expr(memory, multiplier):
    """Construct an HTCondor ClassAd expression for safe memory scaling.

    Parameters
    ----------
    memory : `int`
        Requested memory in MB.
    multiplier : `float`
        Memory growth rate between retries.

    Returns
    -------
    ad : `str`
        A string representing an HTCondor ClassAd expression enabling safe
        memory scaling between job retries.
    """
    was_mem_exceeded = "LastJobStatus =?= 5 " \
                       "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " \
                       "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"

    # If the job runs for the first time ('MemoryUsage' is not defined), set
    # the required memory to the given value.
    ad = f"ifThenElse({was_mem_exceeded}, " \
         f"ifThenElse(isUndefined(MemoryUsage), {memory}, int({multiplier} * MemoryUsage)), " \
         f"ifThenElse(isUndefined(MemoryUsage), {memory}, max({memory}, MemoryUsage)))"
    return ad
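

# For illustration only: with memory=2048 and multiplier=2.0 the expression
# built above reads (whitespace added for readability):
#
#     ifThenElse(LastJobStatus =?= 5
#                && (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0
#                    || LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34),
#                ifThenElse(isUndefined(MemoryUsage), 2048, int(2.0 * MemoryUsage)),
#                ifThenElse(isUndefined(MemoryUsage), 2048, max(2048, MemoryUsage)))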