1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Interface between generic workflow to HTCondor workflow system. 

23""" 

24 

25__all__ = ["HTCondorService", "HTCondorWorkflow"] 

26 

27 

28import dataclasses 

29import os 

30import re 

31import logging 

32from datetime import datetime, timedelta 

33from pathlib import Path 

34 

35import htcondor 

36 

37from ... import ( 

38 BaseWmsWorkflow, 

39 BaseWmsService, 

40 GenericWorkflow, 

41 GenericWorkflowJob, 

42 WmsRunReport, 

43 WmsJobReport, 

44 WmsStates 

45) 

46from ...bps_utils import ( 

47 chdir, 

48 create_count_summary 

49) 

50from .lssthtc import ( 

51 HTCDag, 

52 HTCJob, 

53 MISSING_ID, 

54 JobStatus, 

55 NodeStatus, 

56 htc_check_dagman_output, 

57 htc_escape, 

58 htc_submit_dag, 

59 read_dag_log, 

60 read_dag_status, 

61 read_node_status, 

62 condor_history, 

63 condor_q, 

64 condor_status, 

65 pegasus_name_to_label, 

66 summary_from_dag, 

67) 

68 

69 

70DEFAULT_HTC_EXEC_PATT = ".*worker.*" 

71"""Default pattern for searching execute machines in an HTCondor pool. 

72""" 

73 

74_LOG = logging.getLogger(__name__) 

75 

76 

77class HTCondorService(BaseWmsService): 

78 """HTCondor version of WMS service. 

79 """ 

80 def prepare(self, config, generic_workflow, out_prefix=None): 

81 """Convert generic workflow to an HTCondor DAG ready for submission. 

82 

83 Parameters 

84 ---------- 

85 config : `lsst.ctrl.bps.BpsConfig` 

86 BPS configuration that includes necessary submit/runtime 

87 information. 

88 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

89 The generic workflow (e.g., has executable name and arguments). 

90 out_prefix : `str` 

91 The root directory into which all WMS-specific files are written. 

92 

93 Returns 

94 -------

95 workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow` 

96 HTCondor workflow ready to be run. 

97 """ 

98 _LOG.debug("out_prefix = '%s'", out_prefix) 

99 workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix, 

100 f"{self.__class__.__module__}." 

101 f"{self.__class__.__name__}") 

102 workflow.write(out_prefix) 

103 return workflow 

104 

105 def submit(self, workflow): 

106 """Submit a single HTCondor workflow. 

107 

108 Parameters 

109 ---------- 

110 workflow : `lsst.ctrl.bps.BaseWorkflow` 

111 A single HTCondor workflow to submit. run_id is updated after 

112 successful submission to WMS. 

113 """ 

114 # For workflow portability, internal paths are all relative. Hence 

115 # the DAG needs to be submitted to HTCondor from inside the submit 

116 # directory. 

117 with chdir(workflow.submit_path): 

118 _LOG.info("Submitting from directory: %s", os.getcwd()) 

119 htc_submit_dag(workflow.dag, {}) 

120 workflow.run_id = workflow.dag.run_id 

121 

122 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None): 

123 """Query WMS for list of submitted WMS workflows/jobs. 

124 

125 This should be a quick lookup function to create list of jobs for 

126 other functions. 

127 

128 Parameters 

129 ---------- 

130 wms_id : `int` or `str`, optional 

131 Id or path that can be used by WMS service to look up job. 

132 user : `str`, optional 

133 User whose submitted jobs should be listed. 

134 require_bps : `bool`, optional 

135 Whether to require jobs returned in list to be bps-submitted jobs. 

136 pass_thru : `str`, optional 

137 Information to pass through to WMS. 

138 

139 Returns 

140 ------- 

141 job_ids : `list` [`Any`] 

142 Only job ids to be used by cancel and other functions. Typically 

143 this means top-level jobs (i.e., not child jobs).

144 """ 

145 _LOG.debug("list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s", 

146 wms_id, user, require_bps, pass_thru) 

147 constraint = "" 

148 

149 if wms_id is None: 

150 if user is not None: 

151 constraint = f'(Owner == "{user}")' 

152 else: 

153 cluster_id = _wms_id_to_cluster(wms_id) 

154 if cluster_id != 0: 

155 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})" 

156 

157 if require_bps: 

158 constraint += ' && (bps_isjob == "True")' 

159 

160 if pass_thru: 

161 if "-forcex" in pass_thru: 

162 pass_thru_2 = pass_thru.replace("-forcex", "") 

163 if pass_thru_2 and not pass_thru_2.isspace(): 

164 constraint += f"&& ({pass_thru_2})" 

165 else: 

166 constraint += f" && ({pass_thru})" 

167 

168 _LOG.debug("constraint = %s", constraint) 

169 jobs = condor_q(constraint) 

170 

171 # Prune child jobs where DAG job is in queue (i.e., aren't orphans). 

172 job_ids = [] 

173 for job_id, job_info in jobs.items(): 

174 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_info.get("DAGManJobId", "None")) 

175 if "DAGManJobId" not in job_info: # orphaned job 

176 job_ids.append(job_id) 

177 else: 

178 _LOG.debug("Looking for %s", f"{job_info['DAGManJobId']}.0") 

179 _LOG.debug("\tin jobs.keys() = %s", jobs.keys()) 

180 if f"{job_info['DAGManJobId']}.0" not in jobs: 

181 job_ids.append(job_id) 

182 

183 _LOG.debug("job_ids = %s", job_ids) 

184 return job_ids 

185 

186 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None): 

187 """Return run information based upon given constraints. 

188 

189 Parameters 

190 ---------- 

191 wms_workflow_id : `str` 

192 Limit to specific run based on id. 

193 user : `str` 

194 Limit results to runs for this user. 

195 hist : `float` 

196 Limit history search to this many days. 

197 pass_thru : `str` 

198 Constraints to pass through to HTCondor. 

199 

200 Returns 

201 ------- 

202 runs : `list` [`lsst.ctrl.bps.WmsRunReport`] 

203 Information about runs from given job information. 

204 message : `str` 

205 Extra message for report command to print. This could be pointers 

206 to documentation or to WMS specific commands. 

207 """ 

208 message = "" 

209 

210 if wms_workflow_id: 

211 # Explicitly checking if wms_workflow_id can be converted to a 

212 # float instead of using try/except to avoid catching a different 

213 # ValueError from _report_from_id 

214 try: 

215 float(wms_workflow_id) 

216 is_float = True 

217 except ValueError: # Don't need TypeError here as None goes to else branch. 

218 is_float = False 

219 

220 if is_float: 

221 run_reports, message = _report_from_id(float(wms_workflow_id), hist) 

222 else: 

223 run_reports, message = _report_from_path(wms_workflow_id) 

224 else: 

225 run_reports, message = _summary_report(user, hist, pass_thru) 

226 _LOG.debug("report: %s, %s", run_reports, message) 

227 

228 return list(run_reports.values()), message 

229 

230 def cancel(self, wms_id, pass_thru=None): 

231 """Cancel submitted workflows/jobs. 

232 

233 Parameters 

234 ---------- 

235 wms_id : `str` 

236 ID or path of job that should be canceled. 

237 pass_thru : `str`, optional 

238 Information to pass through to WMS. 

239 

240 Returns 

241 -------

242 deleted : `bool` 

243 Whether the deletion was successful. Currently returns False if there

244 is any doubt or if any individual job was not deleted.

245 message : `str` 

246 Any message from WMS (e.g., error details). 

247 """ 

248 _LOG.debug("Canceling wms_id = %s", wms_id) 

249 

250 cluster_id = _wms_id_to_cluster(wms_id) 

251 if cluster_id == 0: 

252 deleted = False 

253 message = "Invalid id" 

254 else: 

255 _LOG.debug("Canceling cluster_id = %s", cluster_id) 

256 schedd = htcondor.Schedd() 

257 constraint = f"ClusterId == {cluster_id}" 

258 if pass_thru is not None and "-forcex" in pass_thru: 

259 pass_thru_2 = pass_thru.replace("-forcex", "") 

260 if pass_thru_2 and not pass_thru_2.isspace(): 

261 constraint += f"&& ({pass_thru_2})" 

262 _LOG.debug("JobAction.RemoveX constraint = %s", constraint) 

263 results = schedd.act(htcondor.JobAction.RemoveX, constraint) 

264 else: 

265 if pass_thru: 

266 constraint += f"&& ({pass_thru})" 

267 _LOG.debug("JobAction.Remove constraint = %s", constraint) 

268 results = schedd.act(htcondor.JobAction.Remove, constraint) 

269 _LOG.debug("Remove results: %s", results) 

270 

271 if results["TotalSuccess"] > 0 and results["TotalError"] == 0: 

272 deleted = True 

273 message = "" 

274 else: 

275 deleted = False 

276 if results["TotalSuccess"] == 0 and results["TotalError"] == 0: 

277 message = "no such bps job in batch queue" 

278 else: 

279 message = f"unknown problems deleting: {results}" 

280 

281 _LOG.debug("deleted: %s; message = %s", deleted, message) 

282 return deleted, message 

283 
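# Illustrative usage sketch (not part of the original module): how a BPS
# driver might exercise HTCondorService above. ``config`` is assumed to be a
# populated `lsst.ctrl.bps.BpsConfig`, ``generic_workflow`` a
# `lsst.ctrl.bps.GenericWorkflow`, and the submit directory is hypothetical.
def _example_service_usage(config, generic_workflow):
    service = HTCondorService(config)
    # Write the DAGMan files under the submit directory, then submit the DAG.
    workflow = service.prepare(config, generic_workflow,
                               out_prefix="submit/u/example_run")
    service.submit(workflow)
    # run_id is filled in by submit() after a successful htc_submit_dag call.
    jobs = service.list_submitted_jobs(wms_id=workflow.run_id)
    return workflow.run_id, jobs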

284 

285class HTCondorWorkflow(BaseWmsWorkflow): 

286 """Single HTCondor workflow. 

287 

288 Parameters 

289 ---------- 

290 name : `str` 

291 Unique name for Workflow used when naming files. 

292 config : `lsst.ctrl.bps.BpsConfig` 

293 BPS configuration that includes necessary submit/runtime information. 

294 """ 

295 def __init__(self, name, config=None): 

296 super().__init__(name, config) 

297 self.dag = None 

298 

299 @classmethod 

300 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class): 

301 # Docstring inherited 

302 htc_workflow = cls(generic_workflow.name, config) 

303 htc_workflow.dag = HTCDag(name=generic_workflow.name) 

304 

305 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs) 

306 htc_workflow.dag.add_attribs(generic_workflow.run_attrs) 

307 htc_workflow.dag.add_attribs({"bps_wms_service": service_class, 

308 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}", 

309 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts), 

310 "bps_job_summary": create_count_summary(generic_workflow.job_counts)}) 

311 

312 # Determine the hard limit for the memory requirement. 

313 found, limit = config.search('memoryLimit') 

314 if not found: 

315 search_opts = {"default": DEFAULT_HTC_EXEC_PATT} 

316 _, site = config.search("computeSite") 

317 if site: 

318 search_opts["curvals"] = {"curr_site": site} 

319 _, patt = config.search("executeMachinesPattern", opt=search_opts) 

320 

321 # To reduce the amount of data, ignore dynamic slots (if any) as, 

322 # by definition, they cannot have more memory than 

323 # the partitionable slot they are part of.

324 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)' 

325 pool_info = condor_status(constraint=constraint) 

326 try: 

327 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values()) 

328 except ValueError: 

329 _LOG.debug("No execute machine in the pool matches %s", patt) 

330 if limit: 

331 config[".bps_defined.memory_limit"] = limit 

332 

333 # Create all DAG jobs 

334 for job_name in generic_workflow: 

335 gwjob = generic_workflow.get_job(job_name) 

336 htc_job = HTCondorWorkflow._create_job(config, generic_workflow, gwjob, out_prefix) 

337 htc_workflow.dag.add_job(htc_job) 

338 

339 # Add job dependencies to the DAG 

340 for job_name in generic_workflow: 

341 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name)) 

342 

343 # If final job exists in generic workflow, create DAG final job 

344 final = generic_workflow.get_final() 

345 if final and isinstance(final, GenericWorkflowJob): 

346 final_htjob = HTCondorWorkflow._create_job(config, generic_workflow, final, out_prefix) 

347 if "post" not in final_htjob.dagcmds: 

348 final_htjob.dagcmds["post"] = f"{os.path.dirname(__file__)}/final_post.sh" \ 

349 f" {final.name} $DAG_STATUS $RETURN" 

350 htc_workflow.dag.add_final_job(final_htjob) 

351 elif final and isinstance(final, GenericWorkflow): 

352 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job") 

353 elif final: 

354 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

355 

356 return htc_workflow 

357 

358 @staticmethod 

359 def _create_job(config, generic_workflow, gwjob, out_prefix): 

360 """Convert GenericWorkflow job nodes to DAG jobs. 

361 

362 Parameters 

363 ---------- 

364 config : `lsst.ctrl.bps.BpsConfig` 

365 BPS configuration that includes necessary submit/runtime 

366 information. 

367 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

368 Generic workflow that is being converted. 

369 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

370 The generic job to convert to a HTCondor job. 

371 out_prefix : `str` 

372 Directory prefix for HTCondor files. 

373 

374 Returns 

375 ------- 

376 htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob` 

377 The HTCondor job equivalent to the given generic job. 

378 """ 

379 htc_job = HTCJob(gwjob.name, label=gwjob.label) 

380 

381 curvals = dataclasses.asdict(gwjob) 

382 if gwjob.tags: 

383 curvals.update(gwjob.tags) 

384 found, subdir = config.search("subDirTemplate", opt={'curvals': curvals}) 

385 if not found: 

386 subdir = "jobs" 

387 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub" 

388 

389 htc_job_cmds = { 

390 "universe": "vanilla", 

391 "should_transfer_files": "YES", 

392 "when_to_transfer_output": "ON_EXIT_OR_EVICT", 

393 "transfer_output_files": '""', # Set to empty string to disable 

394 "transfer_executable": "False", 

395 "getenv": "True", 

396 

397 # Exceeding memory sometimes triggers a SIGBUS error. Tell HTCondor

398 # to put SIGBUS jobs on hold. 

399 "on_exit_hold": "(ExitBySignal == true) && (ExitSignal == 7)", 

400 "on_exit_hold_reason": '"Job raised a signal 7. Usually means job has gone over memory limit."', 

401 "on_exit_hold_subcode": "34" 

402 } 

403 

404 htc_job_cmds.update(_translate_job_cmds(config, generic_workflow, gwjob)) 

405 

406 # job stdout, stderr, htcondor user log. 

407 for key in ("output", "error", "log"): 

408 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}") 

409 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key]) 

410 

411 _, use_shared = config.search("bpsUseShared", opt={"default": False}) 

412 htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwjob.name, use_shared, out_prefix)) 

413 

414 # Add the job cmds dict to the job object. 

415 htc_job.add_job_cmds(htc_job_cmds) 

416 

417 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob)) 

418 

419 # Add job attributes to job. 

420 _LOG.debug("gwjob.attrs = %s", gwjob.attrs) 

421 htc_job.add_job_attrs(gwjob.attrs) 

422 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)}) 

423 htc_job.add_job_attrs({"bps_job_name": gwjob.name, 

424 "bps_job_label": gwjob.label}) 

425 

426 return htc_job 

427 

428 def write(self, out_prefix): 

429 """Output HTCondor DAGMan files needed for workflow submission. 

430 

431 Parameters 

432 ---------- 

433 out_prefix : `str` 

434 Directory prefix for HTCondor files. 

435 """ 

436 self.submit_path = out_prefix 

437 os.makedirs(out_prefix, exist_ok=True) 

438 

439 # Write down the workflow in HTCondor format. 

440 self.dag.write(out_prefix, "jobs/{self.label}") 

441 

442 

443def _translate_job_cmds(config, generic_workflow, gwjob): 

444 """Translate the job data that are one to one mapping 

445 

446 Parameters 

447 ---------- 

448 config : `lsst.ctrl.bps.BpsConfig` 

449 BPS configuration that includes necessary submit/runtime 

450 information. 

451 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

452 Generic workflow that contains the job being converted.

453 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

454 Generic workflow job to be converted. 

455 

456 Returns 

457 ------- 

458 htc_job_commands : `dict` [`str`, `Any`] 

459 Contains commands which can appear in the HTCondor submit description 

460 file. 

461 """ 

462 # Values in the job script that are just name mappings.

463 job_translation = {"mail_to": "notify_user", 

464 "when_to_mail": "notification", 

465 "request_cpus": "request_cpus", 

466 "priority": "priority", 

467 "category": "category"} 

468 

469 jobcmds = {} 

470 for gwkey, htckey in job_translation.items(): 

471 jobcmds[htckey] = getattr(gwjob, gwkey, None) 

472 

473 # job commands that need modification 

474 if gwjob.number_of_retries: 

475 jobcmds["max_retries"] = f"{gwjob.number_of_retries}" 

476 

477 if gwjob.retry_unless_exit: 

478 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}" 

479 

480 if gwjob.request_disk: 

481 jobcmds["request_disk"] = f"{gwjob.request_disk}MB" 

482 

483 if gwjob.request_memory: 

484 jobcmds["request_memory"] = f"{gwjob.request_memory}" 

485 

486 if gwjob.memory_multiplier: 

487 # Do not use try-except! At the moment, BpsConfig returns an empty 

488 # string if it does not contain the key. 

489 memory_limit = config[".bps_defined.memory_limit"] 

490 if not memory_limit: 

491 raise RuntimeError("Memory autoscaling enabled, but automatic detection of the memory limit " 

492 "failed; setting it explicitly with 'memoryLimit' or changing worker node " 

493 "search pattern 'executeMachinesPattern' might help.") 

494 jobcmds["request_memory"] = _create_request_memory_expr(gwjob.request_memory, gwjob.memory_multiplier) 

495 

496 # Periodically release jobs which are being held due to exceeding 

497 # memory. Stop doing that (by removing the job from the HTCondor queue) 

498 # after the maximal number of retries has been reached or the memory 

499 # requirements cannot be satisfied. 

500 jobcmds["periodic_release"] = \ 

501 "NumJobStarts <= JobMaxRetries && (HoldReasonCode == 34 || HoldReasonSubCode == 34)" 

502 jobcmds["periodic_remove"] = \ 

503 f"JobStatus == 1 && RequestMemory > {memory_limit} || " \ 

504 f"JobStatus == 5 && NumJobStarts > JobMaxRetries" 

505 

506 # Assume concurrency_limit implemented using HTCondor concurrency limits. 

507 # May need to move to special site-specific implementation if sites use 

508 # other mechanisms. 

509 if gwjob.concurrency_limit: 

510 jobcmds["concurrency_limit"] = gwjob.concurrency_limit 

511 

512 # Handle command line 

513 if gwjob.executable.transfer_executable: 

514 jobcmds["transfer_executable"] = "True" 

515 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri) 

516 else: 

517 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri) 

518 

519 if gwjob.arguments: 

520 arguments = gwjob.arguments 

521 arguments = _replace_cmd_vars(arguments, gwjob) 

522 arguments = _replace_file_vars(config, arguments, generic_workflow, gwjob) 

523 arguments = _fix_env_var_syntax(arguments) 

524 jobcmds["arguments"] = arguments 

525 

526 # Add extra "pass-thru" job commands 

527 if gwjob.profile: 

528 for key, val in gwjob.profile.items(): 

529 jobcmds[key] = htc_escape(val) 

530 

531 return jobcmds 

532 
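# Illustrative sketch (not part of the original module): the dictionary
# returned by _translate_job_cmds() maps directly onto HTCondor submit
# description commands. ``gwjob`` is assumed to be a GenericWorkflowJob
# requesting 2048 MB, 1 CPU and 3 retries; all argument objects are
# hypothetical.
def _example_translate_job_cmds(config, generic_workflow, gwjob):
    jobcmds = _translate_job_cmds(config, generic_workflow, gwjob)
    # For the assumed job the result would contain entries such as:
    #   jobcmds["request_cpus"] == 1
    #   jobcmds["request_memory"] == "2048"   (or a scaling ClassAd expression
    #                                          when a memory multiplier is set)
    #   jobcmds["max_retries"] == "3"
    #   jobcmds["executable"], jobcmds["arguments"]  -- the command line to run
    return jobcmds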

533 

534def _translate_dag_cmds(gwjob): 

535 """Translate job values into DAGMan commands. 

536 

537 Parameters 

538 ---------- 

539 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

540 Job containing values to be translated. 

541 

542 Returns 

543 ------- 

544 dagcmds : `dict` [`str`, `Any`] 

545 DAGMan commands for the job. 

546 """ 

547 # Values in the dag script that are just name mappings.

548 dag_translation = {"abort_on_value": "abort_dag_on", 

549 "abort_return_value": "abort_exit"} 

550 

551 dagcmds = {} 

552 for gwkey, htckey in dag_translation.items(): 

553 dagcmds[htckey] = getattr(gwjob, gwkey, None) 

554 

555 # Still to be coded: vars "pre_cmdline", "post_cmdline" 

556 return dagcmds 

557 

558 

559def _fix_env_var_syntax(oldstr): 

560 """Change ENV place holders to HTCondor Env var syntax. 

561 

562 Parameters 

563 ---------- 

564 oldstr : `str` 

565 String in which environment variable syntax is to be fixed. 

566 

567 Returns 

568 ------- 

569 newstr : `str` 

570 Given string with environment variable syntax fixed. 

571 """ 

572 newstr = oldstr 

573 for key in re.findall(r"<ENV:([^>]+)>", oldstr): 

574 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})") 

575 return newstr 

576 
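# Illustrative sketch (not part of the original module): a quick check of the
# placeholder rewriting done by _fix_env_var_syntax(); the path is hypothetical.
def _example_fix_env_var_syntax():
    fixed = _fix_env_var_syntax("<ENV:HOME>/repo/<ENV:RUN>/butler.yaml")
    # Each <ENV:NAME> placeholder becomes HTCondor's $ENV(NAME) syntax.
    assert fixed == "$ENV(HOME)/repo/$ENV(RUN)/butler.yaml"
    return fixed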

577 

578def _replace_file_vars(config, arguments, workflow, gwjob): 

579 """Replace file placeholders in command line arguments with correct 

580 physical file names. 

581 

582 Parameters 

583 ---------- 

584 config : `lsst.ctrl.bps.BpsConfig` 

585 BPS configuration that includes necessary submit/runtime 

586 information. 

587 arguments : `str` 

588 Arguments string in which to replace file placeholders. 

589 workflow : `lsst.ctrl.bps.GenericWorkflow` 

590 Generic workflow that contains file information. 

591 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

592 The job corresponding to the arguments. 

593 

594 Returns 

595 ------- 

596 arguments : `str` 

597 Given arguments string with file placeholders replaced. 

598 """ 

599 _, use_shared = config.search("bpsUseShared", opt={"default": False}) 

600 

601 # Replace input file placeholders with paths. 

602 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False): 

603 if not gwfile.wms_transfer: 

604 # Must assume full URI if in command line and told WMS is not 

605 # responsible for transferring file. 

606 uri = gwfile.src_uri 

607 elif use_shared: 

608 if gwfile.job_shared: 

609 # Have shared filesystems and jobs can share file. 

610 uri = gwfile.src_uri 

611 else: 

612 # Taking advantage of inside knowledge. Not future-proof. 

613 # Temporary fix until have job wrapper that pulls files 

614 # within job. 

615 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml": 

616 uri = "butler.yaml" 

617 else: 

618 uri = os.path.basename(gwfile.src_uri) 

619 else: # Using push transfer 

620 uri = os.path.basename(gwfile.src_uri) 

621 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

622 

623 # Replace output file placeholders with paths. 

624 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False): 

625 if not gwfile.wms_transfer: 

626 # Must assume full URI if in command line and told WMS is not 

627 # responsible for transferring file. 

628 uri = gwfile.src_uri 

629 elif use_shared: 

630 if gwfile.job_shared: 

631 # Have shared filesystems and jobs can share file. 

632 uri = gwfile.src_uri 

633 else: 

634 uri = os.path.basename(gwfile.src_uri) 

635 else: # Using push transfer 

636 uri = os.path.basename(gwfile.src_uri) 

637 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

638 return arguments 

639 

640 

641def _replace_cmd_vars(arguments, gwjob): 

642 """Replace format-style placeholders in arguments. 

643 

644 Parameters 

645 ---------- 

646 arguments : `str` 

647 Arguments string in which to replace placeholders. 

648 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

649 Job containing values to be used to replace placeholders 

650 (in particular gwjob.cmdvals). 

651 

652 Returns 

653 ------- 

654 arguments : `str` 

655 Given arguments string with placeholders replaced. 

656 """ 

657 try: 

658 arguments = arguments.format(**gwjob.cmdvals) 

659 except (KeyError, TypeError): # TypeError in case None instead of {} 

660 _LOG.error("Could not replace command variables:\n" 

661 "arguments: %s\n" 

662 "cmdvals: %s", arguments, gwjob.cmdvals) 

663 raise 

664 return arguments 

665 
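# Illustrative sketch (not part of the original module): placeholder values in
# the command line come from gwjob.cmdvals.  A SimpleNamespace stands in for a
# GenericWorkflowJob and the placeholder names are hypothetical.
def _example_replace_cmd_vars():
    from types import SimpleNamespace

    fake_job = SimpleNamespace(cmdvals={"qgraphFile": "job_0001.qgraph",
                                        "wmsId": "1234.0"})
    args = _replace_cmd_vars("-g {qgraphFile} --id {wmsId}", fake_job)
    assert args == "-g job_0001.qgraph --id 1234.0"
    return args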

666 

667def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str): 

668 """Add job input files from generic workflow to job. 

669 

670 Parameters 

671 ---------- 

672 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

673 The generic workflow (e.g., has executable name and arguments). 

674 job_name : `str` 

675 Unique name for the job. 

676 use_shared : `bool` 

677 Whether job has access to files via shared filesystem. 

678 out_prefix : `str` 

679 The root directory into which all WMS-specific files are written. 

680 

681 Returns 

682 ------- 

683 htc_commands : `dict` [`str`, `str`] 

684 HTCondor commands for the job submission script. 

685 """ 

686 htc_commands = {} 

687 inputs = [] 

688 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True): 

689 _LOG.debug("src_uri=%s", gwf_file.src_uri) 

690 

691 uri = Path(gwf_file.src_uri) 

692 

693 # Note if use_shared and job_shared, don't need to transfer file. 

694 

695 if not use_shared: # Copy file using push to job 

696 inputs.append(str(uri.relative_to(out_prefix))) 

697 elif not gwf_file.job_shared: # Jobs require own copy 

698 

699 # if using shared filesystem, but still need copy in job. Use 

700 # HTCondor's curl plugin for a local copy. 

701 

702 # Execution butler is represented as a directory which the 

703 # curl plugin does not handle. Taking advantage of inside 

704 # knowledge for temporary fix until have job wrapper that pulls 

705 # files within job. 

706 if gwf_file.name == "butlerConfig": 

707 # The execution butler directory doesn't normally exist until 

708 # the submit phase so checking for suffix instead of using 

709 # is_dir(). If other non-yaml files exist, they would have a

710 # different gwf_file.name. 

711 if uri.suffix == ".yaml": # Single file, so just copy. 

712 inputs.append(f"file://{uri}") 

713 else: 

714 inputs.append(f"file://{uri / 'butler.yaml'}") 

715 inputs.append(f"file://{uri / 'gen3.sqlite3'}") 

716 elif uri.is_dir(): 

717 raise RuntimeError("HTCondor plugin cannot transfer directories locally "

718 f"within job ({gwf_file.src_uri})")

719 else: 

720 inputs.append(f"file://{uri}") 

721 

722 if inputs: 

723 htc_commands["transfer_input_files"] = ",".join(inputs) 

724 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"]) 

725 return htc_commands 

726 
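# Illustrative sketch (not part of the original module): what the submit
# commands returned by _handle_job_inputs() look like.  ``generic_workflow``
# is assumed to contain a job named "job_0001" with one transferable input
# file; names and paths are hypothetical.
def _example_handle_job_inputs(generic_workflow, out_prefix):
    # Push transfer: inputs are listed relative to the submit directory,
    # e.g. {"transfer_input_files": "inputs/job_0001.qgraph"}.
    htc_commands = _handle_job_inputs(generic_workflow, "job_0001",
                                      use_shared=False, out_prefix=out_prefix)
    # Shared filesystem: nothing is transferred unless a job needs its own
    # copy, in which case file:// URIs for HTCondor's curl plugin are used.
    htc_commands = _handle_job_inputs(generic_workflow, "job_0001",
                                      use_shared=True, out_prefix=out_prefix)
    return htc_commands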

727 

728def _report_from_path(wms_path): 

729 """Gather run information from a given run directory. 

730 

731 Parameters 

732 ---------- 

733 wms_path : `str` 

734 The directory containing the submit side files (e.g., HTCondor files). 

735 

736 Returns 

737 ------- 

738 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

739 Run information for the detailed report. The key is the HTCondor id 

740 and the value is a collection of report information for that run. 

741 message : `str` 

742 Message to be printed with the summary report. 

743 """ 

744 wms_workflow_id, jobs, message = _get_info_from_path(wms_path) 

745 if wms_workflow_id == MISSING_ID: 

746 run_reports = {} 

747 else: 

748 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs) 

749 return run_reports, message 

750 

751 

752def _report_from_id(wms_workflow_id, hist): 

753 """Gather run information from a given run directory. 

754 

755 Parameters 

756 ---------- 

757 wms_workflow_id : `int` or `str` 

758 Limit to specific run based on id. 

759 hist : `float` 

760 Limit history search to this many days. 

761 

762 Returns 

763 ------- 

764 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

765 Run information for the detailed report. The key is the HTCondor id 

766 and the value is a collection of report information for that run. 

767 message : `str` 

768 Message to be printed with the summary report. 

769 """ 

770 constraint = f"(DAGManJobId == {int(float(wms_workflow_id))} || ClusterId == " \ 

771 f"{int(float(wms_workflow_id))})" 

772 jobs = condor_q(constraint) 

773 if hist: 

774 epoch = (datetime.now() - timedelta(days=hist)).timestamp() 

775 constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})" 

776 hist_jobs = condor_history(constraint) 

777 _update_jobs(jobs, hist_jobs) 

778 

779 # keys in dictionary will be strings of format "ClusterId.ProcId" 

780 wms_workflow_id = str(wms_workflow_id) 

781 if not wms_workflow_id.endswith(".0"): 

782 wms_workflow_id += ".0" 

783 

784 if wms_workflow_id in jobs: 

785 _, path_jobs, message = _get_info_from_path(jobs[wms_workflow_id]["Iwd"]) 

786 _update_jobs(jobs, path_jobs) 

787 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs) 

788 else: 

789 run_reports = {} 

790 message = f"Found 0 records for run id {wms_workflow_id}" 

791 return run_reports, message 

792 
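# Illustrative sketch (not part of the original module): the queue/history
# constraint _report_from_id() builds for run id 1234 with a one-day history
# window (the id and window are hypothetical).
def _example_report_constraint(wms_workflow_id=1234, hist=1):
    constraint = f"(DAGManJobId == {int(float(wms_workflow_id))} || ClusterId == " \
                 f"{int(float(wms_workflow_id))})"
    # With hist > 0, completed runs newer than the cutoff are also pulled
    # from condor_history.
    epoch = (datetime.now() - timedelta(days=hist)).timestamp()
    constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
    return constraint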

793 

794def _get_info_from_path(wms_path): 

795 """Gather run information from a given run directory. 

796 

797 Parameters 

798 ---------- 

799 wms_path : `str` 

800 Directory containing HTCondor files. 

801 

802 Returns 

803 ------- 

804 wms_workflow_id : `str` 

805 The run id which is a DAGman job id. 

806 jobs : `dict` [`str`, `dict` [`str`, `Any`]] 

807 Information about jobs read from files in the given directory. 

808 The key is the HTCondor id and the value is a dictionary of HTCondor 

809 keys and values. 

810 message : `str` 

811 Message to be printed with the summary report. 

812 """ 

813 try: 

814 wms_workflow_id, jobs = read_dag_log(wms_path) 

815 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs) 

816 _update_jobs(jobs, read_node_status(wms_path)) 

817 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs) 

818 

819 # Add more info for DAGman job 

820 job = jobs[wms_workflow_id] 

821 job.update(read_dag_status(wms_path)) 

822 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs) 

823 if "bps_run" not in job: 

824 _add_run_info(wms_path, job) 

825 

826 message = htc_check_dagman_output(wms_path) 

827 _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id, 

828 jobs[wms_workflow_id]["total_jobs"]) 

829 except StopIteration: 

830 message = f"Could not find HTCondor files in {wms_path}" 

831 _LOG.warning(message) 

832 wms_workflow_id = MISSING_ID 

833 jobs = {} 

834 

835 return wms_workflow_id, jobs, message 

836 

837 

838def _create_detailed_report_from_jobs(wms_workflow_id, jobs): 

839 """Gather run information to be used in generating summary reports. 

840 

841 Parameters 

842 ---------- 

843 wms_workflow_id : `str` 

844 The HTCondor id of the run's DAGMan job.

845 jobs : `dict` [`str`, `dict` [`str`, `Any`]]

846 HTCondor job information for the run, keyed by HTCondor id.

847 

848 Returns 

849 ------- 

850 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

851 Run information for the detailed report. The key is the given HTCondor 

852 id and the value is a collection of report information for that run. 

853 """ 

854 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id]) 

855 dag_job = jobs[wms_workflow_id] 

856 if "total_jobs" not in dag_job or "DAGNodeName" in dag_job: 

857 _LOG.error("Job ID %s is not a DAG job.", wms_workflow_id) 

858 return {} 

859 report = WmsRunReport(wms_id=wms_workflow_id, 

860 path=dag_job["Iwd"], 

861 label=dag_job.get("bps_job_label", "MISS"), 

862 run=dag_job.get("bps_run", "MISS"), 

863 project=dag_job.get("bps_project", "MISS"), 

864 campaign=dag_job.get("bps_campaign", "MISS"), 

865 payload=dag_job.get("bps_payload", "MISS"), 

866 operator=_get_owner(dag_job), 

867 run_summary=_get_run_summary(dag_job), 

868 state=_htc_status_to_wms_state(dag_job), 

869 jobs=[], 

870 total_number_jobs=dag_job["total_jobs"], 

871 job_state_counts=dag_job["state_counts"]) 

872 

873 try: 

874 for job in jobs.values(): 

875 if job["ClusterId"] != int(float(wms_workflow_id)): 

876 job_report = WmsJobReport(wms_id=job["ClusterId"], 

877 name=job.get("DAGNodeName", str(job["ClusterId"])), 

878 label=job.get("bps_job_label", 

879 pegasus_name_to_label(job["DAGNodeName"])), 

880 state=_htc_status_to_wms_state(job)) 

881 if job_report.label == "init": 

882 job_report.label = "pipetaskInit" 

883 report.jobs.append(job_report) 

884 except KeyError as ex: 

885 _LOG.error("Job missing key '%s': %s", str(ex), job) 

886 raise 

887 

888 run_reports = {report.wms_id: report} 

889 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports) 

890 return run_reports 

891 

892 

893def _summary_report(user, hist, pass_thru): 

894 """Gather run information to be used in generating summary reports. 

895 

896 Parameters 

897 ---------- 

898 user : `str` 

899 Run lookup restricted to given user. 

900 hist : `float` 

901 How many previous days to search for run information. 

902 pass_thru : `str` 

903 Advanced users can define the HTCondor constraint to be used 

904 when searching queue and history. 

905 

906 Returns 

907 ------- 

908 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

909 Run information for the summary report. The keys are HTCondor ids and 

910 the values are collections of report information for each run. 

911 message : `str` 

912 Message to be printed with the summary report. 

913 """ 

914 # only doing summary report so only look for dagman jobs 

915 if pass_thru: 

916 constraint = pass_thru 

917 else: 

918 # Notes: 

919 # * bps_isjob == 'True' isn't getting set for DAG jobs that are 

920 # manually restarted. 

921 # * Any job with DAGManJobID isn't a DAG job 

922 constraint = 'bps_isjob == "True" && JobUniverse == 7' 

923 if user: 

924 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")' 

925 

926 # Check runs in queue. 

927 jobs = condor_q(constraint) 

928 

929 if hist: 

930 epoch = (datetime.now() - timedelta(days=hist)).timestamp() 

931 constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})" 

932 hist_jobs = condor_history(constraint) 

933 _update_jobs(jobs, hist_jobs) 

934 

935 _LOG.debug("Job ids from queue and history %s", jobs.keys()) 

936 

937 # Have list of DAGMan jobs, need to get run_report info. 

938 run_reports = {} 

939 for job in jobs.values(): 

940 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

941 # If the information wasn't available from the queue (e.g., Kerberos bug),

942 # try reading from file. 

943 if total_jobs == 0: 

944 try: 

945 job.update(read_dag_status(job["Iwd"])) 

946 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

947 except StopIteration: 

948 pass # don't kill the report if the HTCondor files can't be found

949 

950 if "bps_run" not in job: 

951 _add_run_info(job["Iwd"], job) 

952 report = WmsRunReport(wms_id=str(job.get("ClusterId", MISSING_ID)), 

953 path=job["Iwd"], 

954 label=job.get("bps_job_label", "MISS"), 

955 run=job.get("bps_run", "MISS"), 

956 project=job.get("bps_project", "MISS"), 

957 campaign=job.get("bps_campaign", "MISS"), 

958 payload=job.get("bps_payload", "MISS"), 

959 operator=_get_owner(job), 

960 run_summary=_get_run_summary(job), 

961 state=_htc_status_to_wms_state(job), 

962 jobs=[], 

963 total_number_jobs=total_jobs, 

964 job_state_counts=state_counts) 

965 

966 run_reports[report.wms_id] = report 

967 

968 return run_reports, "" 

969 

970 

971def _add_run_info(wms_path, job): 

972 """Find BPS run information elsewhere for runs without bps attributes. 

973 

974 Parameters 

975 ---------- 

976 wms_path : `str` 

977 Path to submit files for the run. 

978 job : `dict` [`str`, `Any`] 

979 HTCondor dag job information. 

980 

981 Raises 

982 ------ 

983 StopIteration 

984 If the file being looked for cannot be found. Permission errors are

985 caught and job's run is marked with error. 

986 """ 

987 path = Path(wms_path) / "jobs" 

988 try: 

989 subfile = next(path.glob("**/*.sub")) 

990 except (StopIteration, PermissionError): 

991 job["bps_run"] = "Unavailable" 

992 else: 

993 _LOG.debug("_add_run_info: subfile = %s", subfile) 

994 try: 

995 with open(subfile, "r") as fh: 

996 for line in fh: 

997 if line.startswith("+bps_"): 

998 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line) 

999 if m: 

1000 _LOG.debug("Matching line: %s", line) 

1001 job[m.group(1)] = m.group(2).replace('"', "") 

1002 else: 

1003 _LOG.debug("Could not parse attribute: %s", line) 

1004 except PermissionError: 

1005 job["bps_run"] = "PermissionError" 

1006 _LOG.debug("After adding job = %s", job) 

1007 

1008 

1009def _get_owner(job): 

1010 """Get the owner of a dag job. 

1011 

1012 Parameters 

1013 ---------- 

1014 job : `dict` [`str`, `Any`] 

1015 HTCondor dag job information. 

1016 

1017 Returns 

1018 ------- 

1019 owner : `str` 

1020 Owner of the dag job. 

1021 """ 

1022 owner = job.get("bps_operator", None) 

1023 if not owner: 

1024 owner = job.get("Owner", None) 

1025 if not owner: 

1026 _LOG.warning("Could not get Owner from htcondor job: %s", job) 

1027 owner = "MISS" 

1028 return owner 

1029 

1030 

1031def _get_run_summary(job): 

1032 """Get the run summary for a job. 

1033 

1034 Parameters 

1035 ---------- 

1036 job : `dict` [`str`, `Any`] 

1037 HTCondor dag job information. 

1038 

1039 Returns 

1040 ------- 

1041 summary : `str` 

1042 Number of jobs per PipelineTask label in approximate pipeline order. 

1043 Format: <label>:<count>[;<label>:<count>]+ 

1044 """ 

1045 summary = job.get("bps_job_summary", job.get("bps_run_summary", None)) 

1046 if not summary: 

1047 summary, _ = summary_from_dag(job["Iwd"]) 

1048 if not summary: 

1049 _LOG.warning("Could not get run summary for htcondor job: %s", job) 

1050 _LOG.debug("_get_run_summary: summary=%s", summary) 

1051 

1052 # Workaround sometimes using init vs pipetaskInit 

1053 summary = summary.replace("init:", "pipetaskInit:") 

1054 

1055 if "pegasus_version" in job and "pegasus" not in summary: 

1056 summary += ";pegasus:0" 

1057 

1058 return summary 

1059 

1060 

1061def _get_state_counts_from_jobs(wms_workflow_id, jobs): 

1062 """Count number of jobs per WMS state. 

1063 

1064 Parameters 

1065 ---------- 

1066 wms_workflow_id : `str` 

1067 HTCondor job id. 

1068 jobs : `dict` [`str`, `Any`] 

1069 HTCondor dag job information. 

1070 

1071 Returns 

1072 ------- 

1073 total_count : `int` 

1074 Total number of dag nodes. 

1075 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1076 Keys are the different WMS states and values are counts of jobs 

1077 that are in that WMS state. 

1078 """ 

1079 state_counts = dict.fromkeys(WmsStates, 0) 

1080 

1081 for jid, jinfo in jobs.items(): 

1082 if jid != wms_workflow_id: 

1083 state_counts[_htc_status_to_wms_state(jinfo)] += 1 

1084 

1085 total_counted = sum(state_counts.values()) 

1086 if "NodesTotal" in jobs[wms_workflow_id]: 

1087 total_count = jobs[wms_workflow_id]["NodesTotal"] 

1088 else: 

1089 total_count = total_counted 

1090 

1091 state_counts[WmsStates.UNREADY] += total_count - total_counted 

1092 

1093 return total_count, state_counts 

1094 
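# Illustrative sketch (not part of the original module): counting states for a
# three-node DAG in which only two payload jobs have appeared in the logs so
# far.  The ids and ClassAd values are hypothetical.
def _example_state_counts_from_jobs():
    jobs = {
        "100.0": {"NodesTotal": 3},  # the DAGMan job itself
        "101.0": {"ClusterId": 101, "JobStatus": 2},                 # running
        "102.0": {"ClusterId": 102, "JobStatus": 4, "ExitCode": 0},  # completed OK
    }
    total, counts = _get_state_counts_from_jobs("100.0", jobs)
    # total == 3; one RUNNING, one SUCCEEDED, and the node not yet seen in the
    # logs is reported as UNREADY.
    assert total == 3
    assert counts[WmsStates.RUNNING] == 1
    assert counts[WmsStates.SUCCEEDED] == 1
    assert counts[WmsStates.UNREADY] == 1
    return total, counts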

1095 

1096def _get_state_counts_from_dag_job(job): 

1097 """Count number of jobs per WMS state. 

1098 

1099 Parameters 

1100 ---------- 

1101 job : `dict` [`str`, `Any`] 

1102 HTCondor dag job information. 

1103 

1104 Returns 

1105 ------- 

1106 total_count : `int` 

1107 Total number of dag nodes. 

1108 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1109 Keys are the different WMS states and values are counts of jobs 

1110 that are in that WMS state. 

1111 """ 

1112 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job)) 

1113 state_counts = dict.fromkeys(WmsStates, 0) 

1114 if "DAG_NodesReady" in job: 

1115 state_counts = { 

1116 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0), 

1117 WmsStates.READY: job.get("DAG_NodesReady", 0), 

1118 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1119 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0), 

1120 WmsStates.FAILED: job.get("DAG_NodesFailed", 0), 

1121 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)} 

1122 total_jobs = job.get("DAG_NodesTotal") 

1123 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs) 

1124 elif "NodesFailed" in job: 

1125 state_counts = { 

1126 WmsStates.UNREADY: job.get("NodesUnready", 0), 

1127 WmsStates.READY: job.get("NodesReady", 0), 

1128 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1129 WmsStates.SUCCEEDED: job.get("NodesDone", 0), 

1130 WmsStates.FAILED: job.get("NodesFailed", 0), 

1131 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)} 

1132 try: 

1133 total_jobs = job["NodesTotal"]

1134 except KeyError as ex: 

1135 _LOG.error("Job missing %s. job = %s", str(ex), job) 

1136 raise 

1137 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs) 

1138 else: 

1139 # With Kerberos job auth and the Kerberos bug, a warning here would be

1140 # printed for every DAG, so log at debug level instead.

1141 _LOG.debug("Can't get job state counts %s", job["Iwd"]) 

1142 total_jobs = 0 

1143 

1144 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts) 

1145 return total_jobs, state_counts 

1146 

1147 

1148def _htc_status_to_wms_state(job): 

1149 """Convert HTCondor job status to generic wms state. 

1150 

1151 Parameters 

1152 ---------- 

1153 job : `dict` [`str`, `Any`] 

1154 HTCondor job information. 

1155 

1156 Returns 

1157 ------- 

1158 wms_state : `WmsStates` 

1159 The equivalent WmsState to given job's status. 

1160 """ 

1161 wms_state = WmsStates.MISFIT 

1162 if "JobStatus" in job: 

1163 wms_state = _htc_job_status_to_wms_state(job) 

1164 elif "NodeStatus" in job: 

1165 wms_state = _htc_node_status_to_wms_state(job) 

1166 return wms_state 

1167 

1168 

1169def _htc_job_status_to_wms_state(job): 

1170 """Convert HTCondor job status to generic wms state. 

1171 

1172 Parameters 

1173 ---------- 

1174 job : `dict` [`str`, `Any`] 

1175 HTCondor job information. 

1176 

1177 Returns 

1178 ------- 

1179 wms_state : `lsst.ctrl.bps.WmsStates` 

1180 The equivalent WmsState to given job's status. 

1181 """ 

1182 _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], 

1183 type(job["JobStatus"])) 

1184 job_status = int(job["JobStatus"]) 

1185 wms_state = WmsStates.MISFIT 

1186 

1187 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status) 

1188 if job_status == JobStatus.IDLE: 

1189 wms_state = WmsStates.PENDING 

1190 elif job_status == JobStatus.RUNNING: 

1191 wms_state = WmsStates.RUNNING 

1192 elif job_status == JobStatus.REMOVED: 

1193 wms_state = WmsStates.DELETED 

1194 elif job_status == JobStatus.COMPLETED: 

1195 if job.get("ExitBySignal", False) or job.get("ExitCode", 0) or \ 

1196 job.get("ExitSignal", 0) or job.get("DAG_Status", 0) or \ 

1197 job.get("ReturnValue", 0): 

1198 wms_state = WmsStates.FAILED 

1199 else: 

1200 wms_state = WmsStates.SUCCEEDED 

1201 elif job_status == JobStatus.HELD: 

1202 wms_state = WmsStates.HELD 

1203 

1204 return wms_state 

1205 
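# Illustrative sketch (not part of the original module): a held job and a job
# that finished with a non-zero exit code map onto the generic WMS states.
# ClusterIds and ClassAd values are hypothetical.
def _example_job_status_to_wms_state():
    held = {"ClusterId": 200, "JobStatus": 5}
    failed = {"ClusterId": 201, "JobStatus": 4, "ExitCode": 1}
    assert _htc_job_status_to_wms_state(held) == WmsStates.HELD
    assert _htc_job_status_to_wms_state(failed) == WmsStates.FAILED
    return held, failed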

1206 

1207def _htc_node_status_to_wms_state(job): 

1208 """Convert HTCondor status to generic wms state. 

1209 

1210 Parameters 

1211 ---------- 

1212 job : `dict` [`str`, `Any`] 

1213 HTCondor job information. 

1214 

1215 Returns 

1216 ------- 

1217 wms_state : `lsst.ctrl.bps.WmsStates` 

1218 The equivalent WmsState to given node's status. 

1219 """ 

1220 wms_state = WmsStates.MISFIT 

1221 

1222 status = job["NodeStatus"] 

1223 if status == NodeStatus.NOT_READY: 

1224 wms_state = WmsStates.UNREADY 

1225 elif status == NodeStatus.READY: 

1226 wms_state = WmsStates.READY 

1227 elif status == NodeStatus.PRERUN: 

1228 wms_state = WmsStates.MISFIT 

1229 elif status == NodeStatus.SUBMITTED: 

1230 if job["JobProcsHeld"]: 

1231 wms_state = WmsStates.HELD 

1232 elif job["StatusDetails"] == "not_idle": 

1233 wms_state = WmsStates.RUNNING 

1234 elif job["JobProcsQueued"]: 

1235 wms_state = WmsStates.PENDING 

1236 elif status == NodeStatus.POSTRUN: 

1237 wms_state = WmsStates.MISFIT 

1238 elif status == NodeStatus.DONE: 

1239 wms_state = WmsStates.SUCCEEDED 

1240 elif status == NodeStatus.ERROR: 

1241 # Use job exist instead of post script exit 

1242 if "DAGMAN error 0" in job["StatusDetails"]: 

1243 wms_state = WmsStates.SUCCEEDED 

1244 else: 

1245 wms_state = WmsStates.FAILED 

1246 

1247 return wms_state 

1248 

1249 

1250def _update_jobs(jobs1, jobs2): 

1251 """Update jobs1 with info in jobs2. 

1252 

1253 (Basically an update for nested dictionaries.) 

1254 

1255 Parameters 

1256 ---------- 

1257 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]] 

1258 HTCondor job information to be updated. 

1259 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]] 

1260 Additional HTCondor job information. 

1261 """ 

1262 for jid, jinfo in jobs2.items(): 

1263 if jid in jobs1: 

1264 jobs1[jid].update(jinfo) 

1265 else: 

1266 jobs1[jid] = jinfo 

1267 
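# Illustrative sketch (not part of the original module): merging history
# information into queue information with _update_jobs(); ids and ClassAds are
# hypothetical.
def _example_update_jobs():
    queue_jobs = {"100.0": {"JobStatus": 2}}
    history_jobs = {"100.0": {"ExitCode": 0}, "101.0": {"JobStatus": 4}}
    _update_jobs(queue_jobs, history_jobs)
    # Existing entries are updated in place, new ids are added.
    assert queue_jobs == {"100.0": {"JobStatus": 2, "ExitCode": 0},
                          "101.0": {"JobStatus": 4}}
    return queue_jobs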

1268 

1269def _wms_id_to_cluster(wms_id): 

1270 """Convert WMS ID to cluster ID. 

1271 

1272 Parameters 

1273 ---------- 

1274 wms_id : `int` or `float` or `str` 

1275 HTCondor job id or path. 

1276 

1277 Returns 

1278 ------- 

1279 cluster_id : `int` 

1280 HTCondor cluster id. 

1281 """ 

1282 # If wms_id represents path, get numeric id. 

1283 try: 

1284 cluster_id = int(float(wms_id)) 

1285 except ValueError: 

1286 wms_path = Path(wms_id) 

1287 if wms_path.exists(): 

1288 try: 

1289 cluster_id, _ = read_dag_log(wms_id) 

1290 cluster_id = int(float(cluster_id)) 

1291 except StopIteration: 

1292 cluster_id = 0 

1293 else: 

1294 cluster_id = 0 

1295 return cluster_id 

1296 
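# Illustrative sketch (not part of the original module): numeric WMS ids are
# truncated to the HTCondor cluster id; a string that is neither a number nor
# an existing submit directory maps to 0 (the example path is hypothetical).
def _example_wms_id_to_cluster():
    assert _wms_id_to_cluster("1234.0") == 1234
    assert _wms_id_to_cluster(1234) == 1234
    assert _wms_id_to_cluster("/no/such/submit/dir") == 0
    return True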

1297 

1298def _create_request_memory_expr(memory, multiplier): 

1299 """Construct an HTCondor ClassAd expression for safe memory scaling. 

1300 

1301 Parameters 

1302 ---------- 

1303 memory : `int` 

1304 Requested memory in MB. 

1305 multiplier : `float` 

1306 Memory growth rate between retries.

1307 

1308 Returns 

1309 ------- 

1310 ad : `str` 

1311 A string representing an HTCondor ClassAd expression enabling safe 

1312 memory scaling between job retries. 

1313 """ 

1314 # ClassAds 'Last*' are UNDEFINED when a job is put in the job queue. 

1315 # The special comparison operators ensure that all comparisons below will 

1316 # evaluate to FALSE in this case. 

1317 was_mem_exceeded = "LastJobStatus =?= 5 " \ 

1318 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " \ 

1319 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)" 

1320 

1321 # If the job is running for the first time or was held for reasons other

1322 # than exceeding the memory limit, set the requested memory to the greater

1323 # of the requested value and the memory usage already measured by HTCondor

1324 # (MemoryUsage).

1325 ad = f"({was_mem_exceeded}) " \ 

1326 f"? int({memory} * pow({multiplier}, NumJobStarts)) " \ 

1327 f": max({{{memory}, MemoryUsage ?: 0}}))" 

1328 return ad
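

# Illustrative sketch (not part of the original module): the expression
# produced for a job requesting 2048 MB with a memory multiplier of 2.
def _example_request_memory_expr():
    ad = _create_request_memory_expr(2048, 2.0)
    # After a memory-related hold the request grows geometrically with the
    # number of starts, int(2048 * pow(2.0, NumJobStarts)); otherwise it is
    # max({2048, MemoryUsage ?: 0}), i.e. the larger of the requested value
    # and the usage HTCondor has already measured.
    return ad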