
# This file is part of ctrl_bps.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

"""Interface between the generic workflow and the HTCondor workflow system.
"""

__all__ = ["HTCondorService", "HTCondorWorkflow"]


import dataclasses
import logging
import os
import re
from enum import IntEnum, auto
from pathlib import Path

import htcondor

from ... import (
    BaseWmsWorkflow,
    BaseWmsService,
    GenericWorkflow,
    GenericWorkflowJob,
    WmsRunReport,
    WmsJobReport,
    WmsStates,
)
from ...bps_utils import (
    chdir,
    create_count_summary,
)
from .lssthtc import (
    HTCDag,
    HTCJob,
    MISSING_ID,
    JobStatus,
    NodeStatus,
    htc_check_dagman_output,
    htc_escape,
    htc_submit_dag,
    read_dag_info,
    read_dag_log,
    read_dag_status,
    read_node_status,
    condor_q,
    condor_search,
    condor_status,
    pegasus_name_to_label,
    summary_from_dag,
)


class WmsIdType(IntEnum):
    """Type of valid WMS ids.
    """

    UNKNOWN = auto()
    """The type of the id cannot be determined.
    """

    LOCAL = auto()
    """The id is an HTCondor job's ClusterId (with an optional '.ProcId').
    """

    GLOBAL = auto()
    """The id is an HTCondor global job id.
    """

    PATH = auto()
    """The id is a submission path.
    """
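
# For illustration only (not part of the original module), the id forms
# accepted by this plugin look roughly like:
#
#     WmsIdType.LOCAL:  "1234" or "1234.0"           (ClusterId[.ProcId])
#     WmsIdType.GLOBAL: "sched1.example.com#1234.0#1600000000"
#                                                    (HTCondor GlobalJobId)
#     WmsIdType.PATH:   "/path/to/submit/run1"       (submission directory)
#
# The GLOBAL example assumes HTCondor's usual "<schedd>#<cluster>.<proc>#<ts>"
# layout; see _wms_id_type() below for how the type is actually inferred.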


DEFAULT_HTC_EXEC_PATT = ".*worker.*"
"""Default pattern for searching execute machines in an HTCondor pool.
"""

_LOG = logging.getLogger(__name__)


class HTCondorService(BaseWmsService):
    """HTCondor version of WMS service.
    """
    def prepare(self, config, generic_workflow, out_prefix=None):
        """Convert generic workflow to an HTCondor DAG ready for submission.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The generic workflow (e.g., has executable name and arguments).
        out_prefix : `str`
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow`
            HTCondor workflow ready to be run.
        """
        _LOG.debug("out_prefix = '%s'", out_prefix)
        workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix,
                                                          f"{self.__class__.__module__}."
                                                          f"{self.__class__.__name__}")
        workflow.write(out_prefix)
        return workflow

    def submit(self, workflow):
        """Submit a single HTCondor workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWorkflow`
            A single HTCondor workflow to submit. run_id is updated after
            successful submission to WMS.
        """
        # For workflow portability, internal paths are all relative. Hence
        # the DAG needs to be submitted to HTCondor from inside the submit
        # directory.
        with chdir(workflow.submit_path):
            _LOG.info("Submitting from directory: %s", os.getcwd())
            htc_submit_dag(workflow.dag, {})
            workflow.run_id = workflow.dag.run_id
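
    # Usage sketch (illustrative only, not part of the original module):
    # assuming a populated BpsConfig and GenericWorkflow, a submission
    # roughly follows prepare() then submit():
    #
    #     service = HTCondorService(config)
    #     workflow = service.prepare(config, generic_workflow,
    #                                out_prefix="submit/my_run")
    #     service.submit(workflow)
    #     print(workflow.run_id)   # HTCondor id of the DAGMan job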

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create a list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.
        is_global : `bool`, optional
            If set, all job queues (and their histories) will be queried for
            job information. Defaults to False which means that only the local
            job queue will be queried.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions. Typically
            this means top-level jobs (i.e., not children jobs).
        """
        _LOG.debug("list_submitted_jobs params: "
                   "wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s",
                   wms_id, user, require_bps, pass_thru, is_global)

        # Determine which Schedds will be queried for job information.
        coll = htcondor.Collector()

        schedd_ads = []
        if is_global:
            schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
        else:
            schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))

        # Construct appropriate constraint expression using provided arguments.
        constraint = "False"
        if wms_id is None:
            if user is not None:
                constraint = f'(Owner == "{user}")'
        else:
            schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id)
            if cluster_id is not None:
                constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"

                # If provided id is either a submission path or a global id,
                # make sure the right Schedd will be queried regardless of
                # 'is_global' value.
                if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}:
                    schedd_ads = [schedd_ad]
        if require_bps:
            constraint += ' && (bps_isjob == "True")'
        if pass_thru:
            if "-forcex" in pass_thru:
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f" && ({pass_thru_2})"
            else:
                constraint += f" && ({pass_thru})"

        # Create a list of scheduler daemons which need to be queried.
        schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}

        _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds))
        results = condor_q(constraint=constraint, schedds=schedds)

        # Prune child jobs where DAG job is in queue (i.e., aren't orphans).
        job_ids = []
        for schedd_name, job_info in results.items():
            for job_id, job_ad in job_info.items():
                _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None"))
                if "DAGManJobId" not in job_ad:
                    job_ids.append(job_ad.get("GlobalJobId", job_id))
                else:
                    _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0")
                    _LOG.debug("\tin jobs.keys() = %s", job_info.keys())
                    if f"{job_ad['DAGManJobId']}.0" not in job_info:  # orphaned job
                        job_ids.append(job_ad.get("GlobalJobId", job_id))

        _LOG.debug("job_ids = %s", job_ids)
        return job_ids
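
    # Illustrative only (not in the original module): for wms_id="1234" with
    # require_bps=True and no pass_thru, the constraint assembled above is
    #
    #     (DAGManJobId == 1234 || ClusterId == 1234) && (bps_isjob == "True")
    #
    # which matches both the DAGMan job itself and its node jobs.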

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False):
        """Return run information based upon given constraints.

        Parameters
        ----------
        wms_workflow_id : `str`, optional
            Limit to specific run based on id.
        user : `str`, optional
            Limit results to runs for this user.
        hist : `float`, optional
            Limit history search to this many days. Defaults to 0.
        pass_thru : `str`, optional
            Constraints to pass through to HTCondor.
        is_global : `bool`, optional
            If set, all job queues (and their histories) will be queried for
            job information. Defaults to False which means that only the local
            job queue will be queried.

        Returns
        -------
        runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
            Information about runs from given job information.
        message : `str`
            Extra message for report command to print. This could be pointers
            to documentation or to WMS specific commands.
        """
        if wms_workflow_id:
            id_type = _wms_id_type(wms_workflow_id)
            if id_type == WmsIdType.LOCAL:
                schedulers = _locate_schedds(locate_all=is_global)
                run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers)
            elif id_type == WmsIdType.GLOBAL:
                schedulers = _locate_schedds(locate_all=True)
                run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers)
            elif id_type == WmsIdType.PATH:
                run_reports, message = _report_from_path(wms_workflow_id)
            else:
                run_reports, message = {}, "Invalid job id"
        else:
            schedulers = _locate_schedds(locate_all=is_global)
            run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers)
        _LOG.debug("report: %s, %s", run_reports, message)

        return list(run_reports.values()), message

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            Id or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether successful deletion or not. Currently, if any doubt or any
            individual jobs not deleted, return False.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        _LOG.debug("Canceling wms_id = %s", wms_id)

        schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id)

        if cluster_id is None:
            deleted = False
            message = "invalid id"
        else:
            _LOG.debug("Canceling job managed by schedd_name = %s with cluster_id = %s",
                       schedd_ad["Name"], cluster_id)
            schedd = htcondor.Schedd(schedd_ad)

            constraint = f"ClusterId == {cluster_id}"
            if pass_thru is not None and "-forcex" in pass_thru:
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f" && ({pass_thru_2})"
                _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.RemoveX, constraint)
            else:
                if pass_thru:
                    constraint += f" && ({pass_thru})"
                _LOG.debug("JobAction.Remove constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.Remove, constraint)
            _LOG.debug("Remove results: %s", results)

            if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
                deleted = True
                message = ""
            else:
                deleted = False
                if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
                    message = "no such bps job in batch queue"
                else:
                    message = f"unknown problems deleting: {results}"

        _LOG.debug("deleted: %s; message = %s", deleted, message)
        return deleted, message


class HTCondorWorkflow(BaseWmsWorkflow):
    """Single HTCondor workflow.

    Parameters
    ----------
    name : `str`
        Unique name for Workflow used when naming files.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """
    def __init__(self, name, config=None):
        super().__init__(name, config)
        self.dag = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited
        htc_workflow = cls(generic_workflow.name, config)
        htc_workflow.dag = HTCDag(name=generic_workflow.name)

        _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs({"bps_wms_service": service_class,
                                      "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
                                      "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
                                      "bps_job_summary": create_count_summary(generic_workflow.job_counts)})

        # Determine the hard limit for the memory requirement.
        found, limit = config.search("memoryLimit")
        if not found:
            search_opts = {"default": DEFAULT_HTC_EXEC_PATT}
            _, site = config.search("computeSite")
            if site:
                search_opts["curvals"] = {"curr_site": site}
            _, patt = config.search("executeMachinesPattern", opt=search_opts)

            # To reduce the amount of data, ignore dynamic slots (if any) as,
            # by definition, they cannot have more memory than
            # the partitionable slot they are part of.
            constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)'
            pool_info = condor_status(constraint=constraint)
            try:
                limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values())
            except ValueError:
                _LOG.debug("No execute machine in the pool matches %s", patt)
        if limit:
            config[".bps_defined.memory_limit"] = limit

        # Create all DAG jobs
        for job_name in generic_workflow:
            gwjob = generic_workflow.get_job(job_name)
            htc_job = HTCondorWorkflow._create_job(config, generic_workflow, gwjob, out_prefix)
            htc_workflow.dag.add_job(htc_job)

        # Add job dependencies to the DAG
        for job_name in generic_workflow:
            htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))

        # If final job exists in generic workflow, create DAG final job
        final = generic_workflow.get_final()
        if final and isinstance(final, GenericWorkflowJob):
            final_htjob = HTCondorWorkflow._create_job(config, generic_workflow, final, out_prefix)
            if "post" not in final_htjob.dagcmds:
                final_htjob.dagcmds["post"] = f"{os.path.dirname(__file__)}/final_post.sh" \
                                              f" {final.name} $DAG_STATUS $RETURN"
            htc_workflow.dag.add_final_job(final_htjob)
        elif final and isinstance(final, GenericWorkflow):
            raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
        elif final:
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

        return htc_workflow

    @staticmethod
    def _create_job(config, generic_workflow, gwjob, out_prefix):
        """Convert GenericWorkflow job nodes to DAG jobs.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow that is being converted.
        gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
            The generic job to convert to an HTCondor job.
        out_prefix : `str`
            Directory prefix for HTCondor files.

        Returns
        -------
        htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
            The HTCondor job equivalent to the given generic job.
        """
        htc_job = HTCJob(gwjob.name, label=gwjob.label)

        curvals = dataclasses.asdict(gwjob)
        if gwjob.tags:
            curvals.update(gwjob.tags)
        found, subdir = config.search("subDirTemplate", opt={"curvals": curvals})
        if not found:
            subdir = "jobs"
        htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"

        htc_job_cmds = {
            "universe": "vanilla",
            "should_transfer_files": "YES",
            "when_to_transfer_output": "ON_EXIT_OR_EVICT",
            "transfer_output_files": '""',  # Set to empty string to disable
            "transfer_executable": "False",
            "getenv": "True",

            # Exceeding memory sometimes triggers a SIGBUS error. Tell
            # HTCondor to put SIGBUS jobs on hold.
            "on_exit_hold": "(ExitBySignal == true) && (ExitSignal == 7)",
            "on_exit_hold_reason": '"Job raised a signal 7. Usually means job has gone over memory limit."',
            "on_exit_hold_subcode": "34"
        }

        htc_job_cmds.update(_translate_job_cmds(config, generic_workflow, gwjob))

        # Job stdout, stderr, and the HTCondor user log.
        for key in ("output", "error", "log"):
            htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
            _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])

        _, use_shared = config.search("bpsUseShared", opt={"default": False})
        htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwjob.name, use_shared, out_prefix))

        # Add the job cmds dict to the job object.
        htc_job.add_job_cmds(htc_job_cmds)

        htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))

        # Add job attributes to job.
        _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
        htc_job.add_job_attrs(gwjob.attrs)
        htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
        htc_job.add_job_attrs({"bps_job_name": gwjob.name,
                               "bps_job_label": gwjob.label})

        return htc_job

    def write(self, out_prefix):
        """Output HTCondor DAGMan files needed for workflow submission.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for HTCondor files.
        """
        self.submit_path = out_prefix
        os.makedirs(out_prefix, exist_ok=True)

        # Write down the workflow in HTCondor format. Note that the second
        # argument is a per-job subdirectory template (the braces are filled
        # in downstream), not an f-string.
        self.dag.write(out_prefix, "jobs/{self.label}")


def _translate_job_cmds(config, generic_workflow, gwjob):
    """Translate the job data that are one-to-one mappings to HTCondor
    submit commands.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains the job being converted.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Generic workflow job to be converted.

    Returns
    -------
    htc_job_commands : `dict` [`str`, `Any`]
        Contains commands which can appear in the HTCondor submit description
        file.
    """
    # Values in the job script that are just name mappings.
    job_translation = {"mail_to": "notify_user",
                       "when_to_mail": "notification",
                       "request_cpus": "request_cpus",
                       "priority": "priority",
                       "category": "category"}

    jobcmds = {}
    for gwkey, htckey in job_translation.items():
        jobcmds[htckey] = getattr(gwjob, gwkey, None)

    # Job commands that need modification.
    if gwjob.number_of_retries:
        jobcmds["max_retries"] = f"{gwjob.number_of_retries}"

    if gwjob.retry_unless_exit:
        jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"

    if gwjob.request_disk:
        jobcmds["request_disk"] = f"{gwjob.request_disk}MB"

    if gwjob.request_memory:
        jobcmds["request_memory"] = f"{gwjob.request_memory}"

    if gwjob.memory_multiplier:
        # Do not use try-except! At the moment, BpsConfig returns an empty
        # string if it does not contain the key.
        memory_limit = config[".bps_defined.memory_limit"]
        if not memory_limit:
            raise RuntimeError("Memory autoscaling enabled, but automatic detection of the memory limit "
                               "failed; setting it explicitly with 'memoryLimit' or changing worker node "
                               "search pattern 'executeMachinesPattern' might help.")
        jobcmds["request_memory"] = _create_request_memory_expr(gwjob.request_memory,
                                                                gwjob.memory_multiplier)

        # Periodically release jobs which are being held due to exceeding
        # memory. Stop doing that (by removing the job from the HTCondor
        # queue) after the maximal number of retries has been reached or the
        # memory requirements cannot be satisfied.
        jobcmds["periodic_release"] = \
            "NumJobStarts <= JobMaxRetries && (HoldReasonCode == 34 || HoldReasonSubCode == 34)"
        jobcmds["periodic_remove"] = \
            f"JobStatus == 1 && RequestMemory > {memory_limit} || " \
            f"JobStatus == 5 && NumJobStarts > JobMaxRetries"

    # Assume concurrency_limit implemented using HTCondor concurrency limits.
    # May need to move to special site-specific implementation if sites use
    # other mechanisms.
    if gwjob.concurrency_limit:
        jobcmds["concurrency_limit"] = gwjob.concurrency_limit

    # Handle command line.
    if gwjob.executable.transfer_executable:
        jobcmds["transfer_executable"] = "True"
        jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
    else:
        jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)

    if gwjob.arguments:
        arguments = gwjob.arguments
        arguments = _replace_cmd_vars(arguments, gwjob)
        arguments = _replace_file_vars(config, arguments, generic_workflow, gwjob)
        arguments = _fix_env_var_syntax(arguments)
        jobcmds["arguments"] = arguments

    # Add extra "pass-thru" job commands.
    if gwjob.profile:
        for key, val in gwjob.profile.items():
            jobcmds[key] = htc_escape(val)

    return jobcmds
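
# Illustrative only (not part of the original module): for a generic job with
# request_memory=2048 and number_of_retries=3, _translate_job_cmds() yields
# (among others) submit commands roughly like (hypothetical executable path):
#
#     {"max_retries": "3",
#      "request_memory": "2048",
#      "executable": "/path/to/run_quantum.sh",
#      "arguments": "..."}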


def _translate_dag_cmds(gwjob):
    """Translate job values into DAGMan commands.

    Parameters
    ----------
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job containing values to be translated.

    Returns
    -------
    dagcmds : `dict` [`str`, `Any`]
        DAGMan commands for the job.
    """
    # Values in the dag script that are just name mappings.
    dag_translation = {"abort_on_value": "abort_dag_on",
                       "abort_return_value": "abort_exit"}

    dagcmds = {}
    for gwkey, htckey in dag_translation.items():
        dagcmds[htckey] = getattr(gwjob, gwkey, None)

    # Still to be coded: vars "pre_cmdline", "post_cmdline"
    return dagcmds


def _fix_env_var_syntax(oldstr):
    """Change ENV placeholders to HTCondor Env var syntax.

    Parameters
    ----------
    oldstr : `str`
        String in which environment variable syntax is to be fixed.

    Returns
    -------
    newstr : `str`
        Given string with environment variable syntax fixed.
    """
    newstr = oldstr
    for key in re.findall(r"<ENV:([^>]+)>", oldstr):
        newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
    return newstr
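
# Illustrative only (not in the original module): the rewrite above maps, e.g.,
#
#     _fix_env_var_syntax("<ENV:HOME>/repo/butler.yaml")
#     # -> "$ENV(HOME)/repo/butler.yaml"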


def _replace_file_vars(config, arguments, workflow, gwjob):
    """Replace file placeholders in command line arguments with correct
    physical file names.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information.
    arguments : `str`
        Arguments string in which to replace file placeholders.
    workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains file information.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        The job corresponding to the arguments.

    Returns
    -------
    arguments : `str`
        Given arguments string with file placeholders replaced.
    """
    _, use_shared = config.search("bpsUseShared", opt={"default": False})

    # Replace input file placeholders with paths.
    for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
        if not gwfile.wms_transfer:
            # Must assume full URI if in command line and told WMS is not
            # responsible for transferring file.
            uri = gwfile.src_uri
        elif use_shared:
            if gwfile.job_shared:
                # Have shared filesystems and jobs can share file.
                uri = gwfile.src_uri
            else:
                # Taking advantage of inside knowledge. Not future-proof.
                # Temporary fix until have job wrapper that pulls files
                # within job.
                if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml":
                    uri = "butler.yaml"
                else:
                    uri = os.path.basename(gwfile.src_uri)
        else:  # Using push transfer
            uri = os.path.basename(gwfile.src_uri)
        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)

    # Replace output file placeholders with paths.
    for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
        if not gwfile.wms_transfer:
            # Must assume full URI if in command line and told WMS is not
            # responsible for transferring file.
            uri = gwfile.src_uri
        elif use_shared:
            if gwfile.job_shared:
                # Have shared filesystems and jobs can share file.
                uri = gwfile.src_uri
            else:
                uri = os.path.basename(gwfile.src_uri)
        else:  # Using push transfer
            uri = os.path.basename(gwfile.src_uri)
        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
    return arguments


def _replace_cmd_vars(arguments, gwjob):
    """Replace format-style placeholders in arguments.

    Parameters
    ----------
    arguments : `str`
        Arguments string in which to replace placeholders.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job containing values to be used to replace placeholders
        (in particular gwjob.cmdvals).

    Returns
    -------
    arguments : `str`
        Given arguments string with placeholders replaced.
    """
    try:
        arguments = arguments.format(**gwjob.cmdvals)
    except (KeyError, TypeError):  # TypeError in case None instead of {}
        _LOG.error("Could not replace command variables:\n"
                   "arguments: %s\n"
                   "cmdvals: %s", arguments, gwjob.cmdvals)
        raise
    return arguments
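
# Illustrative only (not in the original module): placeholders are filled via
# str.format() with the job's cmdvals, e.g. (hypothetical values)
#
#     gwjob.cmdvals = {"qgraph_file": "run1.qgraph", "node_ids": "3,7"}
#     _replace_cmd_vars("-g {qgraph_file} --qgraph-node-id {node_ids}", gwjob)
#     # -> "-g run1.qgraph --qgraph-node-id 3,7"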


def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str):
    """Add job input files from generic workflow to job.

    Parameters
    ----------
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        The generic workflow (e.g., has executable name and arguments).
    job_name : `str`
        Unique name for the job.
    use_shared : `bool`
        Whether job has access to files via shared filesystem.
    out_prefix : `str`
        The root directory into which all WMS-specific files are written.

    Returns
    -------
    htc_commands : `dict` [`str`, `str`]
        HTCondor commands for the job submission script.
    """
    htc_commands = {}
    inputs = []
    for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
        _LOG.debug("src_uri=%s", gwf_file.src_uri)

        uri = Path(gwf_file.src_uri)

        # Note if use_shared and job_shared, don't need to transfer file.

        if not use_shared:  # Copy file using push to job.
            inputs.append(str(uri.relative_to(out_prefix)))
        elif not gwf_file.job_shared:  # Jobs require own copy.
            # Using shared filesystem, but still need a copy in the job. Use
            # HTCondor's curl plugin for a local copy.

            # Execution butler is represented as a directory which the
            # curl plugin does not handle. Taking advantage of inside
            # knowledge for temporary fix until have job wrapper that pulls
            # files within job.
            if gwf_file.name == "butlerConfig":
                # The execution butler directory doesn't normally exist until
                # the submit phase so checking for suffix instead of using
                # is_dir(). If another non-yaml file exists, it would have a
                # different gwf_file.name.
                if uri.suffix == ".yaml":  # Single file, so just copy.
                    inputs.append(f"file://{uri}")
                else:
                    inputs.append(f"file://{uri / 'butler.yaml'}")
                    inputs.append(f"file://{uri / 'gen3.sqlite3'}")
            elif uri.is_dir():
                raise RuntimeError(f"HTCondor plugin cannot transfer directories locally within job "
                                   f"({gwf_file.src_uri})")
            else:
                inputs.append(f"file://{uri}")

    if inputs:
        htc_commands["transfer_input_files"] = ",".join(inputs)
        _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
    return htc_commands
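
# Illustrative only (not in the original module): with push transfer
# (use_shared=False) and two inputs under the submit directory, the resulting
# command is roughly (hypothetical paths)
#
#     {"transfer_input_files": "inputs/run1.qgraph,inputs/butler.yaml"}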


def _report_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        The directory containing the submit side files (e.g., HTCondor files).

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
    if wms_workflow_id == MISSING_ID:
        run_reports = {}
    else:
        run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
    return run_reports, message


def _report_from_id(wms_workflow_id, hist, schedds=None):
    """Gather run information using the workflow id.

    Parameters
    ----------
    wms_workflow_id : `str`
        Limit to specific run based on id.
    hist : `float`
        Limit history search to this many days.
    schedds : `dict` [`str`, `htcondor.Schedd`], optional
        HTCondor schedulers to query for job information. If None
        (default), all queries will be run against the local scheduler only.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    dag_constraint = 'regexp("dagman$", Cmd)'
    try:
        cluster_id = int(float(wms_workflow_id))
    except ValueError:
        dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"'
    else:
        dag_constraint += f" && ClusterId == {cluster_id}"

    # With the current implementation of the condor_* functions the query will
    # always return only one match per Scheduler.
    #
    # Even in the highly unlikely situation where HTCondor history (which
    # condor_search queries too) is long enough to have jobs from before the
    # cluster ids were rolled over (and as a result there is more than one job
    # with the same cluster id) they will not show up in the results.
    schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds)
    if len(schedd_dag_info) == 0:
        run_reports = {}
        message = ""
    elif len(schedd_dag_info) == 1:
        _, dag_info = schedd_dag_info.popitem()
        dag_id, dag_ad = dag_info.popitem()

        # Create a mapping between jobs and their classads. The keys will be
        # of format 'ClusterId.ProcId'.
        job_info = {dag_id: dag_ad}

        # Find jobs (nodes) belonging to that DAGMan job.
        job_constraint = f"DAGManJobId == {int(float(dag_id))}"
        schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds)
        _, node_info = schedd_job_info.popitem()
        job_info.update(node_info)

        # Collect additional pieces of information about jobs using HTCondor
        # files in the submission directory.
        _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"])
        _update_jobs(job_info, path_jobs)

        run_reports = _create_detailed_report_from_jobs(dag_id, job_info)
        message = ""
    else:
        ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()]
        run_reports = {}
        message = f"More than one job matches id '{wms_workflow_id}', " \
                  f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids"
    return run_reports, message


def _get_info_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        Directory containing HTCondor files.

    Returns
    -------
    wms_workflow_id : `str`
        The run id which is a DAGMan job id.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Information about jobs read from files in the given directory.
        The key is the HTCondor id and the value is a dictionary of HTCondor
        keys and values.
    message : `str`
        Message to be printed with the summary report.
    """
    messages = []
    try:
        wms_workflow_id, jobs = read_dag_log(wms_path)
        _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
        _update_jobs(jobs, read_node_status(wms_path))
        _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)

        # Add more info for the DAGMan job.
        job = jobs[wms_workflow_id]
        job.update(read_dag_status(wms_path))

        job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
        if "bps_run" not in job:
            _add_run_info(wms_path, job)

        message = htc_check_dagman_output(wms_path)
        if message:
            messages.append(message)
        _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id,
                   jobs[wms_workflow_id]["total_jobs"])

        # Add extra pieces of information which cannot be found in HTCondor
        # generated files, like 'GlobalJobId'.
        #
        # Do not treat absence of this file as a serious error. Neither runs
        # submitted with earlier versions of the plugin nor runs submitted
        # with the Pegasus plugin will have it at the moment. However, once
        # enough time passes and the Pegasus plugin has its own report()
        # method (instead of sneakily using HTCondor's one), the lack of that
        # file should be treated as seriously as the lack of any other file.
        try:
            job_info = read_dag_info(wms_path)
        except FileNotFoundError as exc:
            message = f"Warn: Some information may not be available: {exc}"
            messages.append(message)
        else:
            schedd_name = next(iter(job_info))
            job_ad = next(iter(job_info[schedd_name].values()))
            job.update(job_ad)
    except FileNotFoundError:
        message = f"Could not find HTCondor files in '{wms_path}'"
        _LOG.warning(message)
        messages.append(message)
        wms_workflow_id = MISSING_ID
        jobs = {}

    message = "\n".join([msg for msg in messages if msg])
    return wms_workflow_id, jobs, message


def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
    """Gather run information to be used in generating summary reports.

    Parameters
    ----------
    wms_workflow_id : `str`
        The run id to create the report for.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Mapping HTCondor job id to job information.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the given HTCondor
        id and the value is a collection of report information for that run.
    """
    _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
    dag_job = jobs[wms_workflow_id]
    report = WmsRunReport(wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}",
                          global_wms_id=dag_job.get("GlobalJobId", "MISS"),
                          path=dag_job["Iwd"],
                          label=dag_job.get("bps_job_label", "MISS"),
                          run=dag_job.get("bps_run", "MISS"),
                          project=dag_job.get("bps_project", "MISS"),
                          campaign=dag_job.get("bps_campaign", "MISS"),
                          payload=dag_job.get("bps_payload", "MISS"),
                          operator=_get_owner(dag_job),
                          run_summary=_get_run_summary(dag_job),
                          state=_htc_status_to_wms_state(dag_job),
                          jobs=[],
                          total_number_jobs=dag_job["total_jobs"],
                          job_state_counts=dag_job["state_counts"])

    for job_id, job_info in jobs.items():
        try:
            if job_info["ClusterId"] != int(float(wms_workflow_id)):
                job_report = WmsJobReport(wms_id=job_id,
                                          name=job_info.get("DAGNodeName", job_id),
                                          label=job_info.get("bps_job_label",
                                                             pegasus_name_to_label(job_info["DAGNodeName"])),
                                          state=_htc_status_to_wms_state(job_info))
                if job_report.label == "init":
                    job_report.label = "pipetaskInit"
                report.jobs.append(job_report)
        except KeyError as ex:
            _LOG.error("Job missing key '%s': %s", str(ex), job_info)
            raise

    run_reports = {report.wms_id: report}
    _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
    return run_reports


def _summary_report(user, hist, pass_thru, schedds=None):
    """Gather run information to be used in generating summary reports.

    Parameters
    ----------
    user : `str`
        Run lookup restricted to given user.
    hist : `float`
        How many previous days to search for run information.
    pass_thru : `str`
        Advanced users can define the HTCondor constraint to be used
        when searching queue and history.
    schedds : `dict` [`str`, `htcondor.Schedd`], optional
        HTCondor schedulers to query for job information. If None
        (default), all queries will be run against the local scheduler only.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the summary report. The keys are HTCondor ids and
        the values are collections of report information for each run.
    message : `str`
        Message to be printed with the summary report.
    """
    # Only doing a summary report, so only look for DAGMan jobs.
    if pass_thru:
        constraint = pass_thru
    else:
        # Notes:
        # * bps_isjob == 'True' isn't getting set for DAG jobs that are
        #   manually restarted.
        # * Any job with DAGManJobID isn't a DAG job.
        constraint = 'bps_isjob == "True" && JobUniverse == 7'
        if user:
            constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'

    job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds)

    # Have list of DAGMan jobs, need to get run_report info.
    run_reports = {}
    for jobs in job_info.values():
        for job_id, job in jobs.items():
            total_jobs, state_counts = _get_state_counts_from_dag_job(job)
            # If didn't get from queue information (e.g., Kerberos bug),
            # try reading from file.
            if total_jobs == 0:
                try:
                    job.update(read_dag_status(job["Iwd"]))
                    total_jobs, state_counts = _get_state_counts_from_dag_job(job)
                except StopIteration:
                    pass  # Don't kill the report if HTCondor files can't be found.

            if "bps_run" not in job:
                _add_run_info(job["Iwd"], job)
            report = WmsRunReport(wms_id=job_id,
                                  global_wms_id=job["GlobalJobId"],
                                  path=job["Iwd"],
                                  label=job.get("bps_job_label", "MISS"),
                                  run=job.get("bps_run", "MISS"),
                                  project=job.get("bps_project", "MISS"),
                                  campaign=job.get("bps_campaign", "MISS"),
                                  payload=job.get("bps_payload", "MISS"),
                                  operator=_get_owner(job),
                                  run_summary=_get_run_summary(job),
                                  state=_htc_status_to_wms_state(job),
                                  jobs=[],
                                  total_number_jobs=total_jobs,
                                  job_state_counts=state_counts)
            run_reports[report.global_wms_id] = report

    return run_reports, ""


def _add_run_info(wms_path, job):
    """Find BPS run information elsewhere for runs without bps attributes.

    Parameters
    ----------
    wms_path : `str`
        Path to submit files for the run.
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Notes
    -----
    If the submit file cannot be found or read, the job's ``bps_run``
    attribute is set to a placeholder value instead of raising an exception.
    """
    path = Path(wms_path) / "jobs"
    try:
        subfile = next(path.glob("**/*.sub"))
    except (StopIteration, PermissionError):
        job["bps_run"] = "Unavailable"
    else:
        _LOG.debug("_add_run_info: subfile = %s", subfile)
        try:
            with open(subfile, "r") as fh:
                for line in fh:
                    if line.startswith("+bps_"):
                        m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
                        if m:
                            _LOG.debug("Matching line: %s", line)
                            job[m.group(1)] = m.group(2).replace('"', "")
                        else:
                            _LOG.debug("Could not parse attribute: %s", line)
        except PermissionError:
            job["bps_run"] = "PermissionError"
    _LOG.debug("After adding job = %s", job)


def _get_owner(job):
    """Get the owner of a dag job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    owner : `str`
        Owner of the dag job.
    """
    owner = job.get("bps_operator", None)
    if not owner:
        owner = job.get("Owner", None)
        if not owner:
            _LOG.warning("Could not get Owner from htcondor job: %s", job)
            owner = "MISS"
    return owner


def _get_run_summary(job):
    """Get the run summary for a job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    summary : `str`
        Number of jobs per PipelineTask label in approximate pipeline order.
        Format: <label>:<count>[;<label>:<count>]+
    """
    summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
    if not summary:
        summary, _ = summary_from_dag(job["Iwd"])
        if not summary:
            _LOG.warning("Could not get run summary for htcondor job: %s", job)
    _LOG.debug("_get_run_summary: summary=%s", summary)

    # Work around some runs using 'init' instead of 'pipetaskInit'.
    summary = summary.replace("init:", "pipetaskInit:")

    if "pegasus_version" in job and "pegasus" not in summary:
        summary += ";pegasus:0"

    return summary


def _get_state_counts_from_jobs(wms_workflow_id, jobs):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    wms_workflow_id : `str`
        HTCondor job id.
    jobs : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    state_counts = dict.fromkeys(WmsStates, 0)

    for jid, jinfo in jobs.items():
        if jid != wms_workflow_id:
            state_counts[_htc_status_to_wms_state(jinfo)] += 1

    total_counted = sum(state_counts.values())
    if "NodesTotal" in jobs[wms_workflow_id]:
        total_count = jobs[wms_workflow_id]["NodesTotal"]
    else:
        total_count = total_counted

    state_counts[WmsStates.UNREADY] += total_count - total_counted

    return total_count, state_counts


def _get_state_counts_from_dag_job(job):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
    state_counts = dict.fromkeys(WmsStates, 0)
    if "DAG_NodesReady" in job:
        state_counts = {
            WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
            WmsStates.READY: job.get("DAG_NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
            WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
            WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)}
        total_jobs = job.get("DAG_NodesTotal")
        _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
    elif "NodesFailed" in job:
        state_counts = {
            WmsStates.UNREADY: job.get("NodesUnready", 0),
            WmsStates.READY: job.get("NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("NodesDone", 0),
            WmsStates.FAILED: job.get("NodesFailed", 0),
            WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)}
        try:
            # Use indexing here: dict.get() never raises KeyError, which
            # would make the except clause below dead code.
            total_jobs = job["NodesTotal"]
        except KeyError as ex:
            _LOG.error("Job missing %s. job = %s", str(ex), job)
            raise
        _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
    else:
        # With Kerberos job auth and the Kerberos bug, a warning here would
        # be printed for every DAG, so log at debug level instead.
        _LOG.debug("Can't get job state counts %s", job["Iwd"])
        total_jobs = 0

    _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
    return total_jobs, state_counts


def _htc_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given job's status.
    """
    wms_state = WmsStates.MISFIT
    if "JobStatus" in job:
        wms_state = _htc_job_status_to_wms_state(job)
    elif "NodeStatus" in job:
        wms_state = _htc_node_status_to_wms_state(job)
    return wms_state


def _htc_job_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given job's status.
    """
    _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"],
               type(job["JobStatus"]))
    job_status = int(job["JobStatus"])
    wms_state = WmsStates.MISFIT

    _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
    if job_status == JobStatus.IDLE:
        wms_state = WmsStates.PENDING
    elif job_status == JobStatus.RUNNING:
        wms_state = WmsStates.RUNNING
    elif job_status == JobStatus.REMOVED:
        wms_state = WmsStates.DELETED
    elif job_status == JobStatus.COMPLETED:
        if job.get("ExitBySignal", False) or job.get("ExitCode", 0) or \
                job.get("ExitSignal", 0) or job.get("DAG_Status", 0) or \
                job.get("ReturnValue", 0):
            wms_state = WmsStates.FAILED
        else:
            wms_state = WmsStates.SUCCEEDED
    elif job_status == JobStatus.HELD:
        wms_state = WmsStates.HELD

    return wms_state


def _htc_node_status_to_wms_state(job):
    """Convert HTCondor node status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given node's status.
    """
    wms_state = WmsStates.MISFIT

    status = job["NodeStatus"]
    if status == NodeStatus.NOT_READY:
        wms_state = WmsStates.UNREADY
    elif status == NodeStatus.READY:
        wms_state = WmsStates.READY
    elif status == NodeStatus.PRERUN:
        wms_state = WmsStates.MISFIT
    elif status == NodeStatus.SUBMITTED:
        if job["JobProcsHeld"]:
            wms_state = WmsStates.HELD
        elif job["StatusDetails"] == "not_idle":
            wms_state = WmsStates.RUNNING
        elif job["JobProcsQueued"]:
            wms_state = WmsStates.PENDING
    elif status == NodeStatus.POSTRUN:
        wms_state = WmsStates.MISFIT
    elif status == NodeStatus.DONE:
        wms_state = WmsStates.SUCCEEDED
    elif status == NodeStatus.ERROR:
        # Use the job exit status instead of the post script exit status.
        if "DAGMAN error 0" in job["StatusDetails"]:
            wms_state = WmsStates.SUCCEEDED
        else:
            wms_state = WmsStates.FAILED

    return wms_state


def _update_jobs(jobs1, jobs2):
    """Update jobs1 with info in jobs2.

    (Basically an update for nested dictionaries.)

    Parameters
    ----------
    jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
        HTCondor job information to be updated.
    jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
        Additional HTCondor job information.
    """
    for jid, jinfo in jobs2.items():
        if jid in jobs1:
            jobs1[jid].update(jinfo)
        else:
            jobs1[jid] = jinfo
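
# Illustrative only (not in the original module): per-job ads are merged
# key by key, e.g.
#
#     jobs1 = {"1.0": {"JobStatus": 2}}
#     _update_jobs(jobs1, {"1.0": {"Owner": "jdoe"}, "2.0": {"JobStatus": 1}})
#     # jobs1 == {"1.0": {"JobStatus": 2, "Owner": "jdoe"},
#     #           "2.0": {"JobStatus": 1}}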


def _wms_id_type(wms_id):
    """Determine the type of the WMS id.

    Parameters
    ----------
    wms_id : `str`
        WMS id identifying a job.

    Returns
    -------
    id_type : `lsst.ctrl.bps.wms.htcondor.WmsIdType`
        Type of WMS id.
    """
    try:
        int(float(wms_id))
    except ValueError:
        wms_path = Path(wms_id)
        if wms_path.exists():
            id_type = WmsIdType.PATH
        else:
            id_type = WmsIdType.GLOBAL
    except TypeError:
        id_type = WmsIdType.UNKNOWN
    else:
        id_type = WmsIdType.LOCAL
    return id_type
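
# Illustrative only (not in the original module): how ids are classified by
# _wms_id_type() above:
#
#     _wms_id_type("1234.0")                   -> WmsIdType.LOCAL
#     _wms_id_type("submit/run1")              -> WmsIdType.PATH  (if it exists)
#     _wms_id_type("sched1#1234.0#1600000000") -> WmsIdType.GLOBAL
#     _wms_id_type(None)                       -> WmsIdType.UNKNOWN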


def _wms_id_to_cluster(wms_id):
    """Convert WMS id to cluster id.

    Parameters
    ----------
    wms_id : `int` or `float` or `str`
        HTCondor job id or path.

    Returns
    -------
    schedd_ad : `classad.ClassAd`
        ClassAd describing the scheduler managing the job with the given id.
    cluster_id : `int`
        HTCondor cluster id.
    id_type : `lsst.ctrl.bps.wms.htcondor.WmsIdType`
        The type of the provided id.
    """
    coll = htcondor.Collector()

    schedd_ad = None
    cluster_id = None
    id_type = _wms_id_type(wms_id)
    if id_type == WmsIdType.LOCAL:
        schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
        cluster_id = int(float(wms_id))
    elif id_type == WmsIdType.GLOBAL:
        constraint = f'GlobalJobId == "{wms_id}"'
        schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)}
        schedds = [htcondor.Schedd(ad) for ad in schedd_ads.values()]
        queries = [schedd.xquery(requirements=constraint, projection=["ClusterId"]) for schedd in schedds]
        results = {query.tag(): dict(ads[0]) for query in htcondor.poll(queries)
                   if (ads := query.nextAdsNonBlocking())}
        if results:
            schedd_name = next(iter(results))
            schedd_ad = schedd_ads[schedd_name]
            cluster_id = results[schedd_name]["ClusterId"]
    elif id_type == WmsIdType.PATH:
        try:
            job_info = read_dag_info(wms_id)
        except (FileNotFoundError, PermissionError, IOError):
            pass
        else:
            schedd_name = next(iter(job_info))
            job_id = next(iter(job_info[schedd_name]))
            schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name)
            cluster_id = int(float(job_id))
    else:
        pass
    return schedd_ad, cluster_id, id_type


def _create_request_memory_expr(memory, multiplier):
    """Construct an HTCondor ClassAd expression for safe memory scaling.

    Parameters
    ----------
    memory : `int`
        Requested memory in MB.
    multiplier : `float`
        Memory growth rate between retries.

    Returns
    -------
    ad : `str`
        A string representing an HTCondor ClassAd expression enabling safe
        memory scaling between job retries.
    """
    # The 'Last*' ClassAds are UNDEFINED when a job is put in the job queue.
    # The special comparison operators (=?=) ensure that all comparisons
    # below will evaluate to FALSE in this case.
    was_mem_exceeded = "LastJobStatus =?= 5 " \
                       "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " \
                       "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"

    # If the job runs for the first time or was held for reasons other than
    # exceeding the memory, set the required memory to the requested value or
    # to the memory usage measured by HTCondor (MemoryUsage), whichever is
    # greater.
    ad = f"({was_mem_exceeded}) " \
         f"? int({memory} * pow({multiplier}, NumJobStarts)) " \
         f": max({{{memory}, MemoryUsage ?: 0}})"
    return ad
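
# Illustrative only (not in the original module): for memory=2048 and
# multiplier=2.0 the expression above renders as
#
#     (LastJobStatus =?= 5 && (LastHoldReasonCode =?= 34
#         && LastHoldReasonSubCode =?= 0 || LastHoldReasonCode =?= 3
#         && LastHoldReasonSubCode =?= 34))
#     ? int(2048 * pow(2.0, NumJobStarts)) : max({2048, MemoryUsage ?: 0})
#
# so each retry after a memory-related hold doubles the request.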


def _locate_schedds(locate_all=False):
    """Locate Scheduler daemons in an HTCondor pool.

    Parameters
    ----------
    locate_all : `bool`, optional
        If True, all available schedulers in the HTCondor pool will be
        located. False by default, which means that the search will be
        limited to looking for the Scheduler running on the local host.

    Returns
    -------
    schedds : `dict` [`str`, `htcondor.Schedd`]
        A mapping between Scheduler names and Python objects allowing for
        interacting with them.
    """
    coll = htcondor.Collector()

    schedd_ads = []
    if locate_all:
        schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
    else:
        schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
    return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}