Coverage for python/lsst/ctrl/bps/wms/htcondor/htcondor_service.py: 1%


577 statements  

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Interface between generic workflow to HTCondor workflow system. 

23""" 

24 

25__all__ = ["HTCondorService", "HTCondorWorkflow"] 

26 

27 

28import dataclasses 

29import os 

30import re 

31import logging 

32from enum import IntEnum, auto 

33from pathlib import Path 

34 

35import htcondor 

36 

37from ... import ( 

38 BaseWmsWorkflow, 

39 BaseWmsService, 

40 GenericWorkflow, 

41 GenericWorkflowJob, 

42 WmsRunReport, 

43 WmsJobReport, 

44 WmsStates 

45) 

46from ...bps_utils import ( 

47 chdir, 

48 create_count_summary 

49) 

50from .lssthtc import ( 

51 HTCDag, 

52 HTCJob, 

53 MISSING_ID, 

54 JobStatus, 

55 NodeStatus, 

56 htc_check_dagman_output, 

57 htc_escape, 

58 htc_submit_dag, 

59 read_dag_info, 

60 read_dag_log, 

61 read_dag_status, 

62 read_node_status, 

63 condor_q, 

64 condor_search, 

65 condor_status, 

66 pegasus_name_to_label, 

67 summary_from_dag, 

68) 

69 

70 

71class WmsIdType(IntEnum): 

72 """Type of valid WMS ids. 

73 """ 

74 

75 UNKNOWN = auto() 

76 """The type of id cannot be determined. 

77 """ 

78 

79 LOCAL = auto() 

80 """The id is HTCondor job's ClusterId (with optional '.ProcId'). 

81 """ 

82 

83 GLOBAL = auto() 

84 """Id is a HTCondor's global job id. 

85 """ 

86 

87 PATH = auto() 

88 """Id is a submission path. 

89 """ 

90 

91 

92DEFAULT_HTC_EXEC_PATT = ".*worker.*" 

93"""Default pattern for searching execute machines in an HTCondor pool. 

94""" 

95 

96_LOG = logging.getLogger(__name__) 

97 

98 

99class HTCondorService(BaseWmsService): 

100 """HTCondor version of WMS service. 

101 """ 

102 def prepare(self, config, generic_workflow, out_prefix=None): 

103 """Convert generic workflow to an HTCondor DAG ready for submission. 

104 

105 Parameters 

106 ---------- 

107 config : `lsst.ctrl.bps.BpsConfig` 

108 BPS configuration that includes necessary submit/runtime 

109 information. 

110 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

111 The generic workflow (e.g., has executable name and arguments). 

112 out_prefix : `str` 

113 The root directory into which all WMS-specific files are written. 

114 

115 Returns 

116 ------- 

117 workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow` 

118 HTCondor workflow ready to be run. 

119 """ 

120 _LOG.debug("out_prefix = '%s'", out_prefix) 

121 workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix, 

122 f"{self.__class__.__module__}." 

123 f"{self.__class__.__name__}") 

124 workflow.write(out_prefix) 

125 return workflow 

126 

127 def submit(self, workflow): 

128 """Submit a single HTCondor workflow. 

129 

130 Parameters 

131 ---------- 

132 workflow : `lsst.ctrl.bps.BaseWmsWorkflow` 

133 A single HTCondor workflow to submit. run_id is updated after 

134 successful submission to WMS. 

135 """ 

136 # For workflow portability, internal paths are all relative. Hence 

137 # the DAG needs to be submitted to HTCondor from inside the submit 

138 # directory. 

139 with chdir(workflow.submit_path): 

140 _LOG.info("Submitting from directory: %s", os.getcwd()) 

141 htc_submit_dag(workflow.dag, {}) 

142 workflow.run_id = workflow.dag.run_id 

143 

144 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False): 

145 """Query WMS for list of submitted WMS workflows/jobs. 

146 

147 This should be a quick lookup function to create list of jobs for 

148 other functions. 

149 

150 Parameters 

151 ---------- 

152 wms_id : `int` or `str`, optional 

153 Id or path that can be used by WMS service to look up job. 

154 user : `str`, optional 

155 User whose submitted jobs should be listed. 

156 require_bps : `bool`, optional 

157 Whether to require jobs returned in list to be bps-submitted jobs. 

158 pass_thru : `str`, optional 

159 Information to pass through to WMS. 

160 is_global : `bool`, optional 

161 If set, all job queues (and their histories) will be queried for 

162 job information. Defaults to False which means that only the local 

163 job queue will be queried. 

164 

165 Returns 

166 ------- 

167 job_ids : `list` [`Any`] 

168 Only job ids to be used by cancel and other functions. Typically 

169 this means top-level jobs (i.e., not children jobs). 

170 """ 

171 _LOG.debug("list_submitted_jobs params: " 

172 "wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s", 

173 wms_id, user, require_bps, pass_thru, is_global) 

174 

175 # Determine which Schedds will be queried for job information. 

176 coll = htcondor.Collector() 

177 

178 schedd_ads = [] 

179 if is_global: 

180 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

181 else: 

182 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

183 

184 # Construct appropriate constraint expression using provided arguments. 

185 constraint = "False" 

186 if wms_id is None: 

187 if user is not None: 

188 constraint = f'(Owner == "{user}")' 

189 else: 

190 schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id) 

191 if cluster_id is not None: 

192 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})" 

193 

194 # If provided id is either a submission path or a global id, 

195 # make sure the right Schedd will be queried regardless of 

196 # 'is_global' value. 

197 if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}: 

198 schedd_ads = [schedd_ad] 

199 if require_bps: 

200 constraint += ' && (bps_isjob == "True")' 

201 if pass_thru: 

202 if "-forcex" in pass_thru: 

203 pass_thru_2 = pass_thru.replace("-forcex", "") 

204 if pass_thru_2 and not pass_thru_2.isspace(): 

205 constraint += f" && ({pass_thru_2})" 

206 else: 

207 constraint += f" && ({pass_thru})" 

208 

209 # Create a list of scheduler daemons which need to be queried. 

210 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

211 

212 _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds)) 

213 results = condor_q(constraint=constraint, schedds=schedds) 

214 

215 # Prune child jobs where DAG job is in queue (i.e., aren't orphans). 
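# (A job ad that has a DAGManJobId but whose parent DAG job is not among the
# query results is treated as an orphan and is kept in the list.)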

216 job_ids = [] 

217 for schedd_name, job_info in results.items(): 

218 for job_id, job_ad in job_info.items(): 

219 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None")) 

220 if "DAGManJobId" not in job_ad: 

221 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

222 else: 

223 _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0") 

224 _LOG.debug("\tin jobs.keys() = %s", job_info.keys()) 

225 if f"{job_ad['DAGManJobId']}.0" not in job_info: # orphaned job 

226 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

227 

228 _LOG.debug("job_ids = %s", job_ids) 

229 return job_ids 

230 

231 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False): 

232 """Return run information based upon given constraints. 

233 

234 Parameters 

235 ---------- 

236 wms_workflow_id : `str`, optional 

237 Limit to specific run based on id. 

238 user : `str`, optional 

239 Limit results to runs for this user. 

240 hist : `float`, optional 

241 Limit history search to this many days. Defaults to 0. 

242 pass_thru : `str`, optional 

243 Constraints to pass through to HTCondor. 

244 is_global : `bool`, optional 

245 If set, all job queues (and their histories) will be queried for 

246 job information. Defaults to False which means that only the local 

247 job queue will be queried. 

248 

249 Returns 

250 ------- 

251 runs : `list` [`lsst.ctrl.bps.WmsRunReport`] 

252 Information about runs from given job information. 

253 message : `str` 

254 Extra message for report command to print. This could be pointers 

255 to documentation or to WMS specific commands. 

256 """ 

257 if wms_workflow_id: 

258 id_type = _wms_id_type(wms_workflow_id) 

259 if id_type == WmsIdType.LOCAL: 

260 schedulers = _locate_schedds(locate_all=is_global) 

261 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

262 elif id_type == WmsIdType.GLOBAL: 

263 schedulers = _locate_schedds(locate_all=True) 

264 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

265 elif id_type == WmsIdType.PATH: 

266 run_reports, message = _report_from_path(wms_workflow_id) 

267 else: 

268 run_reports, message = {}, 'Invalid job id' 

269 else: 

270 schedulers = _locate_schedds(locate_all=is_global) 

271 run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers) 

272 _LOG.debug("report: %s, %s", run_reports, message) 

273 

274 return list(run_reports.values()), message 

275 

276 def cancel(self, wms_id, pass_thru=None): 

277 """Cancel submitted workflows/jobs. 

278 

279 Parameters 

280 ---------- 

281 wms_id : `str` 

282 Id or path of job that should be canceled. 

283 pass_thru : `str`, optional 

284 Information to pass through to WMS. 

285 

286 Returns 

287 ------- 

288 deleted : `bool` 

289 Whether successful deletion or not. Currently, if any doubt or any 

290 individual jobs not deleted, return False. 

291 message : `str` 

292 Any message from WMS (e.g., error details). 

293 """ 

294 _LOG.debug("Canceling wms_id = %s", wms_id) 

295 

296 schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id) 

297 

298 if cluster_id is None: 

299 deleted = False 

300 message = "invalid id" 

301 else: 

302 _LOG.debug("Canceling job managed by schedd_name = %s with cluster_id = %s", 

303 cluster_id, schedd_ad["Name"]) 

304 schedd = htcondor.Schedd(schedd_ad) 

305 

306 constraint = f"ClusterId == {cluster_id}" 

307 if pass_thru is not None and "-forcex" in pass_thru: 

308 pass_thru_2 = pass_thru.replace("-forcex", "") 

309 if pass_thru_2 and not pass_thru_2.isspace(): 

310 constraint += f"&& ({pass_thru_2})" 

311 _LOG.debug("JobAction.RemoveX constraint = %s", constraint) 

312 results = schedd.act(htcondor.JobAction.RemoveX, constraint) 

313 else: 

314 if pass_thru: 

315 constraint += f"&& ({pass_thru})" 

316 _LOG.debug("JobAction.Remove constraint = %s", constraint) 

317 results = schedd.act(htcondor.JobAction.Remove, constraint) 

318 _LOG.debug("Remove results: %s", results) 

319 

320 if results["TotalSuccess"] > 0 and results["TotalError"] == 0: 

321 deleted = True 

322 message = "" 

323 else: 

324 deleted = False 

325 if results["TotalSuccess"] == 0 and results["TotalError"] == 0: 

326 message = "no such bps job in batch queue" 

327 else: 

328 message = f"unknown problems deleting: {results}" 

329 

330 _LOG.debug("deleted: %s; message = %s", deleted, message) 

331 return deleted, message 

332 

333 

334class HTCondorWorkflow(BaseWmsWorkflow): 

335 """Single HTCondor workflow. 

336 

337 Parameters 

338 ---------- 

339 name : `str` 

340 Unique name for Workflow used when naming files. 

341 config : `lsst.ctrl.bps.BpsConfig` 

342 BPS configuration that includes necessary submit/runtime information. 

343 """ 

344 def __init__(self, name, config=None): 

345 super().__init__(name, config) 

346 self.dag = None 

347 

348 @classmethod 

349 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class): 

350 # Docstring inherited 

351 htc_workflow = cls(generic_workflow.name, config) 

352 htc_workflow.dag = HTCDag(name=generic_workflow.name) 

353 

354 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs) 

355 htc_workflow.dag.add_attribs(generic_workflow.run_attrs) 

356 htc_workflow.dag.add_attribs({"bps_wms_service": service_class, 

357 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}", 

358 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts), 

359 "bps_job_summary": create_count_summary(generic_workflow.job_counts)}) 

360 

361 # Determine the hard limit for the memory requirement. 

362 found, limit = config.search('memoryLimit') 

363 if not found: 

364 search_opts = {"default": DEFAULT_HTC_EXEC_PATT} 

365 _, site = config.search("computeSite") 

366 if site: 

367 search_opts["curvals"] = {"curr_site": site} 

368 _, patt = config.search("executeMachinesPattern", opt=search_opts) 

369 

370 # To reduce the amount of data, ignore dynamic slots (if any) as, 

371 # by definition, they cannot have more memory than 

372 # the partitionable slot they are the part of. 

373 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)' 

374 pool_info = condor_status(constraint=constraint) 

375 try: 

376 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values()) 

377 except ValueError: 

378 _LOG.debug("No execute machine in the pool matches %s", patt) 

379 if limit: 

380 config[".bps_defined.memory_limit"] = limit 

381 

382 # Create all DAG jobs 

383 for job_name in generic_workflow: 

384 gwjob = generic_workflow.get_job(job_name) 

385 htc_job = HTCondorWorkflow._create_job(config, generic_workflow, gwjob, out_prefix) 

386 htc_workflow.dag.add_job(htc_job) 

387 

388 # Add job dependencies to the DAG 

389 for job_name in generic_workflow: 

390 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name)) 

391 

392 # If final job exists in generic workflow, create DAG final job 

393 final = generic_workflow.get_final() 

394 if final and isinstance(final, GenericWorkflowJob): 

395 final_htjob = HTCondorWorkflow._create_job(config, generic_workflow, final, out_prefix) 

396 if "post" not in final_htjob.dagcmds: 

397 final_htjob.dagcmds["post"] = f"{os.path.dirname(__file__)}/final_post.sh" \ 

398 f" {final.name} $DAG_STATUS $RETURN" 

399 htc_workflow.dag.add_final_job(final_htjob) 

400 elif final and isinstance(final, GenericWorkflow): 

401 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job") 

402 elif final: 

403 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})") 

404 

405 return htc_workflow 

406 

407 @staticmethod 

408 def _create_job(config, generic_workflow, gwjob, out_prefix): 

409 """Convert GenericWorkflow job nodes to DAG jobs. 

410 

411 Parameters 

412 ---------- 

413 config : `lsst.ctrl.bps.BpsConfig` 

414 BPS configuration that includes necessary submit/runtime 

415 information. 

416 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

417 Generic workflow that is being converted. 

418 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

419 The generic job to convert to a HTCondor job. 

420 out_prefix : `str` 

421 Directory prefix for HTCondor files. 

422 

423 Returns 

424 ------- 

425 htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob` 

426 The HTCondor job equivalent to the given generic job. 

427 """ 

428 htc_job = HTCJob(gwjob.name, label=gwjob.label) 

429 

430 curvals = dataclasses.asdict(gwjob) 

431 if gwjob.tags: 

432 curvals.update(gwjob.tags) 

433 found, subdir = config.search("subDirTemplate", opt={'curvals': curvals}) 

434 if not found: 

435 subdir = "jobs" 

436 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub" 

437 

438 htc_job_cmds = { 

439 "universe": "vanilla", 

440 "should_transfer_files": "YES", 

441 "when_to_transfer_output": "ON_EXIT_OR_EVICT", 

442 "transfer_output_files": '""', # Set to empty string to disable 

443 "transfer_executable": "False", 

444 "getenv": "True", 

445 

446 # Exceeding memory sometimes triggers a SIGBUS error. Tell HTCondor 

447 # to put SIGBUS jobs on hold. 

448 "on_exit_hold": "(ExitBySignal == true) && (ExitSignal == 7)", 

449 "on_exit_hold_reason": '"Job raised a signal 7. Usually means job has gone over memory limit."', 

450 "on_exit_hold_subcode": "34" 

451 } 

452 

453 htc_job_cmds.update(_translate_job_cmds(config, generic_workflow, gwjob)) 

454 

455 # job stdout, stderr, htcondor user log. 

456 for key in ("output", "error", "log"): 

457 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}") 

458 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key]) 

459 

460 _, use_shared = config.search("bpsUseShared", opt={"default": False}) 

461 htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwjob.name, use_shared, out_prefix)) 

462 

463 # Add the job cmds dict to the job object. 

464 htc_job.add_job_cmds(htc_job_cmds) 

465 

466 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob)) 

467 

468 # Add job attributes to job. 

469 _LOG.debug("gwjob.attrs = %s", gwjob.attrs) 

470 htc_job.add_job_attrs(gwjob.attrs) 

471 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)}) 

472 htc_job.add_job_attrs({"bps_job_name": gwjob.name, 

473 "bps_job_label": gwjob.label}) 

474 

475 return htc_job 

476 

477 def write(self, out_prefix): 

478 """Output HTCondor DAGMan files needed for workflow submission. 

479 

480 Parameters 

481 ---------- 

482 out_prefix : `str` 

483 Directory prefix for HTCondor files. 

484 """ 

485 self.submit_path = out_prefix 

486 os.makedirs(out_prefix, exist_ok=True) 

487 

488 # Write down the workflow in HTCondor format. 

489 self.dag.write(out_prefix, "jobs/{self.label}") 

490 

491 

492def _translate_job_cmds(config, generic_workflow, gwjob): 

493 """Translate the job data that are one to one mapping 

494 

495 Parameters 

496 ---------- 

497 config : `lsst.ctrl.bps.BpsConfig` 

498 BPS configuration that includes necessary submit/runtime 

499 information. 

500 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

501 Generic workflow that contains the job being converted. 

502 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

503 Generic workflow job to be converted. 

504 

505 Returns 

506 ------- 

507 htc_job_commands : `dict` [`str`, `Any`] 

508 Contains commands which can appear in the HTCondor submit description 

509 file. 

510 """ 

511 # Values in the job script that are just name mappings. 

512 job_translation = {"mail_to": "notify_user", 

513 "when_to_mail": "notification", 

514 "request_cpus": "request_cpus", 

515 "priority": "priority", 

516 "category": "category"} 

517 

518 jobcmds = {} 

519 for gwkey, htckey in job_translation.items(): 

520 jobcmds[htckey] = getattr(gwjob, gwkey, None) 

521 

522 # job commands that need modification 

523 if gwjob.number_of_retries: 

524 jobcmds["max_retries"] = f"{gwjob.number_of_retries}" 

525 

526 if gwjob.retry_unless_exit: 

527 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}" 

528 

529 if gwjob.request_disk: 

530 jobcmds["request_disk"] = f"{gwjob.request_disk}MB" 

531 

532 if gwjob.request_memory: 

533 jobcmds["request_memory"] = f"{gwjob.request_memory}" 

534 

535 if gwjob.memory_multiplier: 

536 # Do not use try-except! At the moment, BpsConfig returns an empty 

537 # string if it does not contain the key. 

538 memory_limit = config[".bps_defined.memory_limit"] 

539 if not memory_limit: 

540 raise RuntimeError("Memory autoscaling enabled, but automatic detection of the memory limit " 

541 "failed; setting it explicitly with 'memoryLimit' or changing worker node " 

542 "search pattern 'executeMachinesPattern' might help.") 

543 

544 # Set maximal amount of memory job can ask for. 

545 # 

546 # The check below assumes that 'memory_limit' was set to a value which 

547 # realistically reflects actual physical limitations of a given compute 

548 # resource. 

549 memory_max = memory_limit 

550 if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit: 

551 memory_max = gwjob.request_memory_max 

552 

553 # Make job ask for more memory each time it failed due to insufficient 

554 # memory requirements. 

555 jobcmds["request_memory"] = \ 

556 _create_request_memory_expr(gwjob.request_memory, gwjob.memory_multiplier, memory_max) 

557 

558 # Periodically release jobs which are being held due to exceeding 

559 # memory. Stop doing that (by removing the job from the HTCondor queue) 

560 # after the maximal number of retries has been reached or the job was 

561 # already run at maximal allowed memory. 

562 jobcmds["periodic_release"] = \ 

563 _create_periodic_release_expr(gwjob.request_memory, gwjob.memory_multiplier, memory_max) 

564 jobcmds["periodic_remove"] = \ 

565 _create_periodic_remove_expr(gwjob.request_memory, gwjob.memory_multiplier, memory_max) 

566 

567 # Assume concurrency_limit implemented using HTCondor concurrency limits. 

568 # May need to move to special site-specific implementation if sites use 

569 # other mechanisms. 

570 if gwjob.concurrency_limit: 

571 jobcmds["concurrency_limit"] = gwjob.concurrency_limit 

572 

573 # Handle command line 

574 if gwjob.executable.transfer_executable: 

575 jobcmds["transfer_executable"] = "True" 

576 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri) 

577 else: 

578 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri) 

579 

580 if gwjob.arguments: 

581 arguments = gwjob.arguments 

582 arguments = _replace_cmd_vars(arguments, gwjob) 

583 arguments = _replace_file_vars(config, arguments, generic_workflow, gwjob) 

584 arguments = _fix_env_var_syntax(arguments) 

585 jobcmds["arguments"] = arguments 

586 

587 # Add extra "pass-thru" job commands 

588 if gwjob.profile: 

589 for key, val in gwjob.profile.items(): 

590 jobcmds[key] = htc_escape(val) 

591 

592 return jobcmds 

593 

594 

595def _translate_dag_cmds(gwjob): 

596 """Translate job values into DAGMan commands. 

597 

598 Parameters 

599 ---------- 

600 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

601 Job containing values to be translated. 

602 

603 Returns 

604 ------- 

605 dagcmds : `dict` [`str`, `Any`] 

606 DAGMan commands for the job. 

607 """ 

608 # Values in the dag script that are just name mappings. 

609 dag_translation = {"abort_on_value": "abort_dag_on", 

610 "abort_return_value": "abort_exit"} 

611 

612 dagcmds = {} 

613 for gwkey, htckey in dag_translation.items(): 

614 dagcmds[htckey] = getattr(gwjob, gwkey, None) 

615 

616 # Still to be coded: vars "pre_cmdline", "post_cmdline" 

617 return dagcmds 

618 

619 

620def _fix_env_var_syntax(oldstr): 

621 """Change ENV place holders to HTCondor Env var syntax. 

622 

623 Parameters 

624 ---------- 

625 oldstr : `str` 

626 String in which environment variable syntax is to be fixed. 

627 

628 Returns 

629 ------- 

630 newstr : `str` 

631 Given string with environment variable syntax fixed. 

632 """ 

633 newstr = oldstr 

634 for key in re.findall(r"<ENV:([^>]+)>", oldstr): 

635 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})") 

636 return newstr 

637 

638 

639def _replace_file_vars(config, arguments, workflow, gwjob): 

640 """Replace file placeholders in command line arguments with correct 

641 physical file names. 

642 

643 Parameters 

644 ---------- 

645 config : `lsst.ctrl.bps.BpsConfig` 

646 BPS configuration that includes necessary submit/runtime 

647 information. 

648 arguments : `str` 

649 Arguments string in which to replace file placeholders. 

650 workflow : `lsst.ctrl.bps.GenericWorkflow` 

651 Generic workflow that contains file information. 

652 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

653 The job corresponding to the arguments. 

654 

655 Returns 

656 ------- 

657 arguments : `str` 

658 Given arguments string with file placeholders replaced. 

659 """ 

660 _, use_shared = config.search("bpsUseShared", opt={"default": False}) 

661 

662 # Replace input file placeholders with paths. 

663 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False): 

664 if not gwfile.wms_transfer: 

665 # Must assume full URI if in command line and told WMS is not 

666 # responsible for transferring file. 

667 uri = gwfile.src_uri 

668 elif use_shared: 

669 if gwfile.job_shared: 

670 # Have shared filesystems and jobs can share file. 

671 uri = gwfile.src_uri 

672 else: 

673 # Taking advantage of inside knowledge. Not future-proof. 

674 # Temporary fix until there is a job wrapper that pulls files 

675 # within the job. 

676 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml": 

677 uri = "butler.yaml" 

678 else: 

679 uri = os.path.basename(gwfile.src_uri) 

680 else: # Using push transfer 

681 uri = os.path.basename(gwfile.src_uri) 

682 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

683 

684 # Replace output file placeholders with paths. 

685 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False): 

686 if not gwfile.wms_transfer: 

687 # Must assume full URI if in command line and told WMS is not 

688 # responsible for transferring file. 

689 uri = gwfile.src_uri 

690 elif use_shared: 

691 if gwfile.job_shared: 

692 # Have shared filesystems and jobs can share file. 

693 uri = gwfile.src_uri 

694 else: 

695 uri = os.path.basename(gwfile.src_uri) 

696 else: # Using push transfer 

697 uri = os.path.basename(gwfile.src_uri) 

698 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

699 return arguments 

700 

701 

702def _replace_cmd_vars(arguments, gwjob): 

703 """Replace format-style placeholders in arguments. 

704 

705 Parameters 

706 ---------- 

707 arguments : `str` 

708 Arguments string in which to replace placeholders. 

709 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

710 Job containing values to be used to replace placeholders 

711 (in particular gwjob.cmdvals). 

712 

713 Returns 

714 ------- 

715 arguments : `str` 

716 Given arguments string with placeholders replaced. 

717 """ 

718 try: 

719 arguments = arguments.format(**gwjob.cmdvals) 

720 except (KeyError, TypeError): # TypeError in case None instead of {} 

721 _LOG.error("Could not replace command variables:\n" 

722 "arguments: %s\n" 

723 "cmdvals: %s", arguments, gwjob.cmdvals) 

724 raise 

725 return arguments 

726 

727 

728def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str): 

729 """Add job input files from generic workflow to job. 

730 

731 Parameters 

732 ---------- 

733 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

734 The generic workflow (e.g., has executable name and arguments). 

735 job_name : `str` 

736 Unique name for the job. 

737 use_shared : `bool` 

738 Whether job has access to files via shared filesystem. 

739 out_prefix : `str` 

740 The root directory into which all WMS-specific files are written. 

741 

742 Returns 

743 ------- 

744 htc_commands : `dict` [`str`, `str`] 

745 HTCondor commands for the job submission script. 

746 """ 

747 htc_commands = {} 

748 inputs = [] 

749 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True): 

750 _LOG.debug("src_uri=%s", gwf_file.src_uri) 

751 

752 uri = Path(gwf_file.src_uri) 

753 

754 # Note if use_shared and job_shared, don't need to transfer file. 

755 

756 if not use_shared: # Copy file using push to job 

757 inputs.append(str(uri.relative_to(out_prefix))) 

758 elif not gwf_file.job_shared: # Jobs require own copy 

759 

760 # If using a shared filesystem but the job still needs its own copy, 

761 # use HTCondor's curl plugin to make a local copy. 

762 

763 # Execution butler is represented as a directory which the 

764 # curl plugin does not handle. Taking advantage of inside 

765 # knowledge as a temporary fix until there is a job wrapper that 

766 # pulls files within the job. 

767 if gwf_file.name == "butlerConfig": 

768 # The execution butler directory doesn't normally exist until 

769 # the submit phase so checking for suffix instead of using 

770 # is_dir(). If other non-yaml files exist, they would have a 

771 # different gwf_file.name. 

772 if uri.suffix == ".yaml": # Single file, so just copy. 

773 inputs.append(f"file://{uri}") 

774 else: 

775 inputs.append(f"file://{uri / 'butler.yaml'}") 

776 inputs.append(f"file://{uri / 'gen3.sqlite3'}") 

777 elif uri.is_dir(): 

778 raise RuntimeError("HTCondor plugin cannot transfer directories locally within job " 

779 f"({gwf_file.src_uri})") 

780 else: 

781 inputs.append(f"file://{uri}") 

782 

783 if inputs: 

784 htc_commands["transfer_input_files"] = ",".join(inputs) 

785 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"]) 

786 return htc_commands 

787 

788 

789def _report_from_path(wms_path): 

790 """Gather run information from a given run directory. 

791 

792 Parameters 

793 ---------- 

794 wms_path : `str` 

795 The directory containing the submit side files (e.g., HTCondor files). 

796 

797 Returns 

798 ------- 

799 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

800 Run information for the detailed report. The key is the HTCondor id 

801 and the value is a collection of report information for that run. 

802 message : `str` 

803 Message to be printed with the summary report. 

804 """ 

805 wms_workflow_id, jobs, message = _get_info_from_path(wms_path) 

806 if wms_workflow_id == MISSING_ID: 

807 run_reports = {} 

808 else: 

809 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs) 

810 return run_reports, message 

811 

812 

813def _report_from_id(wms_workflow_id, hist, schedds=None): 

814 """Gather run information using workflow id. 

815 

816 Parameters 

817 ---------- 

818 wms_workflow_id : `str` 

819 Limit to specific run based on id. 

820 hist : `float` 

821 Limit history search to this many days. 

822 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional 

823 HTCondor schedulers to query for job information. If None 

824 (default), all queries will be run against the local scheduler only. 

825 

826 Returns 

827 ------- 

828 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

829 Run information for the detailed report. The key is the HTCondor id 

830 and the value is a collection of report information for that run. 

831 message : `str` 

832 Message to be printed with the summary report. 

833 """ 

834 dag_constraint = 'regexp("dagman$", Cmd)' 

835 try: 

836 cluster_id = int(float(wms_workflow_id)) 

837 except ValueError: 

838 dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"' 

839 else: 

840 dag_constraint += f" && ClusterId == {cluster_id}" 

841 

842 # With the current implementation of the condor_* functions the query will 

843 # always return only one match per Scheduler. 

844 # 

845 # Even in the highly unlikely situation where HTCondor history (which 

846 # condor_search queries too) is long enough to have jobs from before the 

847 # cluster ids were rolled over (and as a result there is more than one job 

848 # with the same cluster id) they will not show up in the results. 

849 schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds) 

850 if len(schedd_dag_info) == 0: 

851 run_reports = {} 

852 message = "" 

853 elif len(schedd_dag_info) == 1: 

854 _, dag_info = schedd_dag_info.popitem() 

855 dag_id, dag_ad = dag_info.popitem() 

856 

857 # Create a mapping between jobs and their classads. The keys will be 

858 # of format 'ClusterId.ProcId'. 

859 job_info = {dag_id: dag_ad} 

860 

861 # Find jobs (nodes) belonging to that DAGMan job. 

862 job_constraint = f"DAGManJobId == {int(float(dag_id))}" 

863 schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds) 

864 _, node_info = schedd_job_info.popitem() 

865 job_info.update(node_info) 

866 

867 # Collect additional pieces of information about jobs using HTCondor 

868 # files in the submission directory. 

869 _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"]) 

870 _update_jobs(job_info, path_jobs) 

871 

872 run_reports = _create_detailed_report_from_jobs(dag_id, job_info) 

873 message = "" 

874 else: 

875 ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()] 

876 run_reports = {} 

877 message = f"More than one job matches id '{wms_workflow_id}', " \ 

878 f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids" 

879 return run_reports, message 

880 

881 

882def _get_info_from_path(wms_path): 

883 """Gather run information from a given run directory. 

884 

885 Parameters 

886 ---------- 

887 wms_path : `str` 

888 Directory containing HTCondor files. 

889 

890 Returns 

891 ------- 

892 wms_workflow_id : `str` 

893 The run id which is a DAGman job id. 

894 jobs : `dict` [`str`, `dict` [`str`, `Any`]] 

895 Information about jobs read from files in the given directory. 

896 The key is the HTCondor id and the value is a dictionary of HTCondor 

897 keys and values. 

898 message : `str` 

899 Message to be printed with the summary report. 

900 """ 

901 messages = [] 

902 try: 

903 wms_workflow_id, jobs = read_dag_log(wms_path) 

904 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs) 

905 _update_jobs(jobs, read_node_status(wms_path)) 

906 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs) 

907 

908 # Add more info for DAGman job 

909 job = jobs[wms_workflow_id] 

910 job.update(read_dag_status(wms_path)) 

911 

912 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs) 

913 if "bps_run" not in job: 

914 _add_run_info(wms_path, job) 

915 

916 message = htc_check_dagman_output(wms_path) 

917 if message: 

918 messages.append(message) 

919 _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id, 

920 jobs[wms_workflow_id]["total_jobs"]) 

921 

922 # Add extra pieces of information which cannot be found in HTCondor 

923 # generated files like 'GlobalJobId'. 

924 # 

925 # Do not treat absence of this file as a serious error. Neither runs 

926 # submitted with earlier versions of the plugin nor runs submitted with 

927 # the Pegasus plugin will have it at the moment. However, once the 

928 # Pegasus plugin has its own report() method (instead of using 

929 # HTCondor's), the lack of that file should be treated as seriously 

930 # as the lack of any other file. 

931 try: 

932 job_info = read_dag_info(wms_path) 

933 except FileNotFoundError as exc: 

934 message = f"Warn: Some information may not be available: {exc}" 

935 messages.append(message) 

936 else: 

937 schedd_name = next(iter(job_info)) 

938 job_ad = next(iter(job_info[schedd_name].values())) 

939 job.update(job_ad) 

940 except FileNotFoundError: 

941 message = f"Could not find HTCondor files in '{wms_path}'" 

942 _LOG.warning(message) 

943 messages.append(message) 

944 wms_workflow_id = MISSING_ID 

945 jobs = {} 

946 

947 message = '\n'.join([msg for msg in messages if msg]) 

948 return wms_workflow_id, jobs, message 

949 

950 

951def _create_detailed_report_from_jobs(wms_workflow_id, jobs): 

952 """Gather run information to be used in generating summary reports. 

953 

954 Parameters 

955 ---------- 

956 wms_workflow_id : `str` 

957 The run id to create the report for. 

958 jobs : `dict` [`str`, `dict` [`str`, Any]] 

959 Mapping HTCondor job id to job information. 

960 

961 Returns 

962 ------- 

963 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

964 Run information for the detailed report. The key is the given HTCondor 

965 id and the value is a collection of report information for that run. 

966 """ 

967 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id]) 

968 dag_job = jobs[wms_workflow_id] 

969 report = WmsRunReport(wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}", 

970 global_wms_id=dag_job.get("GlobalJobId", "MISS"), 

971 path=dag_job["Iwd"], 

972 label=dag_job.get("bps_job_label", "MISS"), 

973 run=dag_job.get("bps_run", "MISS"), 

974 project=dag_job.get("bps_project", "MISS"), 

975 campaign=dag_job.get("bps_campaign", "MISS"), 

976 payload=dag_job.get("bps_payload", "MISS"), 

977 operator=_get_owner(dag_job), 

978 run_summary=_get_run_summary(dag_job), 

979 state=_htc_status_to_wms_state(dag_job), 

980 jobs=[], 

981 total_number_jobs=dag_job["total_jobs"], 

982 job_state_counts=dag_job["state_counts"]) 

983 

984 for job_id, job_info in jobs.items(): 

985 try: 

986 if job_info["ClusterId"] != int(float(wms_workflow_id)): 

987 job_report = WmsJobReport(wms_id=job_id, 

988 name=job_info.get("DAGNodeName", job_id), 

989 label=job_info.get("bps_job_label", 

990 pegasus_name_to_label(job_info["DAGNodeName"])), 

991 state=_htc_status_to_wms_state(job_info)) 

992 if job_report.label == "init": 

993 job_report.label = "pipetaskInit" 

994 report.jobs.append(job_report) 

995 except KeyError as ex: 

996 _LOG.error("Job missing key '%s': %s", str(ex), job_info) 

997 raise 

998 

999 run_reports = {report.wms_id: report} 

1000 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports) 

1001 return run_reports 

1002 

1003 

1004def _summary_report(user, hist, pass_thru, schedds=None): 

1005 """Gather run information to be used in generating summary reports. 

1006 

1007 Parameters 

1008 ---------- 

1009 user : `str` 

1010 Run lookup restricted to given user. 

1011 hist : `float` 

1012 How many previous days to search for run information. 

1013 pass_thru : `str` 

1014 Advanced users can define the HTCondor constraint to be used 

1015 when searching queue and history. 

schedds : `dict` [ `str`, `htcondor.Schedd` ], optional 

HTCondor schedulers to query for job information. If None 

(default), all queries will be run against the local scheduler only. 

1016 

1017 Returns 

1018 ------- 

1019 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1020 Run information for the summary report. The keys are HTCondor ids and 

1021 the values are collections of report information for each run. 

1022 message : `str` 

1023 Message to be printed with the summary report. 

1024 """ 

1025 # Only doing the summary report, so only look for DAGMan jobs. 

1026 if pass_thru: 

1027 constraint = pass_thru 

1028 else: 

1029 # Notes: 

1030 # * bps_isjob == 'True' isn't getting set for DAG jobs that are 

1031 # manually restarted. 

1032 # * Any job with DAGManJobID isn't a DAG job 

1033 constraint = 'bps_isjob == "True" && JobUniverse == 7' 

1034 if user: 

1035 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")' 

1036 

1037 job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds) 

1038 

1039 # Have list of DAGMan jobs, need to get run_report info. 

1040 run_reports = {} 

1041 for jobs in job_info.values(): 

1042 for job_id, job in jobs.items(): 

1043 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1044 # If didn't get from queue information (e.g., Kerberos bug), 

1045 # try reading from file. 

1046 if total_jobs == 0: 

1047 try: 

1048 job.update(read_dag_status(job["Iwd"])) 

1049 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1050 except StopIteration: 

1051 pass # Do not fail the report if the HTCondor files cannot be found. 

1052 

1053 if "bps_run" not in job: 

1054 _add_run_info(job["Iwd"], job) 

1055 report = WmsRunReport(wms_id=job_id, 

1056 global_wms_id=job["GlobalJobId"], 

1057 path=job["Iwd"], 

1058 label=job.get("bps_job_label", "MISS"), 

1059 run=job.get("bps_run", "MISS"), 

1060 project=job.get("bps_project", "MISS"), 

1061 campaign=job.get("bps_campaign", "MISS"), 

1062 payload=job.get("bps_payload", "MISS"), 

1063 operator=_get_owner(job), 

1064 run_summary=_get_run_summary(job), 

1065 state=_htc_status_to_wms_state(job), 

1066 jobs=[], 

1067 total_number_jobs=total_jobs, 

1068 job_state_counts=state_counts) 

1069 run_reports[report.global_wms_id] = report 

1070 

1071 return run_reports, "" 

1072 

1073 

1074def _add_run_info(wms_path, job): 

1075 """Find BPS run information elsewhere for runs without bps attributes. 

1076 

1077 Parameters 

1078 ---------- 

1079 wms_path : `str` 

1080 Path to submit files for the run. 

1081 job : `dict` [`str`, `Any`] 

1082 HTCondor dag job information. 

1083 

1084 Raises 

1085 ------ 

1086 StopIteration 

1087 If the file it is looking for cannot be found. Permission errors are 

1088 caught and the job's run is marked with an error. 

1089 """ 

1090 path = Path(wms_path) / "jobs" 

1091 try: 

1092 subfile = next(path.glob("**/*.sub")) 

1093 except (StopIteration, PermissionError): 

1094 job["bps_run"] = "Unavailable" 

1095 else: 

1096 _LOG.debug("_add_run_info: subfile = %s", subfile) 

1097 try: 

1098 with open(subfile, "r") as fh: 

1099 for line in fh: 

1100 if line.startswith("+bps_"): 

1101 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line) 

1102 if m: 

1103 _LOG.debug("Matching line: %s", line) 

1104 job[m.group(1)] = m.group(2).replace('"', "") 

1105 else: 

1106 _LOG.debug("Could not parse attribute: %s", line) 

1107 except PermissionError: 

1108 job["bps_run"] = "PermissionError" 

1109 _LOG.debug("After adding job = %s", job) 

1110 

1111 

1112def _get_owner(job): 

1113 """Get the owner of a dag job. 

1114 

1115 Parameters 

1116 ---------- 

1117 job : `dict` [`str`, `Any`] 

1118 HTCondor dag job information. 

1119 

1120 Returns 

1121 ------- 

1122 owner : `str` 

1123 Owner of the dag job. 

1124 """ 

1125 owner = job.get("bps_operator", None) 

1126 if not owner: 

1127 owner = job.get("Owner", None) 

1128 if not owner: 

1129 _LOG.warning("Could not get Owner from htcondor job: %s", job) 

1130 owner = "MISS" 

1131 return owner 

1132 

1133 

1134def _get_run_summary(job): 

1135 """Get the run summary for a job. 

1136 

1137 Parameters 

1138 ---------- 

1139 job : `dict` [`str`, `Any`] 

1140 HTCondor dag job information. 

1141 

1142 Returns 

1143 ------- 

1144 summary : `str` 

1145 Number of jobs per PipelineTask label in approximate pipeline order. 

1146 Format: <label>:<count>[;<label>:<count>]+ 

1147 """ 

1148 summary = job.get("bps_job_summary", job.get("bps_run_summary", None)) 

1149 if not summary: 

1150 summary, _ = summary_from_dag(job["Iwd"]) 

1151 if not summary: 

1152 _LOG.warning("Could not get run summary for htcondor job: %s", job) 

1153 _LOG.debug("_get_run_summary: summary=%s", summary) 

1154 

1155 # Workaround: sometimes "init" is used instead of "pipetaskInit". 

1156 summary = summary.replace("init:", "pipetaskInit:") 

1157 

1158 if "pegasus_version" in job and "pegasus" not in summary: 

1159 summary += ";pegasus:0" 

1160 

1161 return summary 

1162 

1163 

1164def _get_state_counts_from_jobs(wms_workflow_id, jobs): 

1165 """Count number of jobs per WMS state. 

1166 

1167 Parameters 

1168 ---------- 

1169 wms_workflow_id : `str` 

1170 HTCondor job id. 

1171 jobs : `dict` [`str`, `Any`] 

1172 HTCondor dag job information. 

1173 

1174 Returns 

1175 ------- 

1176 total_count : `int` 

1177 Total number of dag nodes. 

1178 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1179 Keys are the different WMS states and values are counts of jobs 

1180 that are in that WMS state. 

1181 """ 

1182 state_counts = dict.fromkeys(WmsStates, 0) 

1183 

1184 for jid, jinfo in jobs.items(): 

1185 if jid != wms_workflow_id: 

1186 state_counts[_htc_status_to_wms_state(jinfo)] += 1 

1187 

1188 total_counted = sum(state_counts.values()) 

1189 if "NodesTotal" in jobs[wms_workflow_id]: 

1190 total_count = jobs[wms_workflow_id]["NodesTotal"] 

1191 else: 

1192 total_count = total_counted 

1193 

1194 state_counts[WmsStates.UNREADY] += total_count - total_counted 

1195 

1196 return total_count, state_counts 

1197 

1198 

1199def _get_state_counts_from_dag_job(job): 

1200 """Count number of jobs per WMS state. 

1201 

1202 Parameters 

1203 ---------- 

1204 job : `dict` [`str`, `Any`] 

1205 HTCondor dag job information. 

1206 

1207 Returns 

1208 ------- 

1209 total_count : `int` 

1210 Total number of dag nodes. 

1211 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1212 Keys are the different WMS states and values are counts of jobs 

1213 that are in that WMS state. 

1214 """ 

1215 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job)) 

1216 state_counts = dict.fromkeys(WmsStates, 0) 

1217 if "DAG_NodesReady" in job: 

1218 state_counts = { 

1219 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0), 

1220 WmsStates.READY: job.get("DAG_NodesReady", 0), 

1221 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1222 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0), 

1223 WmsStates.FAILED: job.get("DAG_NodesFailed", 0), 

1224 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)} 

1225 total_jobs = job.get("DAG_NodesTotal") 

1226 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs) 

1227 elif "NodesFailed" in job: 

1228 state_counts = { 

1229 WmsStates.UNREADY: job.get("NodesUnready", 0), 

1230 WmsStates.READY: job.get("NodesReady", 0), 

1231 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1232 WmsStates.SUCCEEDED: job.get("NodesDone", 0), 

1233 WmsStates.FAILED: job.get("NodesFailed", 0), 

1234 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)} 

1235 try: 

1236 total_jobs = job.get("NodesTotal") 

1237 except KeyError as ex: 

1238 _LOG.error("Job missing %s. job = %s", str(ex), job) 

1239 raise 

1240 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs) 

1241 else: 

1242 # With Kerberos job auth and the Kerberos bug, if this were a warning it 

1243 # would be printed for every DAG. 

1244 _LOG.debug("Can't get job state counts %s", job["Iwd"]) 

1245 total_jobs = 0 

1246 

1247 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts) 

1248 return total_jobs, state_counts 

1249 

1250 

1251def _htc_status_to_wms_state(job): 

1252 """Convert HTCondor job status to generic wms state. 

1253 

1254 Parameters 

1255 ---------- 

1256 job : `dict` [`str`, `Any`] 

1257 HTCondor job information. 

1258 

1259 Returns 

1260 ------- 

1261 wms_state : `WmsStates` 

1262 The equivalent WmsState to given job's status. 

1263 """ 

1264 wms_state = WmsStates.MISFIT 

1265 if "JobStatus" in job: 

1266 wms_state = _htc_job_status_to_wms_state(job) 

1267 elif "NodeStatus" in job: 

1268 wms_state = _htc_node_status_to_wms_state(job) 

1269 return wms_state 

1270 

1271 

1272def _htc_job_status_to_wms_state(job): 

1273 """Convert HTCondor job status to generic wms state. 

1274 

1275 Parameters 

1276 ---------- 

1277 job : `dict` [`str`, `Any`] 

1278 HTCondor job information. 

1279 

1280 Returns 

1281 ------- 

1282 wms_state : `lsst.ctrl.bps.WmsStates` 

1283 The equivalent WmsState to given job's status. 

1284 """ 

1285 _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], 

1286 type(job["JobStatus"])) 

1287 job_status = int(job["JobStatus"]) 

1288 wms_state = WmsStates.MISFIT 

1289 

1290 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status) 

1291 if job_status == JobStatus.IDLE: 

1292 wms_state = WmsStates.PENDING 

1293 elif job_status == JobStatus.RUNNING: 

1294 wms_state = WmsStates.RUNNING 

1295 elif job_status == JobStatus.REMOVED: 

1296 wms_state = WmsStates.DELETED 

1297 elif job_status == JobStatus.COMPLETED: 

1298 if job.get("ExitBySignal", False) or job.get("ExitCode", 0) or \ 

1299 job.get("ExitSignal", 0) or job.get("DAG_Status", 0) or \ 

1300 job.get("ReturnValue", 0): 

1301 wms_state = WmsStates.FAILED 

1302 else: 

1303 wms_state = WmsStates.SUCCEEDED 

1304 elif job_status == JobStatus.HELD: 

1305 wms_state = WmsStates.HELD 

1306 

1307 return wms_state 

1308 

1309 

1310def _htc_node_status_to_wms_state(job): 

1311 """Convert HTCondor status to generic wms state. 

1312 

1313 Parameters 

1314 ---------- 

1315 job : `dict` [`str`, `Any`] 

1316 HTCondor job information. 

1317 

1318 Returns 

1319 ------- 

1320 wms_state : `lsst.ctrl.bps.WmsStates` 

1321 The equivalent WmsState to given node's status. 

1322 """ 

1323 wms_state = WmsStates.MISFIT 

1324 

1325 status = job["NodeStatus"] 

1326 if status == NodeStatus.NOT_READY: 

1327 wms_state = WmsStates.UNREADY 

1328 elif status == NodeStatus.READY: 

1329 wms_state = WmsStates.READY 

1330 elif status == NodeStatus.PRERUN: 

1331 wms_state = WmsStates.MISFIT 

1332 elif status == NodeStatus.SUBMITTED: 

1333 if job["JobProcsHeld"]: 

1334 wms_state = WmsStates.HELD 

1335 elif job["StatusDetails"] == "not_idle": 

1336 wms_state = WmsStates.RUNNING 

1337 elif job["JobProcsQueued"]: 

1338 wms_state = WmsStates.PENDING 

1339 elif status == NodeStatus.POSTRUN: 

1340 wms_state = WmsStates.MISFIT 

1341 elif status == NodeStatus.DONE: 

1342 wms_state = WmsStates.SUCCEEDED 

1343 elif status == NodeStatus.ERROR: 

1344 # Use job exit status instead of post script exit status. 

1345 if "DAGMAN error 0" in job["StatusDetails"]: 

1346 wms_state = WmsStates.SUCCEEDED 

1347 else: 

1348 wms_state = WmsStates.FAILED 

1349 

1350 return wms_state 

1351 

1352 

1353def _update_jobs(jobs1, jobs2): 

1354 """Update jobs1 with info in jobs2. 

1355 

1356 (Basically an update for nested dictionaries.) 

1357 

1358 Parameters 

1359 ---------- 

1360 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]] 

1361 HTCondor job information to be updated. 

1362 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]] 

1363 Additional HTCondor job information. 

1364 """ 

1365 for jid, jinfo in jobs2.items(): 

1366 if jid in jobs1: 

1367 jobs1[jid].update(jinfo) 

1368 else: 

1369 jobs1[jid] = jinfo 

1370 

1371 

1372def _wms_id_type(wms_id): 

1373 """Determine the type of the WMS id. 

1374 

1375 Parameters 

1376 ---------- 

1377 wms_id : `str` 

1378 WMS id identifying a job. 

1379 

1380 Returns 

1381 ------- 

1382 id_type : `lsst.ctrl.bps.wms.htcondor.WmsIdType` 

1383 Type of WMS id. 

1384 """ 

1385 try: 

1386 int(float(wms_id)) 

1387 except ValueError: 

1388 wms_path = Path(wms_id) 

1389 if wms_path.exists(): 

1390 id_type = WmsIdType.PATH 

1391 else: 

1392 id_type = WmsIdType.GLOBAL 

1393 except TypeError: 

1394 id_type = WmsIdType.UNKNOWN 

1395 else: 

1396 id_type = WmsIdType.LOCAL 

1397 return id_type 

1398 

1399 

1400def _wms_id_to_cluster(wms_id): 

1401 """Convert WMS id to cluster id. 

1402 

1403 Parameters 

1404 ---------- 

1405 wms_id : `int` or `float` or `str` 

1406 HTCondor job id or path. 

1407 

1408 Returns 

1409 ------- 

1410 schedd_ad : `classad.ClassAd` 

1411 ClassAd describing the scheduler managing the job with the given id. 

1412 cluster_id : `int` 

1413 HTCondor cluster id. 

1414 id_type : `lsst.ctrl.bps.wms.htcondor.WmsIdType` 

1415 The type of the provided id. 

1416 """ 

1417 coll = htcondor.Collector() 

1418 

1419 schedd_ad = None 

1420 cluster_id = None 

1421 id_type = _wms_id_type(wms_id) 

1422 if id_type == WmsIdType.LOCAL: 

1423 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1424 cluster_id = int(float(wms_id)) 

1425 elif id_type == WmsIdType.GLOBAL: 

1426 constraint = f'GlobalJobId == "{wms_id}"' 

1427 schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)} 

1428 schedds = [htcondor.Schedd(ad) for ad in schedd_ads.values()] 

1429 queries = [schedd.xquery(requirements=constraint, projection=["ClusterId"]) for schedd in schedds] 

1430 results = {query.tag(): dict(ads[0]) for query in htcondor.poll(queries) 

1431 if (ads := query.nextAdsNonBlocking())} 

1432 if results: 

1433 schedd_name = next(iter(results)) 

1434 schedd_ad = schedd_ads[schedd_name] 

1435 cluster_id = results[schedd_name]["ClusterId"] 

1436 elif id_type == WmsIdType.PATH: 

1437 try: 

1438 job_info = read_dag_info(wms_id) 

1439 except (FileNotFoundError, PermissionError, IOError): 

1440 pass 

1441 else: 

1442 schedd_name = next(iter(job_info)) 

1443 job_id = next(iter(job_info[schedd_name])) 

1444 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name) 

1445 cluster_id = int(float(job_id)) 

1446 else: 

1447 pass 

1448 return schedd_ad, cluster_id, id_type 

1449 

1450 

1451def _create_periodic_release_expr(memory, multiplier, limit): 

1452 """Construct an HTCondorAd expression for releasing held jobs. 

1453 

1454 The expression instructs HTCondor to release any job that was put on hold 

1455 due to exceeding memory requirements back to the job queue, provided it 

1456 satisfies all of the conditions below: 

1457 

1458 * the number of run attempts did not reach the allowable number of retries, 

1459 * the memory requirements in the last failed run attempt did not reach 

1460 the specified memory limit. 

1461 

1462 Parameters 

1463 ---------- 

1464 memory : `int` 

1465 Requested memory in MB. 

1466 multiplier : `float` 

1467 Memory growth rate between retries. 

1468 limit : `int` 

1469 Memory limit. 

1470 

1471 Returns 

1472 ------- 

1473 expr : `str` 

1474 A string representing an HTCondor ClassAd expression for releasing jobs 

1475 which have been held due to exceeding the memory requirements. 

1476 """ 

1477 is_retry_allowed = "NumJobStarts <= JobMaxRetries" 

1478 was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}" 

1479 

1480 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1481 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1482 # The special comparison operators ensure that all comparisons below will 

1483 # evaluate to FALSE in this case. 

1484 # 

1485 # Note: 

1486 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1487 # the entire expression should evaluate to FALSE when the job is not HELD. 

1488 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1489 # but better safe than sorry. 

1490 was_mem_exceeded = "JobStatus == 5 " \ 

1491 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " \ 

1492 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1493 

1494 expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}" 

1495 return expr 

1496 

1497 

1498def _create_periodic_remove_expr(memory, multiplier, limit): 

1499 """Construct an HTCondorAd expression for removing jobs from the queue. 

1500 

1501 The expression instruct HTCondor to remove any job which was put on hold 

1502 due to exceeding memory requirements from the job queue providing it 

1503 satisfies any of the conditions below: 

1504 

1505 * allowable number of retries was reached, 

1506 * the memory requirements during the last failed run attempt reached 

1507 the specified memory limit. 

1508 

1509 Parameters 

1510 ---------- 

1511 memory : `int` 

1512 Requested memory in MB. 

1513 multiplier : `float` 

1514 Memory growth rate between retries. 

1515 limit : `int` 

1516 Memory limit. 

1517 

1518 Returns 

1519 ------- 

1520 expr : `str` 

1521 A string representing an HTCondor ClassAd expression for removing jobs 

1522 which were run at the maximal allowable memory and still exceeded 

1523 the memory requirements. 

1524 """ 

1525 is_retry_disallowed = "NumJobStarts > JobMaxRetries" 

1526 was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}" 

1527 

1528 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1529 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1530 # The special comparison operators ensure that all comparisons below will 

1531 # evaluate to FALSE in this case. 

1532 # 

1533 # Note: 

1534 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1535 # the entire expression should evaluate to FALSE when the job is not HELD. 

1536 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1537 # but better safe than sorry. 

1538 was_mem_exceeded = "JobStatus == 5 " \ 

1539 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " \ 

1540 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1541 

1542 expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})" 

1543 return expr 

1544 

1545 

1546def _create_request_memory_expr(memory, multiplier, limit): 

1547 """Construct an HTCondor ClassAd expression for safe memory scaling. 

1548 

1549 Parameters 

1550 ---------- 

1551 memory : `int` 

1552 Requested memory in MB. 

1553 multiplier : `float` 

1554 Memory growth rate between retries. 

1555 limit : `int` 

1556 Memory limit. 

1557 

1558 Returns 

1559 ------- 

1560 expr : `str` 

1561 A string representing an HTCondor ClassAd expression enabling safe 

1562 memory scaling between job retries. 

1563 """ 

1564 # The check if the job was held due to exceeding memory requirements 

1565 # will be made *after* job was released back to the job queue (is in 

1566 # the IDLE state), hence the need to use `Last*` job ClassAds instead of 

1567 # the ones describing job's current state. 

1568 # 

1569 # Also, 'Last*' job ClassAds attributes are UNDEFINED when a job is 

1570 # initially put in the job queue. The special comparison operators ensure 

1571 # that all comparisons below will evaluate to FALSE in this case. 

1572 was_mem_exceeded = "LastJobStatus =?= 5 " \ 

1573 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " \ 

1574 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)" 

1575 

1576 # If job runs the first time or was held for reasons other than exceeding 

1577 # the memory, set the required memory to the requested value or use 

1578 # the memory value measured by HTCondor (MemoryUsage) depending on 

1579 # whichever is greater. 

1580 expr = f"({was_mem_exceeded}) " \ 

1581 f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) " \ 

1582 f": max({{{memory}, MemoryUsage ?: 0}})" 

1583 return expr 

1584 

1585 

1586def _locate_schedds(locate_all=False): 

1587 """Find out Scheduler daemons in an HTCondor pool. 

1588 

1589 Parameters 

1590 ---------- 

1591 locate_all : `bool`, optional 

1592 If True, all available schedulers in the HTCondor pool will be located. 

1593 False by default, which means that the search will be limited to 

1594 the Scheduler running on the local host. 

1595 

1596 Returns 

1597 ------- 

1598 schedds : `dict` [`str`, `htcondor.Schedd`] 

1599 A mapping between Scheduler names and Python objects allowing for 

1600 interacting with them. 

1601 """ 

1602 coll = htcondor.Collector() 

1603 

1604 schedd_ads = [] 

1605 if locate_all: 

1606 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

1607 else: 

1608 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

1609 return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}