Coverage for python/lsst/ctrl/bps/wms/htcondor/htcondor_service.py: 1%

659 statements  

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Interface between generic workflow to HTCondor workflow system. 

23""" 

24 

25__all__ = ["HTCondorService", "HTCondorWorkflow"] 

26 

27 

28import logging 

29import os 

30import re 

31from collections import defaultdict 

32from enum import IntEnum, auto 

33from pathlib import Path 

34 

35import htcondor 

36from lsst.utils.timer import time_this 

37from packaging import version 

38 

39from ... import ( 

40 BaseWmsService, 

41 BaseWmsWorkflow, 

42 GenericWorkflow, 

43 GenericWorkflowJob, 

44 WmsJobReport, 

45 WmsRunReport, 

46 WmsStates, 

47) 

48from ...bps_utils import chdir, create_count_summary 

49from .lssthtc import ( 

50 MISSING_ID, 

51 HTCDag, 

52 HTCJob, 

53 JobStatus, 

54 NodeStatus, 

55 condor_q, 

56 condor_search, 

57 condor_status, 

58 htc_backup_files, 

59 htc_check_dagman_output, 

60 htc_create_submit_from_cmd, 

61 htc_create_submit_from_dag, 

62 htc_create_submit_from_file, 

63 htc_escape, 

64 htc_submit_dag, 

65 htc_version, 

66 pegasus_name_to_label, 

67 read_dag_info, 

68 read_dag_log, 

69 read_dag_status, 

70 read_node_status, 

71 summary_from_dag, 

72 write_dag_info, 

73) 

74 

75 

76class WmsIdType(IntEnum): 

77 """Type of valid WMS ids.""" 

78 

79 UNKNOWN = auto() 

80 """The type of id cannot be determined. 

81 """ 

82 

83 LOCAL = auto() 

84 """The id is HTCondor job's ClusterId (with optional '.ProcId'). 

85 """ 

86 

87 GLOBAL = auto() 

88 """Id is a HTCondor's global job id. 

89 """ 

90 

91 PATH = auto() 

92 """Id is a submission path. 

93 """ 

94 

95 

96DEFAULT_HTC_EXEC_PATT = ".*worker.*" 

97"""Default pattern for searching execute machines in an HTCondor pool. 

98""" 

99 

100_LOG = logging.getLogger(__name__) 

101 

102 

103class HTCondorService(BaseWmsService): 

104 """HTCondor version of WMS service.""" 

105 

106 def prepare(self, config, generic_workflow, out_prefix=None): 

107 """Convert generic workflow to an HTCondor DAG ready for submission. 

108 

109 Parameters 

110 ---------- 

111 config : `lsst.ctrl.bps.BpsConfig` 

112 BPS configuration that includes necessary submit/runtime 

113 information. 

114 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

115 The generic workflow (e.g., has executable name and arguments). 

116 out_prefix : `str` 

117 The root directory into which all WMS-specific files are written. 

118 

119 Returns 

120 ------- 

121 workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow` 

122 HTCondor workflow ready to be run. 

123 """ 

124 _LOG.debug("out_prefix = '%s'", out_prefix) 

125 with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed HTCondor workflow creation"): 

126 workflow = HTCondorWorkflow.from_generic_workflow( 

127 config, 

128 generic_workflow, 

129 out_prefix, 

130 f"{self.__class__.__module__}." f"{self.__class__.__name__}", 

131 ) 

132 

133 with time_this( 

134 log=_LOG, level=logging.INFO, prefix=None, msg="Completed writing out HTCondor workflow" 

135 ): 

136 workflow.write(out_prefix) 

137 return workflow 

138 

139 def submit(self, workflow): 

140 """Submit a single HTCondor workflow. 

141 

142 Parameters 

143 ---------- 

144 workflow : `lsst.ctrl.bps.BaseWorkflow` 

145 A single HTCondor workflow to submit. run_id is updated after 

146 successful submission to WMS. 

147 """ 

148 dag = workflow.dag 

149 

150 ver = version.parse(htc_version()) 

151 if ver >= version.parse("8.9.3"): 

152 sub = htc_create_submit_from_dag(dag.graph["dag_filename"], {}) 

153 else: 

154 sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], {}) 

155 

156 # For workflow portability, internal paths are all relative. Hence 

157 # the DAG needs to be submitted to HTCondor from inside the submit 

158 # directory. 

159 with chdir(workflow.submit_path): 

160 _LOG.info("Submitting from directory: %s", os.getcwd()) 

161 schedd_dag_info = htc_submit_dag(sub) 

162 if schedd_dag_info: 

163 write_dag_info(f"{dag.name}.info.json", schedd_dag_info) 

164 

165 _, dag_info = schedd_dag_info.popitem() 

166 _, dag_ad = dag_info.popitem() 

167 

168 dag.run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}" 

169 workflow.run_id = dag.run_id 

170 else: 

171 raise RuntimeError("Submission failed: unable to retrieve DAGMan job information") 
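
# Illustrative sketch (not part of the module): the typical call sequence for
# this service. It assumes an already-populated BpsConfig and GenericWorkflow
# and that BaseWmsService accepts the config in its constructor; the submit
# path below is made up.
def _example_service_usage(config, generic_workflow):  # pragma: no cover
    service = HTCondorService(config)
    workflow = service.prepare(config, generic_workflow, out_prefix="/submit/u/me/run1")
    service.submit(workflow)  # sets workflow.run_id (e.g. "1234.0") on success
    return workflow.run_id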

172 

173 def restart(self, wms_workflow_id): 

174 """Restart a failed DAGMan workflow. 

175 

176 Parameters 

177 ---------- 

178 wms_workflow_id : `str` 

179 The directory with HTCondor files. 

180 

181 Returns 

182 ------- 

183 run_id : `str` 

184 HTCondor id of the restarted DAGMan job. If restart failed, it will 

185 be set to None. 

186 run_name : `str` 

187 Name of the restarted workflow. If restart failed, it will be set 

188 to None. 

189 message : `str` 

190 A message describing any issues encountered during the restart. 

191 If there were no issues, an empty string is returned. 

192 """ 

193 wms_path = Path(wms_workflow_id) 

194 if not wms_path.is_dir(): 

195 return None, None, f"Directory '{wms_path}' not found" 

196 

197 _LOG.info("Restarting workflow from directory '%s'", wms_path) 

198 rescue_dags = list(wms_path.glob("*.dag.rescue*")) 

199 if not rescue_dags: 

200 return None, None, f"HTCondor rescue DAG(s) not found in '{wms_path}'" 

201 

202 _LOG.info("Verifying that the workflow is not already in the job queue") 

203 schedd_dag_info = condor_q(constraint=f'regexp("dagman$", Cmd) && Iwd == "{wms_workflow_id}"') 

204 if schedd_dag_info: 

205 _, dag_info = schedd_dag_info.popitem() 

206 _, dag_ad = dag_info.popitem() 

207 id_ = dag_ad["GlobalJobId"] 

208 return None, None, f"Workflow already in the job queue (global job id: '{id_}')" 

209 

210 _LOG.info("Checking execution status of the workflow") 

211 warn = False 

212 dag_ad = read_dag_status(str(wms_path)) 

213 if dag_ad: 

214 nodes_total = dag_ad.get("NodesTotal", 0) 

215 if nodes_total != 0: 

216 nodes_done = dag_ad.get("NodesDone", 0) 

217 if nodes_total == nodes_done: 

218 return None, None, "All jobs in the workflow finished successfully" 

219 else: 

220 warn = True 

221 else: 

222 warn = True 

223 if warn: 

224 _LOG.warning( 

225 "Cannot determine the execution status of the workflow, " "continuing with restart regardless" 

226 ) 

227 

228 _LOG.info("Backing up select HTCondor files from previous run attempt") 

229 htc_backup_files(wms_path, subdir="backups") 

230 

231 # For workflow portability, internal paths are all relative. Hence 

232 # the DAG needs to be resubmitted to HTCondor from inside the submit 

233 # directory. 

234 _LOG.info("Adding workflow to the job queue") 

235 run_id, run_name, message = None, None, "" 

236 with chdir(wms_path): 

237 try: 

238 dag_path = next(wms_path.glob("*.dag.condor.sub")) 

239 except StopIteration: 

240 message = f"DAGMan submit description file not found in '{wms_path}'" 

241 else: 

242 sub = htc_create_submit_from_file(dag_path.name) 

243 schedd_dag_info = htc_submit_dag(sub) 

244 

245 # Save select information about the DAGMan job to a file. Use 

246 # the run name (available in the ClassAd) as the filename. 

247 if schedd_dag_info: 

248 dag_info = next(iter(schedd_dag_info.values())) 

249 dag_ad = next(iter(dag_info.values())) 

250 write_dag_info(f"{dag_ad['bps_run']}.info.json", schedd_dag_info) 

251 run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}" 

252 run_name = dag_ad["bps_run"] 

253 else: 

254 message = "DAGMan job information unavailable" 

255 

256 return run_id, run_name, message 

257 

258 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False): 

259 """Query WMS for list of submitted WMS workflows/jobs. 

260 

261 This should be a quick lookup function to create list of jobs for 

262 other functions. 

263 

264 Parameters 

265 ---------- 

266 wms_id : `int` or `str`, optional 

267 Id or path that can be used by WMS service to look up job. 

268 user : `str`, optional 

269 User whose submitted jobs should be listed. 

270 require_bps : `bool`, optional 

271 Whether to require jobs returned in list to be bps-submitted jobs. 

272 pass_thru : `str`, optional 

273 Information to pass through to WMS. 

274 is_global : `bool`, optional 

275 If set, all job queues (and their histories) will be queried for 

276 job information. Defaults to False which means that only the local 

277 job queue will be queried. 

278 

279 Returns 

280 ------- 

281 job_ids : `list` [`Any`] 

282 Only job ids to be used by cancel and other functions. Typically 

283 this means top-level jobs (i.e., not children jobs). 

284 """ 

285 _LOG.debug( 

286 "list_submitted_jobs params: " "wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s", 

287 wms_id, 

288 user, 

289 require_bps, 

290 pass_thru, 

291 is_global, 

292 ) 

293 

294 # Determine which Schedds will be queried for job information. 

295 coll = htcondor.Collector() 

296 

297 schedd_ads = [] 

298 if is_global: 

299 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

300 else: 

301 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

302 

303 # Construct appropriate constraint expression using provided arguments. 

304 constraint = "False" 

305 if wms_id is None: 

306 if user is not None: 

307 constraint = f'(Owner == "{user}")' 

308 else: 

309 schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id) 

310 if cluster_id is not None: 

311 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})" 

312 

313 # If provided id is either a submission path or a global id, 

314 # make sure the right Schedd will be queried regardless of 

315 # 'is_global' value. 

316 if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}: 

317 schedd_ads = [schedd_ad] 

318 if require_bps: 

319 constraint += ' && (bps_isjob == "True")' 

320 if pass_thru: 

321 if "-forcex" in pass_thru: 

322 pass_thru_2 = pass_thru.replace("-forcex", "") 

323 if pass_thru_2 and not pass_thru_2.isspace(): 

324 constraint += f" && ({pass_thru_2})" 

325 else: 

326 constraint += f" && ({pass_thru})" 

327 

328 # Create a list of scheduler daemons which need to be queried. 

329 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

330 

331 _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds)) 

332 results = condor_q(constraint=constraint, schedds=schedds) 

333 

334 # Prune child jobs where DAG job is in queue (i.e., aren't orphans). 

335 job_ids = [] 

336 for schedd_name, job_info in results.items(): 

337 for job_id, job_ad in job_info.items(): 

338 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None")) 

339 if "DAGManJobId" not in job_ad: 

340 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

341 else: 

342 _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0") 

343 _LOG.debug("\tin jobs.keys() = %s", job_info.keys()) 

344 if f"{job_ad['DAGManJobId']}.0" not in job_info: # orphaned job 

345 job_ids.append(job_ad.get("GlobalJobId", job_id)) 

346 

347 _LOG.debug("job_ids = %s", job_ids) 

348 return job_ids 
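
# Illustrative sketch (not part of the module): the kind of constraint
# expression list_submitted_jobs() assembles for a local cluster id with
# require_bps=True; the id value is made up.
def _example_list_constraint():  # pragma: no cover
    cluster_id = 1234
    constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"
    constraint += ' && (bps_isjob == "True")'
    assert constraint == '(DAGManJobId == 1234 || ClusterId == 1234) && (bps_isjob == "True")'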

349 

350 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False): 

351 """Return run information based upon given constraints. 

352 

353 Parameters 

354 ---------- 

355 wms_workflow_id : `str`, optional 

356 Limit to specific run based on id. 

357 user : `str`, optional 

358 Limit results to runs for this user. 

359 hist : `float`, optional 

360 Limit history search to this many days. Defaults to 0. 

361 pass_thru : `str`, optional 

362 Constraints to pass through to HTCondor. 

363 is_global : `bool`, optional 

364 If set, all job queues (and their histories) will be queried for 

365 job information. Defaults to False which means that only the local 

366 job queue will be queried. 

367 

368 Returns 

369 ------- 

370 runs : `list` [`lsst.ctrl.bps.WmsRunReport`] 

371 Information about runs from given job information. 

372 message : `str` 

373 Extra message for report command to print. This could be pointers 

374 to documentation or to WMS specific commands. 

375 """ 

376 if wms_workflow_id: 

377 id_type = _wms_id_type(wms_workflow_id) 

378 if id_type == WmsIdType.LOCAL: 

379 schedulers = _locate_schedds(locate_all=is_global) 

380 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

381 elif id_type == WmsIdType.GLOBAL: 

382 schedulers = _locate_schedds(locate_all=True) 

383 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers) 

384 elif id_type == WmsIdType.PATH: 

385 run_reports, message = _report_from_path(wms_workflow_id) 

386 else: 

387 run_reports, message = {}, "Invalid job id" 

388 else: 

389 schedulers = _locate_schedds(locate_all=is_global) 

390 run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers) 

391 _LOG.debug("report: %s, %s", run_reports, message) 

392 

393 return list(run_reports.values()), message 

394 

395 def cancel(self, wms_id, pass_thru=None): 

396 """Cancel submitted workflows/jobs. 

397 

398 Parameters 

399 ---------- 

400 wms_id : `str` 

401 Id or path of job that should be canceled. 

402 pass_thru : `str`, optional 

403 Information to pass through to WMS. 

404 

405 Returns 

406 ------- 

407 deleted : `bool` 

408 Whether successful deletion or not. Currently, if any doubt or any 

409 individual jobs not deleted, return False. 

410 message : `str` 

411 Any message from WMS (e.g., error details). 

412 """ 

413 _LOG.debug("Canceling wms_id = %s", wms_id) 

414 

415 schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id) 

416 

417 if cluster_id is None: 

418 deleted = False 

419 message = "invalid id" 

420 else: 

421 _LOG.debug( 

422 "Canceling job managed by schedd_name = %s with cluster_id = %s", 

423 schedd_ad["Name"], 

424 cluster_id, 

425 ) 

426 schedd = htcondor.Schedd(schedd_ad) 

427 

428 constraint = f"ClusterId == {cluster_id}" 

429 if pass_thru is not None and "-forcex" in pass_thru: 

430 pass_thru_2 = pass_thru.replace("-forcex", "") 

431 if pass_thru_2 and not pass_thru_2.isspace(): 

432 constraint += f"&& ({pass_thru_2})" 

433 _LOG.debug("JobAction.RemoveX constraint = %s", constraint) 

434 results = schedd.act(htcondor.JobAction.RemoveX, constraint) 

435 else: 

436 if pass_thru: 

437 constraint += f"&& ({pass_thru})" 

438 _LOG.debug("JobAction.Remove constraint = %s", constraint) 

439 results = schedd.act(htcondor.JobAction.Remove, constraint) 

440 _LOG.debug("Remove results: %s", results) 

441 

442 if results["TotalSuccess"] > 0 and results["TotalError"] == 0: 

443 deleted = True 

444 message = "" 

445 else: 

446 deleted = False 

447 if results["TotalSuccess"] == 0 and results["TotalError"] == 0: 

448 message = "no such bps job in batch queue" 

449 else: 

450 message = f"unknown problems deleting: {results}" 

451 

452 _LOG.debug("deleted: %s; message = %s", deleted, message) 

453 return deleted, message 

454 

455 

456class HTCondorWorkflow(BaseWmsWorkflow): 

457 """Single HTCondor workflow. 

458 

459 Parameters 

460 ---------- 

461 name : `str` 

462 Unique name for Workflow used when naming files. 

463 config : `lsst.ctrl.bps.BpsConfig` 

464 BPS configuration that includes necessary submit/runtime information. 

465 """ 

466 

467 def __init__(self, name, config=None): 

468 super().__init__(name, config) 

469 self.dag = None 

470 

471 @classmethod 

472 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class): 

473 # Docstring inherited 

474 htc_workflow = cls(generic_workflow.name, config) 

475 htc_workflow.dag = HTCDag(name=generic_workflow.name) 

476 

477 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs) 

478 htc_workflow.dag.add_attribs(generic_workflow.run_attrs) 

479 htc_workflow.dag.add_attribs( 

480 { 

481 "bps_wms_service": service_class, 

482 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}", 

483 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts), 

484 "bps_job_summary": create_count_summary(generic_workflow.job_counts), 

485 } 

486 ) 

487 

488 _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""}) 

489 if isinstance(tmp_template, str): 

490 subdir_template = defaultdict(lambda: tmp_template) 

491 else: 

492 subdir_template = tmp_template 

493 

494 # Create all DAG jobs 

495 site_values = {} # cache compute site specific values to reduce config lookups 

496 for job_name in generic_workflow: 

497 gwjob = generic_workflow.get_job(job_name) 

498 if gwjob.compute_site not in site_values: 

499 site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site) 

500 htc_job = _create_job( 

501 subdir_template[gwjob.label], 

502 site_values[gwjob.compute_site], 

503 generic_workflow, 

504 gwjob, 

505 out_prefix, 

506 ) 

507 htc_workflow.dag.add_job(htc_job) 

508 

509 # Add job dependencies to the DAG 

510 for job_name in generic_workflow: 

511 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name)) 

512 

513 # If final job exists in generic workflow, create DAG final job 

514 final = generic_workflow.get_final() 

515 if final and isinstance(final, GenericWorkflowJob): 

516 if final.compute_site and final.compute_site not in site_values: 

517 site_values[final.compute_site] = _gather_site_values(config, final.compute_site) 

518 final_htjob = _create_job( 

519 subdir_template[final.label], 

520 site_values[final.compute_site], 

521 generic_workflow, 

522 final, 

523 out_prefix, 

524 ) 

525 if "post" not in final_htjob.dagcmds: 

526 final_htjob.dagcmds["post"] = ( 

527 f"{os.path.dirname(__file__)}/final_post.sh" f" {final.name} $DAG_STATUS $RETURN" 

528 ) 

529 htc_workflow.dag.add_final_job(final_htjob) 

530 elif final and isinstance(final, GenericWorkflow): 

531 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job") 

532 elif final: 

533 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})") 

534 

535 return htc_workflow 

536 

537 def write(self, out_prefix): 

538 """Output HTCondor DAGMan files needed for workflow submission. 

539 

540 Parameters 

541 ---------- 

542 out_prefix : `str` 

543 Directory prefix for HTCondor files. 

544 """ 

545 self.submit_path = out_prefix 

546 os.makedirs(out_prefix, exist_ok=True) 

547 

548 # Write down the workflow in HTCondor format. 

549 self.dag.write(out_prefix, "jobs/{self.label}") 

550 

551 

552def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix): 

553 """Convert GenericWorkflow job nodes to DAG jobs. 

554 

555 Parameters 

556 ---------- 

557 subdir_template : `str` 

558 Template for making subdirs. 

559 site_values : `dict` 

560 Site specific values 

561 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

562 Generic workflow that is being converted. 

563 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

564 The generic job to convert to a HTCondor job. 

565 out_prefix : `str` 

566 Directory prefix for HTCondor files. 

567 

568 Returns 

569 ------- 

570 htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob` 

571 The HTCondor job equivalent to the given generic job. 

572 """ 

573 htc_job = HTCJob(gwjob.name, label=gwjob.label) 

574 

575 curvals = defaultdict(str) 

576 curvals["label"] = gwjob.label 

577 if gwjob.tags: 

578 curvals.update(gwjob.tags) 

579 

580 subdir = subdir_template.format_map(curvals) 

581 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub" 

582 

583 htc_job_cmds = { 

584 "universe": "vanilla", 

585 "should_transfer_files": "YES", 

586 "when_to_transfer_output": "ON_EXIT_OR_EVICT", 

587 "transfer_output_files": '""', # Set to empty string to disable 

588 "transfer_executable": "False", 

589 "getenv": "True", 

590 # Exceeding memory sometimes triggers a SIGBUS or SIGSEGV error. Tell 

591 # HTCondor to put on hold any job which exited by a signal. 

592 "on_exit_hold": "ExitBySignal == true", 

593 "on_exit_hold_reason": 'strcat("Job raised a signal ", string(ExitSignal), ". ", ' 

594 '"Handling signal as if job has gone over memory limit.")', 

595 "on_exit_hold_subcode": "34", 

596 } 

597 

598 htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob)) 

599 

600 # job stdout, stderr, htcondor user log. 

601 for key in ("output", "error", "log"): 

602 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}") 

603 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key]) 

604 

605 htc_job_cmds.update( 

606 _handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], out_prefix) 

607 ) 

608 

609 # Add the job cmds dict to the job object. 

610 htc_job.add_job_cmds(htc_job_cmds) 

611 

612 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob)) 

613 

614 # Add job attributes to job. 

615 _LOG.debug("gwjob.attrs = %s", gwjob.attrs) 

616 htc_job.add_job_attrs(gwjob.attrs) 

617 htc_job.add_job_attrs(site_values["attrs"]) 

618 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)}) 

619 htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label}) 

620 

621 return htc_job 

622 

623 

624def _translate_job_cmds(cached_vals, generic_workflow, gwjob): 

625 """Translate the job data that are one to one mapping 

626 

627 Parameters 

628 ---------- 

629 cached_vals : `dict` [`str`, `Any`] 

630 Config values common to jobs with same label. 

631 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

632 Generic workflow that contains the job being converted. 

633 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

634 Generic workflow job to be converted. 

635 

636 Returns 

637 ------- 

638 htc_job_commands : `dict` [`str`, `Any`] 

639 Contains commands which can appear in the HTCondor submit description 

640 file. 

641 """ 

642 # Values in the job script that are just name mappings. 

643 job_translation = { 

644 "mail_to": "notify_user", 

645 "when_to_mail": "notification", 

646 "request_cpus": "request_cpus", 

647 "priority": "priority", 

648 "category": "category", 

649 } 

650 

651 jobcmds = {} 

652 for gwkey, htckey in job_translation.items(): 

653 jobcmds[htckey] = getattr(gwjob, gwkey, None) 

654 

655 # job commands that need modification 

656 if gwjob.number_of_retries: 

657 jobcmds["max_retries"] = f"{gwjob.number_of_retries}" 

658 

659 if gwjob.retry_unless_exit: 

660 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}" 

661 

662 if gwjob.request_disk: 

663 jobcmds["request_disk"] = f"{gwjob.request_disk}MB" 

664 

665 if gwjob.request_memory: 

666 jobcmds["request_memory"] = f"{gwjob.request_memory}" 

667 

668 if gwjob.memory_multiplier: 

669 # Do not use try-except! At the moment, BpsConfig returns an empty 

670 # string if it does not contain the key. 

671 memory_limit = cached_vals["memoryLimit"] 

672 if not memory_limit: 

673 raise RuntimeError( 

674 "Memory autoscaling enabled, but automatic detection of the memory limit " 

675 "failed; setting it explicitly with 'memoryLimit' or changing worker node " 

676 "search pattern 'executeMachinesPattern' might help." 

677 ) 

678 

679 # Set maximal amount of memory job can ask for. 

680 # 

681 # The check below assumes that 'memory_limit' was set to a value which 

682 # realistically reflects actual physical limitations of a given compute 

683 # resource. 

684 memory_max = memory_limit 

685 if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit: 

686 memory_max = gwjob.request_memory_max 

687 

688 # Make job ask for more memory each time it failed due to insufficient 

689 # memory requirements. 

690 jobcmds["request_memory"] = _create_request_memory_expr( 

691 gwjob.request_memory, gwjob.memory_multiplier, memory_max 

692 ) 

693 

694 # Periodically release jobs which are being held due to exceeding 

695 # memory. Stop doing that (by removing the job from the HTCondor queue) 

696 # after the maximal number of retries has been reached or the job was 

697 # already run at maximal allowed memory. 

698 jobcmds["periodic_release"] = _create_periodic_release_expr( 

699 gwjob.request_memory, gwjob.memory_multiplier, memory_max 

700 ) 

701 jobcmds["periodic_remove"] = _create_periodic_remove_expr( 

702 gwjob.request_memory, gwjob.memory_multiplier, memory_max 

703 ) 

704 

705 # Assume concurrency_limit implemented using HTCondor concurrency limits. 

706 # May need to move to special site-specific implementation if sites use 

707 # other mechanisms. 

708 if gwjob.concurrency_limit: 

709 jobcmds["concurrency_limit"] = gwjob.concurrency_limit 

710 

711 # Handle command line 

712 if gwjob.executable.transfer_executable: 

713 jobcmds["transfer_executable"] = "True" 

714 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri) 

715 else: 

716 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri) 

717 

718 if gwjob.arguments: 

719 arguments = gwjob.arguments 

720 arguments = _replace_cmd_vars(arguments, gwjob) 

721 arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob) 

722 arguments = _fix_env_var_syntax(arguments) 

723 jobcmds["arguments"] = arguments 

724 

725 # Add extra "pass-thru" job commands 

726 if gwjob.profile: 

727 for key, val in gwjob.profile.items(): 

728 jobcmds[key] = htc_escape(val) 

729 for key, val in cached_vals["profile"].items(): 

730 jobcmds[key] = htc_escape(val) 

731 

732 return jobcmds 
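
# Illustrative sketch (not part of the module): the memory-autoscaling
# arithmetic that the request_memory/periodic_* ClassAd expressions above are
# built around. All numbers are made up; the n-th attempt asks for
# min(request_memory * memory_multiplier**(n - 1), memory limit).
def _example_memory_scaling():  # pragma: no cover
    memory, multiplier, limit = 2048, 2.0, 16384
    requests = [min(int(memory * multiplier ** (attempt - 1)), limit) for attempt in range(1, 5)]
    assert requests == [2048, 4096, 8192, 16384]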

733 

734 

735def _translate_dag_cmds(gwjob): 

736 """Translate job values into DAGMan commands. 

737 

738 Parameters 

739 ---------- 

740 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

741 Job containing values to be translated. 

742 

743 Returns 

744 ------- 

745 dagcmds : `dict` [`str`, `Any`] 

746 DAGMan commands for the job. 

747 """ 

748 # Values in the DAG script that are just name mappings. 

749 dag_translation = {"abort_on_value": "abort_dag_on", "abort_return_value": "abort_exit"} 

750 

751 dagcmds = {} 

752 for gwkey, htckey in dag_translation.items(): 

753 dagcmds[htckey] = getattr(gwjob, gwkey, None) 

754 

755 # Still to be coded: vars "pre_cmdline", "post_cmdline" 

756 return dagcmds 

757 

758 

759def _fix_env_var_syntax(oldstr): 

760 """Change ENV place holders to HTCondor Env var syntax. 

761 

762 Parameters 

763 ---------- 

764 oldstr : `str` 

765 String in which environment variable syntax is to be fixed. 

766 

767 Returns 

768 ------- 

769 newstr : `str` 

770 Given string with environment variable syntax fixed. 

771 """ 

772 newstr = oldstr 

773 for key in re.findall(r"<ENV:([^>]+)>", oldstr): 

774 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})") 

775 return newstr 
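
# Illustrative sketch (not part of the module): how the <ENV:NAME> placeholders
# used by BPS are rewritten into HTCondor's $ENV(NAME) syntax (the example
# string is made up).
def _example_fix_env_var_syntax():  # pragma: no cover
    fixed = _fix_env_var_syntax("cd <ENV:HOME>/run && echo <ENV:USER>")
    assert fixed == "cd $ENV(HOME)/run && echo $ENV(USER)"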

776 

777 

778def _replace_file_vars(use_shared, arguments, workflow, gwjob): 

779 """Replace file placeholders in command line arguments with correct 

780 physical file names. 

781 

782 Parameters 

783 ---------- 

784 use_shared : `bool` 

785 Whether HTCondor can assume shared filesystem. 

786 arguments : `str` 

787 Arguments string in which to replace file placeholders. 

788 workflow : `lsst.ctrl.bps.GenericWorkflow` 

789 Generic workflow that contains file information. 

790 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

791 The job corresponding to the arguments. 

792 

793 Returns 

794 ------- 

795 arguments : `str` 

796 Given arguments string with file placeholders replaced. 

797 """ 

798 # Replace input file placeholders with paths. 

799 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False): 

800 if not gwfile.wms_transfer: 

801 # Must assume full URI if in command line and told WMS is not 

802 # responsible for transferring file. 

803 uri = gwfile.src_uri 

804 elif use_shared: 

805 if gwfile.job_shared: 

806 # Have shared filesystems and jobs can share file. 

807 uri = gwfile.src_uri 

808 else: 

809 # Taking advantage of inside knowledge. Not future-proof. 

810 # Temporary fix until there is a job wrapper that pulls files 

811 # within the job. 

812 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml": 

813 uri = "butler.yaml" 

814 else: 

815 uri = os.path.basename(gwfile.src_uri) 

816 else: # Using push transfer 

817 uri = os.path.basename(gwfile.src_uri) 

818 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

819 

820 # Replace output file placeholders with paths. 

821 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False): 

822 if not gwfile.wms_transfer: 

823 # Must assume full URI if in command line and told WMS is not 

824 # responsible for transferring file. 

825 uri = gwfile.src_uri 

826 elif use_shared: 

827 if gwfile.job_shared: 

828 # Have shared filesystems and jobs can share file. 

829 uri = gwfile.src_uri 

830 else: 

831 uri = os.path.basename(gwfile.src_uri) 

832 else: # Using push transfer 

833 uri = os.path.basename(gwfile.src_uri) 

834 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri) 

835 return arguments 

836 

837 

838def _replace_cmd_vars(arguments, gwjob): 

839 """Replace format-style placeholders in arguments. 

840 

841 Parameters 

842 ---------- 

843 arguments : `str` 

844 Arguments string in which to replace placeholders. 

845 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

846 Job containing values to be used to replace placeholders 

847 (in particular gwjob.cmdvals). 

848 

849 Returns 

850 ------- 

851 arguments : `str` 

852 Given arguments string with placeholders replaced. 

853 """ 

854 try: 

855 arguments = arguments.format(**gwjob.cmdvals) 

856 except (KeyError, TypeError): # TypeError in case None instead of {} 

857 _LOG.error( 

858 "Could not replace command variables:\n" "arguments: %s\n" "cmdvals: %s", arguments, gwjob.cmdvals 

859 ) 

860 raise 

861 return arguments 
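
# Illustrative sketch (not part of the module): a minimal stand-in object with
# only the attribute _replace_cmd_vars() uses (cmdvals); names and values are
# made up.
def _example_replace_cmd_vars():  # pragma: no cover
    from types import SimpleNamespace

    fake_job = SimpleNamespace(cmdvals={"butlerConfig": "butler.yaml", "qgraphFile": "run.qgraph"})
    args = _replace_cmd_vars("run {butlerConfig} {qgraphFile}", fake_job)
    assert args == "run butler.yaml run.qgraph"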

862 

863 

864def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str): 

865 """Add job input files from generic workflow to job. 

866 

867 Parameters 

868 ---------- 

869 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

870 The generic workflow (e.g., has executable name and arguments). 

871 job_name : `str` 

872 Unique name for the job. 

873 use_shared : `bool` 

874 Whether job has access to files via shared filesystem. 

875 out_prefix : `str` 

876 The root directory into which all WMS-specific files are written. 

877 

878 Returns 

879 ------- 

880 htc_commands : `dict` [`str`, `str`] 

881 HTCondor commands for the job submission script. 

882 """ 

883 htc_commands = {} 

884 inputs = [] 

885 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True): 

886 _LOG.debug("src_uri=%s", gwf_file.src_uri) 

887 

888 uri = Path(gwf_file.src_uri) 

889 

890 # Note if use_shared and job_shared, don't need to transfer file. 

891 

892 if not use_shared: # Copy file using push to job 

893 inputs.append(str(uri.relative_to(out_prefix))) 

894 elif not gwf_file.job_shared: # Jobs require own copy 

895 

896 # If using a shared filesystem but the job still needs its own 

897 # copy, use HTCondor's curl plugin for a local copy. 

898 

899 # Execution butler is represented as a directory which the 

900 # curl plugin does not handle. Taking advantage of inside 

901 # knowledge for a temporary fix until there is a job wrapper that 

902 # pulls files within the job. 

903 if gwf_file.name == "butlerConfig": 

904 # The execution butler directory doesn't normally exist until 

905 # the submit phase so checking for suffix instead of using 

906 # is_dir(). If another non-yaml file existed, it would have a 

907 # different gwf_file.name. 

908 if uri.suffix == ".yaml": # Single file, so just copy. 

909 inputs.append(f"file://{uri}") 

910 else: 

911 inputs.append(f"file://{uri / 'butler.yaml'}") 

912 inputs.append(f"file://{uri / 'gen3.sqlite3'}") 

913 elif uri.is_dir(): 

914 raise RuntimeError( 

915 "HTCondor plugin cannot transfer directories locally within job " f"{gwf_file.src_uri}" 

916 ) 

917 else: 

918 inputs.append(f"file://{uri}") 

919 

920 if inputs: 

921 htc_commands["transfer_input_files"] = ",".join(inputs) 

922 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"]) 

923 return htc_commands 
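
# Illustrative sketch (not part of the module): with no shared filesystem every
# transferable input ends up as a path relative to the submit directory in
# transfer_input_files. The stand-in workflow below mimics only the single
# method used here; all names and paths are made up.
def _example_handle_job_inputs():  # pragma: no cover
    from types import SimpleNamespace

    gwf_file = SimpleNamespace(name="qgraphFile", src_uri="/submit/run1/inputs/run.qgraph", job_shared=False)
    fake_workflow = SimpleNamespace(get_job_inputs=lambda name, data, transfer_only: [gwf_file])
    commands = _handle_job_inputs(fake_workflow, "job1", use_shared=False, out_prefix="/submit/run1")
    assert commands == {"transfer_input_files": "inputs/run.qgraph"}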

924 

925 

926def _report_from_path(wms_path): 

927 """Gather run information from a given run directory. 

928 

929 Parameters 

930 ---------- 

931 wms_path : `str` 

932 The directory containing the submit side files (e.g., HTCondor files). 

933 

934 Returns 

935 ------- 

936 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

937 Run information for the detailed report. The key is the HTCondor id 

938 and the value is a collection of report information for that run. 

939 message : `str` 

940 Message to be printed with the summary report. 

941 """ 

942 wms_workflow_id, jobs, message = _get_info_from_path(wms_path) 

943 if wms_workflow_id == MISSING_ID: 

944 run_reports = {} 

945 else: 

946 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs) 

947 return run_reports, message 

948 

949 

950def _report_from_id(wms_workflow_id, hist, schedds=None): 

951 """Gather run information using workflow id. 

952 

953 Parameters 

954 ---------- 

955 wms_workflow_id : `str` 

956 Limit to specific run based on id. 

957 hist : `float` 

958 Limit history search to this many days. 

959 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional 

960 HTCondor schedulers to query for job information. If None 

961 (default), all queries will be run against the local scheduler only. 

962 

963 Returns 

964 ------- 

965 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

966 Run information for the detailed report. The key is the HTCondor id 

967 and the value is a collection of report information for that run. 

968 message : `str` 

969 Message to be printed with the summary report. 

970 """ 

971 dag_constraint = 'regexp("dagman$", Cmd)' 

972 try: 

973 cluster_id = int(float(wms_workflow_id)) 

974 except ValueError: 

975 dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"' 

976 else: 

977 dag_constraint += f" && ClusterId == {cluster_id}" 

978 

979 # With the current implementation of the condor_* functions the query will 

980 # always return only one match per Scheduler. 

981 # 

982 # Even in the highly unlikely situation where HTCondor history (which 

983 # condor_search queries too) is long enough to have jobs from before the 

984 cluster ids were rolled over (and as a result there is more than one job 

985 # with the same cluster id) they will not show up in the results. 

986 schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds) 

987 if len(schedd_dag_info) == 0: 

988 run_reports = {} 

989 message = "" 

990 elif len(schedd_dag_info) == 1: 

991 _, dag_info = schedd_dag_info.popitem() 

992 dag_id, dag_ad = dag_info.popitem() 

993 

994 # Create a mapping between jobs and their classads. The keys will be 

995 # of format 'ClusterId.ProcId'. 

996 job_info = {dag_id: dag_ad} 

997 

998 # Find jobs (nodes) belonging to that DAGMan job. 

999 job_constraint = f"DAGManJobId == {int(float(dag_id))}" 

1000 schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds) 

1001 if schedd_job_info: 

1002 _, node_info = schedd_job_info.popitem() 

1003 job_info.update(node_info) 

1004 

1005 # Collect additional pieces of information about jobs using HTCondor 

1006 # files in the submission directory. 

1007 _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"]) 

1008 _update_jobs(job_info, path_jobs) 

1009 

1010 run_reports = _create_detailed_report_from_jobs(dag_id, job_info) 

1011 else: 

1012 ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()] 

1013 run_reports = {} 

1014 message = ( 

1015 f"More than one job matches id '{wms_workflow_id}', " 

1016 f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids" 

1017 ) 

1018 return run_reports, message 

1019 

1020 

1021def _get_info_from_path(wms_path): 

1022 """Gather run information from a given run directory. 

1023 

1024 Parameters 

1025 ---------- 

1026 wms_path : `str` 

1027 Directory containing HTCondor files. 

1028 

1029 Returns 

1030 ------- 

1031 wms_workflow_id : `str` 

1032 The run id which is a DAGman job id. 

1033 jobs : `dict` [`str`, `dict` [`str`, `Any`]] 

1034 Information about jobs read from files in the given directory. 

1035 The key is the HTCondor id and the value is a dictionary of HTCondor 

1036 keys and values. 

1037 message : `str` 

1038 Message to be printed with the summary report. 

1039 """ 

1040 messages = [] 

1041 try: 

1042 wms_workflow_id, jobs = read_dag_log(wms_path) 

1043 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs) 

1044 _update_jobs(jobs, read_node_status(wms_path)) 

1045 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs) 

1046 

1047 # Add more info for DAGman job 

1048 job = jobs[wms_workflow_id] 

1049 job.update(read_dag_status(wms_path)) 

1050 

1051 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs) 

1052 if "bps_run" not in job: 

1053 _add_run_info(wms_path, job) 

1054 

1055 message = htc_check_dagman_output(wms_path) 

1056 if message: 

1057 messages.append(message) 

1058 _LOG.debug( 

1059 "_get_info: id = %s, total_jobs = %s", wms_workflow_id, jobs[wms_workflow_id]["total_jobs"] 

1060 ) 

1061 

1062 # Add extra pieces of information which cannot be found in HTCondor 

1063 # generated files like 'GlobalJobId'. 

1064 # 

1065 # Do not treat absence of this file as a serious error. Neither runs 

1066 # submitted with earlier versions of the plugin nor the runs submitted 

1067 # with the Pegasus plugin will have it at the moment. However, once the 

1068 # Pegasus plugin has its own report() method 

1069 # (instead of sneakily using HTCondor's one), the lack of that file 

1070 # should be treated as seriously as lack of any other file. 

1071 try: 

1072 job_info = read_dag_info(wms_path) 

1073 except FileNotFoundError as exc: 

1074 message = f"Warn: Some information may not be available: {exc}" 

1075 messages.append(message) 

1076 else: 

1077 schedd_name = next(iter(job_info)) 

1078 job_ad = next(iter(job_info[schedd_name].values())) 

1079 job.update(job_ad) 

1080 except FileNotFoundError: 

1081 message = f"Could not find HTCondor files in '{wms_path}'" 

1082 _LOG.warning(message) 

1083 messages.append(message) 

1084 wms_workflow_id = MISSING_ID 

1085 jobs = {} 

1086 

1087 message = "\n".join([msg for msg in messages if msg]) 

1088 return wms_workflow_id, jobs, message 

1089 

1090 

1091def _create_detailed_report_from_jobs(wms_workflow_id, jobs): 

1092 """Gather run information to be used in generating summary reports. 

1093 

1094 Parameters 

1095 ---------- 

1096 wms_workflow_id : `str` 

1097 The run id to create the report for. 

1098 jobs : `dict` [`str`, `dict` [`str`, Any]] 

1099 Mapping HTCondor job id to job information. 

1100 

1101 Returns 

1102 ------- 

1103 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1104 Run information for the detailed report. The key is the given HTCondor 

1105 id and the value is a collection of report information for that run. 

1106 """ 

1107 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id]) 

1108 dag_job = jobs[wms_workflow_id] 

1109 report = WmsRunReport( 

1110 wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}", 

1111 global_wms_id=dag_job.get("GlobalJobId", "MISS"), 

1112 path=dag_job["Iwd"], 

1113 label=dag_job.get("bps_job_label", "MISS"), 

1114 run=dag_job.get("bps_run", "MISS"), 

1115 project=dag_job.get("bps_project", "MISS"), 

1116 campaign=dag_job.get("bps_campaign", "MISS"), 

1117 payload=dag_job.get("bps_payload", "MISS"), 

1118 operator=_get_owner(dag_job), 

1119 run_summary=_get_run_summary(dag_job), 

1120 state=_htc_status_to_wms_state(dag_job), 

1121 jobs=[], 

1122 total_number_jobs=dag_job["total_jobs"], 

1123 job_state_counts=dag_job["state_counts"], 

1124 ) 

1125 

1126 for job_id, job_info in jobs.items(): 

1127 try: 

1128 if job_info["ClusterId"] != int(float(wms_workflow_id)): 

1129 job_report = WmsJobReport( 

1130 wms_id=job_id, 

1131 name=job_info.get("DAGNodeName", job_id), 

1132 label=job_info.get("bps_job_label", pegasus_name_to_label(job_info["DAGNodeName"])), 

1133 state=_htc_status_to_wms_state(job_info), 

1134 ) 

1135 if job_report.label == "init": 

1136 job_report.label = "pipetaskInit" 

1137 report.jobs.append(job_report) 

1138 except KeyError as ex: 

1139 _LOG.error("Job missing key '%s': %s", str(ex), job_info) 

1140 raise 

1141 

1142 run_reports = {report.wms_id: report} 

1143 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports) 

1144 return run_reports 

1145 

1146 

1147def _summary_report(user, hist, pass_thru, schedds=None): 

1148 """Gather run information to be used in generating summary reports. 

1149 

1150 Parameters 

1151 ---------- 

1152 user : `str` 

1153 Run lookup restricted to given user. 

1154 hist : `float` 

1155 How many previous days to search for run information. 

1156 pass_thru : `str` 

1157 Advanced users can define the HTCondor constraint to be used 

1158 when searching queue and history. 

1159 

1160 Returns 

1161 ------- 

1162 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`] 

1163 Run information for the summary report. The keys are HTCondor ids and 

1164 the values are collections of report information for each run. 

1165 message : `str` 

1166 Message to be printed with the summary report. 

1167 """ 

1168 # only doing summary report so only look for dagman jobs 

1169 if pass_thru: 

1170 constraint = pass_thru 

1171 else: 

1172 # Notes: 

1173 # * bps_isjob == 'True' isn't getting set for DAG jobs that are 

1174 # manually restarted. 

1175 # * Any job with DAGManJobId isn't a DAG job 

1176 constraint = 'bps_isjob == "True" && JobUniverse == 7' 

1177 if user: 

1178 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")' 

1179 

1180 job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds) 

1181 

1182 # Have list of DAGMan jobs, need to get run_report info. 

1183 run_reports = {} 

1184 for jobs in job_info.values(): 

1185 for job_id, job in jobs.items(): 

1186 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1187 # If the information was not available from the queue (e.g., Kerberos 

1188 # bug), try reading it from a file. 

1189 if total_jobs == 0: 

1190 try: 

1191 job.update(read_dag_status(job["Iwd"])) 

1192 total_jobs, state_counts = _get_state_counts_from_dag_job(job) 

1193 except StopIteration: 

1194 pass # Don't kill the report if the HTCondor files cannot be found. 

1195 

1196 if "bps_run" not in job: 

1197 _add_run_info(job["Iwd"], job) 

1198 report = WmsRunReport( 

1199 wms_id=job_id, 

1200 global_wms_id=job["GlobalJobId"], 

1201 path=job["Iwd"], 

1202 label=job.get("bps_job_label", "MISS"), 

1203 run=job.get("bps_run", "MISS"), 

1204 project=job.get("bps_project", "MISS"), 

1205 campaign=job.get("bps_campaign", "MISS"), 

1206 payload=job.get("bps_payload", "MISS"), 

1207 operator=_get_owner(job), 

1208 run_summary=_get_run_summary(job), 

1209 state=_htc_status_to_wms_state(job), 

1210 jobs=[], 

1211 total_number_jobs=total_jobs, 

1212 job_state_counts=state_counts, 

1213 ) 

1214 run_reports[report.global_wms_id] = report 

1215 

1216 return run_reports, "" 

1217 

1218 

1219def _add_run_info(wms_path, job): 

1220 """Find BPS run information elsewhere for runs without bps attributes. 

1221 

1222 Parameters 

1223 ---------- 

1224 wms_path : `str` 

1225 Path to submit files for the run. 

1226 job : `dict` [`str`, `Any`] 

1227 HTCondor dag job information. 

1228 

1229 Raises 

1230 ------ 

1231 StopIteration 

1232 If the file it is looking for cannot be found. Permission errors are 

1233 caught and the job's run is marked with an error. 

1234 """ 

1235 path = Path(wms_path) / "jobs" 

1236 try: 

1237 subfile = next(path.glob("**/*.sub")) 

1238 except (StopIteration, PermissionError): 

1239 job["bps_run"] = "Unavailable" 

1240 else: 

1241 _LOG.debug("_add_run_info: subfile = %s", subfile) 

1242 try: 

1243 with open(subfile, "r", encoding="utf-8") as fh: 

1244 for line in fh: 

1245 if line.startswith("+bps_"): 

1246 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line) 

1247 if m: 

1248 _LOG.debug("Matching line: %s", line) 

1249 job[m.group(1)] = m.group(2).replace('"', "") 

1250 else: 

1251 _LOG.debug("Could not parse attribute: %s", line) 

1252 except PermissionError: 

1253 job["bps_run"] = "PermissionError" 

1254 _LOG.debug("After adding job = %s", job) 

1255 

1256 

1257def _get_owner(job): 

1258 """Get the owner of a dag job. 

1259 

1260 Parameters 

1261 ---------- 

1262 job : `dict` [`str`, `Any`] 

1263 HTCondor dag job information. 

1264 

1265 Returns 

1266 ------- 

1267 owner : `str` 

1268 Owner of the dag job. 

1269 """ 

1270 owner = job.get("bps_operator", None) 

1271 if not owner: 

1272 owner = job.get("Owner", None) 

1273 if not owner: 

1274 _LOG.warning("Could not get Owner from htcondor job: %s", job) 

1275 owner = "MISS" 

1276 return owner 

1277 

1278 

1279def _get_run_summary(job): 

1280 """Get the run summary for a job. 

1281 

1282 Parameters 

1283 ---------- 

1284 job : `dict` [`str`, `Any`] 

1285 HTCondor dag job information. 

1286 

1287 Returns 

1288 ------- 

1289 summary : `str` 

1290 Number of jobs per PipelineTask label in approximate pipeline order. 

1291 Format: <label>:<count>[;<label>:<count>]+ 

1292 """ 

1293 summary = job.get("bps_job_summary", job.get("bps_run_summary", None)) 

1294 if not summary: 

1295 summary, _ = summary_from_dag(job["Iwd"]) 

1296 if not summary: 

1297 _LOG.warning("Could not get run summary for htcondor job: %s", job) 

1298 _LOG.debug("_get_run_summary: summary=%s", summary) 

1299 

1300 # Work around labels sometimes using init vs pipetaskInit 

1301 summary = summary.replace("init:", "pipetaskInit:") 

1302 

1303 if "pegasus_version" in job and "pegasus" not in summary: 

1304 summary += ";pegasus:0" 

1305 

1306 return summary 
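
# Illustrative sketch (not part of the module): the summary produced above is a
# semicolon-separated list of <label>:<count> pairs in approximate pipeline
# order (labels and counts below are made up).
def _example_run_summary_format():  # pragma: no cover
    summary = "pipetaskInit:1;isr:60;characterizeImage:60;calibrate:60;finalJob:1"
    counts = dict(item.split(":") for item in summary.split(";"))
    assert counts["isr"] == "60"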

1307 

1308 

1309def _get_state_counts_from_jobs(wms_workflow_id, jobs): 

1310 """Count number of jobs per WMS state. 

1311 

1312 Parameters 

1313 ---------- 

1314 wms_workflow_id : `str` 

1315 HTCondor job id. 

1316 jobs : `dict` [`str`, `Any`] 

1317 HTCondor dag job information. 

1318 

1319 Returns 

1320 ------- 

1321 total_count : `int` 

1322 Total number of dag nodes. 

1323 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1324 Keys are the different WMS states and values are counts of jobs 

1325 that are in that WMS state. 

1326 """ 

1327 state_counts = dict.fromkeys(WmsStates, 0) 

1328 

1329 for jid, jinfo in jobs.items(): 

1330 if jid != wms_workflow_id: 

1331 state_counts[_htc_status_to_wms_state(jinfo)] += 1 

1332 

1333 total_counted = sum(state_counts.values()) 

1334 if "NodesTotal" in jobs[wms_workflow_id]: 

1335 total_count = jobs[wms_workflow_id]["NodesTotal"] 

1336 else: 

1337 total_count = total_counted 

1338 

1339 state_counts[WmsStates.UNREADY] += total_count - total_counted 

1340 

1341 return total_count, state_counts 

1342 

1343 

1344def _get_state_counts_from_dag_job(job): 

1345 """Count number of jobs per WMS state. 

1346 

1347 Parameters 

1348 ---------- 

1349 job : `dict` [`str`, `Any`] 

1350 HTCondor dag job information. 

1351 

1352 Returns 

1353 ------- 

1354 total_count : `int` 

1355 Total number of dag nodes. 

1356 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`] 

1357 Keys are the different WMS states and values are counts of jobs 

1358 that are in that WMS state. 

1359 """ 

1360 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job)) 

1361 state_counts = dict.fromkeys(WmsStates, 0) 

1362 if "DAG_NodesReady" in job: 

1363 state_counts = { 

1364 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0), 

1365 WmsStates.READY: job.get("DAG_NodesReady", 0), 

1366 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1367 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0), 

1368 WmsStates.FAILED: job.get("DAG_NodesFailed", 0), 

1369 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0), 

1370 } 

1371 total_jobs = job.get("DAG_NodesTotal") 

1372 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs) 

1373 elif "NodesFailed" in job: 

1374 state_counts = { 

1375 WmsStates.UNREADY: job.get("NodesUnready", 0), 

1376 WmsStates.READY: job.get("NodesReady", 0), 

1377 WmsStates.HELD: job.get("JobProcsHeld", 0), 

1378 WmsStates.SUCCEEDED: job.get("NodesDone", 0), 

1379 WmsStates.FAILED: job.get("NodesFailed", 0), 

1380 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0), 

1381 } 

1382 try: 

1383 total_jobs = job["NodesTotal"] 

1384 except KeyError as ex: 

1385 _LOG.error("Job missing %s. job = %s", str(ex), job) 

1386 raise 

1387 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs) 

1388 else: 

1389 # With Kerberos job auth and the Kerberos bug, a warning would be printed 

1390 # for every DAG. 

1391 _LOG.debug("Can't get job state counts %s", job["Iwd"]) 

1392 total_jobs = 0 

1393 

1394 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts) 

1395 return total_jobs, state_counts 

1396 

1397 

1398def _htc_status_to_wms_state(job): 

1399 """Convert HTCondor job status to generic wms state. 

1400 

1401 Parameters 

1402 ---------- 

1403 job : `dict` [`str`, `Any`] 

1404 HTCondor job information. 

1405 

1406 Returns 

1407 ------- 

1408 wms_state : `WmsStates` 

1409 The equivalent WmsState to given job's status. 

1410 """ 

1411 wms_state = WmsStates.MISFIT 

1412 if "JobStatus" in job: 

1413 wms_state = _htc_job_status_to_wms_state(job) 

1414 elif "NodeStatus" in job: 

1415 wms_state = _htc_node_status_to_wms_state(job) 

1416 return wms_state 

1417 

1418 

1419def _htc_job_status_to_wms_state(job): 

1420 """Convert HTCondor job status to generic wms state. 

1421 

1422 Parameters 

1423 ---------- 

1424 job : `dict` [`str`, `Any`] 

1425 HTCondor job information. 

1426 

1427 Returns 

1428 ------- 

1429 wms_state : `lsst.ctrl.bps.WmsStates` 

1430 The equivalent WmsState to given job's status. 

1431 """ 

1432 _LOG.debug( 

1433 "htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], type(job["JobStatus"]) 

1434 ) 

1435 job_status = int(job["JobStatus"]) 

1436 wms_state = WmsStates.MISFIT 

1437 

1438 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status) 

1439 if job_status == JobStatus.IDLE: 

1440 wms_state = WmsStates.PENDING 

1441 elif job_status == JobStatus.RUNNING: 

1442 wms_state = WmsStates.RUNNING 

1443 elif job_status == JobStatus.REMOVED: 

1444 wms_state = WmsStates.DELETED 

1445 elif job_status == JobStatus.COMPLETED: 

1446 if ( 

1447 job.get("ExitBySignal", False) 

1448 or job.get("ExitCode", 0) 

1449 or job.get("ExitSignal", 0) 

1450 or job.get("DAG_Status", 0) 

1451 or job.get("ReturnValue", 0) 

1452 ): 

1453 wms_state = WmsStates.FAILED 

1454 else: 

1455 wms_state = WmsStates.SUCCEEDED 

1456 elif job_status == JobStatus.HELD: 

1457 wms_state = WmsStates.HELD 

1458 

1459 return wms_state 

1460 

1461 

1462def _htc_node_status_to_wms_state(job): 

1463 """Convert HTCondor status to generic wms state. 

1464 

1465 Parameters 

1466 ---------- 

1467 job : `dict` [`str`, `Any`] 

1468 HTCondor job information. 

1469 

1470 Returns 

1471 ------- 

1472 wms_state : `lsst.ctrl.bps.WmsStates` 

1473 The equivalent WmsState to given node's status. 

1474 """ 

1475 wms_state = WmsStates.MISFIT 

1476 

1477 status = job["NodeStatus"] 

1478 if status == NodeStatus.NOT_READY: 

1479 wms_state = WmsStates.UNREADY 

1480 elif status == NodeStatus.READY: 

1481 wms_state = WmsStates.READY 

1482 elif status == NodeStatus.PRERUN: 

1483 wms_state = WmsStates.MISFIT 

1484 elif status == NodeStatus.SUBMITTED: 

1485 if job["JobProcsHeld"]: 

1486 wms_state = WmsStates.HELD 

1487 elif job["StatusDetails"] == "not_idle": 

1488 wms_state = WmsStates.RUNNING 

1489 elif job["JobProcsQueued"]: 

1490 wms_state = WmsStates.PENDING 

1491 elif status == NodeStatus.POSTRUN: 

1492 wms_state = WmsStates.MISFIT 

1493 elif status == NodeStatus.DONE: 

1494 wms_state = WmsStates.SUCCEEDED 

1495 elif status == NodeStatus.ERROR: 

1496 # Use the job exit status instead of the post script exit status 

1497 if "DAGMAN error 0" in job["StatusDetails"]: 

1498 wms_state = WmsStates.SUCCEEDED 

1499 else: 

1500 wms_state = WmsStates.FAILED 

1501 

1502 return wms_state 

1503 

1504 

1505def _update_jobs(jobs1, jobs2): 

1506 """Update jobs1 with info in jobs2. 

1507 

1508 (Basically an update for nested dictionaries.) 

1509 

1510 Parameters 

1511 ---------- 

1512 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]] 

1513 HTCondor job information to be updated. 

1514 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]] 

1515 Additional HTCondor job information. 

1516 """ 

1517 for jid, jinfo in jobs2.items(): 

1518 if jid in jobs1: 

1519 jobs1[jid].update(jinfo) 

1520 else: 

1521 jobs1[jid] = jinfo 

1522 

1523 

1524def _wms_id_type(wms_id): 

1525 """Determine the type of the WMS id. 

1526 

1527 Parameters 

1528 ---------- 

1529 wms_id : `str` 

1530 WMS id identifying a job. 

1531 

1532 Returns 

1533 ------- 

1534 id_type : `lsst.ctrl.bps.wms.htcondor.WmsIdType` 

1535 Type of WMS id. 

1536 """ 

1537 try: 

1538 int(float(wms_id)) 

1539 except ValueError: 

1540 wms_path = Path(wms_id) 

1541 if wms_path.exists(): 

1542 id_type = WmsIdType.PATH 

1543 else: 

1544 id_type = WmsIdType.GLOBAL 

1545 except TypeError: 

1546 id_type = WmsIdType.UNKNOWN 

1547 else: 

1548 id_type = WmsIdType.LOCAL 

1549 return id_type 
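
# Illustrative sketch (not part of the module): how different ids are
# classified. A numeric id is LOCAL, an existing filesystem path is PATH, and
# anything else is assumed to be an HTCondor global job id (example values are
# made up and assume no file named like the global id exists).
def _example_wms_id_type():  # pragma: no cover
    assert _wms_id_type("1234.0") == WmsIdType.LOCAL
    assert _wms_id_type("/") == WmsIdType.PATH
    assert _wms_id_type("sched1#1234.0#1658000000") == WmsIdType.GLOBAL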

1550 

1551 

1552def _wms_id_to_cluster(wms_id): 

1553 """Convert WMS id to cluster id. 

1554 

1555 Parameters 

1556 ---------- 

1557 wms_id : `int` or `float` or `str` 

1558 HTCondor job id or path. 

1559 

1560 Returns 

1561 ------- 

1562 schedd_ad : `classad.ClassAd` 

1563 ClassAd describing the scheduler managing the job with the given id. 

1564 cluster_id : `int` 

1565 HTCondor cluster id. 

1566 id_type : `lsst.ctrl.bps.wms.htcondor.WmsIdType` 

1567 The type of the provided id. 

1568 """ 

1569 coll = htcondor.Collector() 

1570 

1571 schedd_ad = None 

1572 cluster_id = None 

1573 id_type = _wms_id_type(wms_id) 

1574 if id_type == WmsIdType.LOCAL: 

1575 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1576 cluster_id = int(float(wms_id)) 

1577 elif id_type == WmsIdType.GLOBAL: 

1578 constraint = f'GlobalJobId == "{wms_id}"' 

1579 schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)} 

1580 schedds = [htcondor.Schedd(ad) for ad in schedd_ads.values()] 

1581 queries = [schedd.xquery(requirements=constraint, projection=["ClusterId"]) for schedd in schedds] 

1582 results = { 

1583 query.tag(): dict(ads[0]) 

1584 for query in htcondor.poll(queries) 

1585 if (ads := query.nextAdsNonBlocking()) 

1586 } 

1587 if results: 

1588 schedd_name = next(iter(results)) 

1589 schedd_ad = schedd_ads[schedd_name] 

1590 cluster_id = results[schedd_name]["ClusterId"] 

1591 elif id_type == WmsIdType.PATH: 

1592 try: 

1593 job_info = read_dag_info(wms_id) 

1594 except (FileNotFoundError, PermissionError, IOError): 

1595 pass 

1596 else: 

1597 schedd_name = next(iter(job_info)) 

1598 job_id = next(iter(job_info[schedd_name])) 

1599 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name) 

1600 cluster_id = int(float(job_id)) 

1601 else: 

1602 pass 

1603 return schedd_ad, cluster_id, id_type 

1604 

1605 
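# A minimal illustrative usage sketch (the submit path is a placeholder);
# kept as a comment because it queries a live HTCondor pool.
#
# schedd_ad, cluster_id, id_type = _wms_id_to_cluster("/path/to/submit/run")
# if cluster_id is None:
#     print("WMS id could not be resolved to a cluster")
# else:
#     print(f"cluster {cluster_id} on {schedd_ad['Name']} ({id_type.name})")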

1606def _create_periodic_release_expr(memory, multiplier, limit): 

1607 """Construct an HTCondorAd expression for releasing held jobs. 

1608 

1609 The expression instructs HTCondor to release any job that was put on hold 

1610 for exceeding its memory requirements back to the job queue, provided it 

1611 satisfies all of the conditions below: 

1612 

1613 * the number of run attempts did not reach the allowable number of retries, 

1614 * the memory requirement in the last failed run attempt did not reach 

1615 the specified memory limit. 

1616 

1617 Parameters 

1618 ---------- 

1619 memory : `int` 

1620 Requested memory in MB. 

1621 multiplier : `float` 

1622 Memory growth rate between retries. 

1623 limit : `int` 

1624 Memory limit in MB. 

1625 

1626 Returns 

1627 ------- 

1628 expr : `str` 

1629 A string representing an HTCondor ClassAd expression for releasing jobs 

1630 which have been held due to exceeding the memory requirements. 

1631 """ 

1632 is_retry_allowed = "NumJobStarts <= JobMaxRetries" 

1633 was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}" 

1634 

1635 # Job ClassAd attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1636 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1637 # The special comparison operators ensure that all comparisons below will 

1638 # evaluate to FALSE in this case. 

1639 # 

1640 # Note: 

1641 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1642 # the entire expression should evaluate to FALSE when the job is not HELD. 

1643 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1644 # but better safe than sorry. 

1645 was_mem_exceeded = ( 

1646 "JobStatus == 5 " 

1647 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " 

1648 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1649 ) 

1650 

1651 expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}" 

1652 return expr 

1653 

1654 
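# A minimal illustrative sketch: the release expression generated for a job
# requesting 2048 MB with multiplier 2.0 and an 8192 MB memory limit.
_example_release_expr = _create_periodic_release_expr(2048, 2.0, 8192)
# _example_release_expr, wrapped here for readability, reads:
#   JobStatus == 5
#   && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#       || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#   && NumJobStarts <= JobMaxRetries
#   && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) < 8192
# i.e. a memory-held job is released as long as retries remain and its last
# request had not already hit the 8192 MB cap.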

1655def _create_periodic_remove_expr(memory, multiplier, limit): 

1656 """Construct an HTCondorAd expression for removing jobs from the queue. 

1657 

1658 The expression instructs HTCondor to remove from the job queue any job 

1659 that was put on hold for exceeding its memory requirements, provided it 

1660 satisfies any of the conditions below: 

1661 

1662 * the allowable number of retries was reached, 

1663 * the memory requirement during the last failed run attempt reached 

1664 the specified memory limit. 

1665 

1666 Parameters 

1667 ---------- 

1668 memory : `int` 

1669 Requested memory in MB. 

1670 multiplier : `float` 

1671 Memory growth rate between retries. 

1672 limit : `int` 

1673 Memory limit. 

1674 

1675 Returns 

1676 ------- 

1677 expr : `str` 

1678 A string representing an HTCondor ClassAd expression for removing jobs 

1679 which were run at the maximal allowable memory and still exceeded 

1680 the memory requirements. 

1681 """ 

1682 is_retry_disallowed = "NumJobStarts > JobMaxRetries" 

1683 was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}" 

1684 

1685 # Job ClassAd attributes 'HoldReasonCode' and 'HoldReasonSubCode' are 

1686 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5). 

1687 # The special comparison operators ensure that all comparisons below will 

1688 # evaluate to FALSE in this case. 

1689 # 

1690 # Note: 

1691 # May not be strictly necessary. Operators '&&' and '||' are not strict so 

1692 # the entire expression should evaluate to FALSE when the job is not HELD. 

1693 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE, 

1694 # but better safe than sorry. 

1695 was_mem_exceeded = ( 

1696 "JobStatus == 5 " 

1697 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " 

1698 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)" 

1699 ) 

1700 

1701 expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})" 

1702 return expr 

1703 

1704 
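# A minimal illustrative sketch with the same example values (2048 MB
# requested, multiplier 2.0, 8192 MB limit); the removal expression differs
# from the release expression above only in its tail.
_example_remove_expr = _create_periodic_remove_expr(2048, 2.0, 8192)
# _example_remove_expr ends with:
#   ... && (NumJobStarts > JobMaxRetries
#           || min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) == 8192)
# i.e. a memory-held job is removed once retries are exhausted or its last
# attempt already ran at the 8192 MB cap.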

1705def _create_request_memory_expr(memory, multiplier, limit): 

1706 """Construct an HTCondor ClassAd expression for safe memory scaling. 

1707 

1708 Parameters 

1709 ---------- 

1710 memory : `int` 

1711 Requested memory in MB. 

1712 multiplier : `float` 

1713 Memory growth rate between retries. 

1714 limit : `int` 

1715 Memory limit in MB. 

1716 

1717 Returns 

1718 ------- 

1719 expr : `str` 

1720 A string representing an HTCondor ClassAd expression enabling safe 

1721 memory scaling between job retries. 

1722 """ 

1723 # The check whether the job was held due to exceeding memory requirements 

1724 # is made *after* the job has been released back to the job queue (i.e. it 

1725 # is in the IDLE state), hence the need to use `Last*` job ClassAd 

1726 # attributes instead of the ones describing the job's current state. 

1727 # 

1728 # Also, 'Last*' job ClassAd attributes are UNDEFINED when a job is 

1729 # initially put in the job queue. The special comparison operators ensure 

1730 # that all comparisons below will evaluate to FALSE in this case. 

1731 was_mem_exceeded = ( 

1732 "LastJobStatus =?= 5 " 

1733 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " 

1734 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)" 

1735 ) 

1736 

1737 # If the job runs for the first time or was held for reasons other than 

1738 # exceeding the memory, set the required memory to the greater of 

1739 # the requested value and the memory usage measured by HTCondor 

1740 # (MemoryUsage). 

1741 expr = ( 

1742 f"({was_mem_exceeded}) " 

1743 f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) " 

1744 f": max({{{memory}, MemoryUsage ?: 0}})" 

1745 ) 

1746 return expr 

1747 

1748 
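# A minimal illustrative sketch: the memory-request expression generated for
# a job requesting 2048 MB with multiplier 2.0 and an 8192 MB limit.
_example_memory_expr = _create_request_memory_expr(2048, 2.0, 8192)
# _example_memory_expr, wrapped here for readability, reads:
#   (LastJobStatus =?= 5
#    && (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0
#        || LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34))
#   ? min({int(2048 * pow(2.0, NumJobStarts)), 8192})
#   : max({2048, MemoryUsage ?: 0})
# so after memory-related holds the request grows 2048 -> 4096 -> 8192 MB
# (NumJobStarts = 1, 2, ...) and never exceeds the 8192 MB limit.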

1749def _locate_schedds(locate_all=False): 

1750 """Find out Scheduler daemons in an HTCondor pool. 

1751 

1752 Parameters 

1753 ---------- 

1754 locate_all : `bool`, optional 

1755 If True, all available Schedulers in the HTCondor pool will be located. 

1756 By default False, which means that the search is limited to the Scheduler 

1757 running on the local host. 

1758 

1759 Returns 

1760 ------- 

1761 schedds : `dict` [`str`, `htcondor.Schedd`] 

1762 A mapping between Scheduler names and Python objects allowing for 

1763 interacting with them. 

1764 """ 

1765 coll = htcondor.Collector() 

1766 

1767 schedd_ads = [] 

1768 if locate_all: 

1769 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd)) 

1770 else: 

1771 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd)) 

1772 return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads} 

1773 

1774 
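# A minimal illustrative usage sketch; kept as a comment because it contacts
# the pool's collector.
#
# schedds = _locate_schedds(locate_all=True)
# for name, schedd in schedds.items():
#     print(f"found schedd {name}: {schedd}")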

1775def _gather_site_values(config, compute_site): 

1776 """Gather values specific to given site. 

1777 

1778 Parameters 

1779 ---------- 

1780 config : `lsst.ctrl.bps.BpsConfig` 

1781 BPS configuration that includes necessary submit/runtime 

1782 information. 

1783 compute_site : `str` 

1784 Compute site name. 

1785 

1786 Returns 

1787 ------- 

1788 site_values : `dict` [`str`, `Any`] 

1789 Values specific to the given site. 

1790 """ 

1791 site_values = {"attrs": {}, "profile": {}} 

1792 search_opts = {} 

1793 if compute_site: 

1794 search_opts["curvals"] = {"curr_site": compute_site} 

1795 

1796 # Determine the hard limit for the memory requirement. 

1797 found, limit = config.search("memoryLimit", opt=search_opts) 

1798 if not found: 

1799 search_opts["default"] = DEFAULT_HTC_EXEC_PATT 

1800 _, patt = config.search("executeMachinesPattern", opt=search_opts) 

1801 del search_opts["default"] 

1802 

1803 # To reduce the amount of data, ignore dynamic slots (if any) as, 

1804 # by definition, they cannot have more memory than 

1805 # the partitionable slot they are part of. 

1806 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)' 

1807 pool_info = condor_status(constraint=constraint) 

1808 try: 

1809 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values()) 

1810 except ValueError: 

1811 _LOG.debug("No execute machine in the pool matches %s", patt) 

1812 if limit: 

1813 config[".bps_defined.memory_limit"] = limit 

1814 

1815 _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False}) 

1816 site_values["memoryLimit"] = limit 

1817 

1818 key = f".site.{compute_site}.profile.condor" 

1819 if key in config: 

1820 for key, val in config[key].items(): 

1821 if key.startswith("+"): 

1822 site_values["attrs"][key[1:]] = val 

1823 else: 

1824 site_values["profile"][key] = val 

1825 

1826 return site_values
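
# A minimal illustrative sketch of the site profile handling above: a made-up
# BPS config fragment with hypothetical keys.
#
#   site:
#     examplesite:
#       profile:
#         condor:
#           +JobNote: "bps-run"    # "+" prefix -> site_values["attrs"]["JobNote"]
#           request_cpus: 4        # otherwise  -> site_values["profile"]["request_cpus"]
#
# _gather_site_values(config, "examplesite") would then return those entries
# along with "bpsUseShared" and "memoryLimit".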