Coverage for python/lsst/ctrl/bps/htcondor/htcondor_service.py: 7%
743 statements
coverage.py v7.4.4, created at 2024-03-27 02:50 -0700
1# This file is part of ctrl_bps_htcondor.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <https://www.gnu.org/licenses/>.
28"""Interface between generic workflow to HTCondor workflow system.
29"""
31__all__ = ["HTCondorService", "HTCondorWorkflow"]
34import logging
35import os
36import re
37from collections import defaultdict
38from enum import IntEnum, auto
39from pathlib import Path
41import htcondor
42from lsst.ctrl.bps import (
43 BaseWmsService,
44 BaseWmsWorkflow,
45 GenericWorkflow,
46 GenericWorkflowJob,
47 WmsJobReport,
48 WmsRunReport,
49 WmsStates,
50)
51from lsst.ctrl.bps.bps_utils import chdir, create_count_summary
52from lsst.utils.timer import time_this
53from packaging import version
55from .lssthtc import (
56 MISSING_ID,
57 HTCDag,
58 HTCJob,
59 JobStatus,
60 NodeStatus,
61 condor_history,
62 condor_q,
63 condor_search,
64 condor_status,
65 htc_backup_files,
66 htc_check_dagman_output,
67 htc_create_submit_from_cmd,
68 htc_create_submit_from_dag,
69 htc_create_submit_from_file,
70 htc_escape,
71 htc_submit_dag,
72 htc_version,
73 pegasus_name_to_label,
74 read_dag_info,
75 read_dag_log,
76 read_dag_status,
77 read_node_status,
78 summary_from_dag,
79 write_dag_info,
80)
83class WmsIdType(IntEnum):
84 """Type of valid WMS ids."""
86 UNKNOWN = auto()
87 """The type of id cannot be determined.
88 """
90 LOCAL = auto()
91 """The id is HTCondor job's ClusterId (with optional '.ProcId').
92 """
94 GLOBAL = auto()
95 """Id is a HTCondor's global job id.
96 """
98 PATH = auto()
99 """Id is a submission path.
100 """
103DEFAULT_HTC_EXEC_PATT = ".*worker.*"
104"""Default pattern for searching execute machines in an HTCondor pool.
105"""
107_LOG = logging.getLogger(__name__)
110class HTCondorService(BaseWmsService):
111 """HTCondor version of WMS service."""
113 def prepare(self, config, generic_workflow, out_prefix=None):
114 """Convert generic workflow to an HTCondor DAG ready for submission.
116 Parameters
117 ----------
118 config : `lsst.ctrl.bps.BpsConfig`
119 BPS configuration that includes necessary submit/runtime
120 information.
121 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
122 The generic workflow (e.g., has executable name and arguments).
123 out_prefix : `str`
124 The root directory into which all WMS-specific files are written.
126 Returns
127 -------
128 workflow : `lsst.ctrl.bps.htcondor.HTCondorWorkflow`
129 HTCondor workflow ready to be run.
130 """
131 _LOG.debug("out_prefix = '%s'", out_prefix)
132 with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed HTCondor workflow creation"):
133 workflow = HTCondorWorkflow.from_generic_workflow(
134 config,
135 generic_workflow,
136 out_prefix,
137 f"{self.__class__.__module__}.{self.__class__.__name__}",
138 )
140 with time_this(
141 log=_LOG, level=logging.INFO, prefix=None, msg="Completed writing out HTCondor workflow"
142 ):
143 workflow.write(out_prefix)
144 return workflow
146 def submit(self, workflow):
147 """Submit a single HTCondor workflow.
149 Parameters
150 ----------
151 workflow : `lsst.ctrl.bps.BaseWorkflow`
152 A single HTCondor workflow to submit. run_id is updated after
153 successful submission to WMS.
154 """
155 dag = workflow.dag
157 ver = version.parse(htc_version())
158 if ver >= version.parse("8.9.3"):
159 sub = htc_create_submit_from_dag(dag.graph["dag_filename"], {})
160 else:
161 sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], {})
163 # For workflow portability, internal paths are all relative. Hence
164 # the DAG needs to be submitted to HTCondor from inside the submit
165 # directory.
166 with chdir(workflow.submit_path):
167 _LOG.info("Submitting from directory: %s", os.getcwd())
168 schedd_dag_info = htc_submit_dag(sub)
169 if schedd_dag_info:
170 write_dag_info(f"{dag.name}.info.json", schedd_dag_info)
172 _, dag_info = schedd_dag_info.popitem()
173 _, dag_ad = dag_info.popitem()
175 dag.run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}"
176 workflow.run_id = dag.run_id
177 else:
178 raise RuntimeError("Submission failed: unable to retrieve DAGMan job information")
180 def restart(self, wms_workflow_id):
181 """Restart a failed DAGMan workflow.
183 Parameters
184 ----------
185 wms_workflow_id : `str`
186 Id or path that can be used by the WMS service to find the run to restart.
188 Returns
189 -------
190 run_id : `str`
191 HTCondor id of the restarted DAGMan job. If restart failed, it will
192 be set to None.
193 run_name : `str`
194 Name of the restarted workflow. If restart failed, it will be set
195 to None.
196 message : `str`
197 A message describing any issues encountered during the restart.
198 If there were no issues, an empty string is returned.
199 """
200 wms_path, id_type = _wms_id_to_dir(wms_workflow_id)
201 if wms_path is None:
202 return (
203 None,
204 None,
205 (
206 f"workflow with run id '{wms_workflow_id}' not found. "
207 f"Hint: use run's submit directory as the id instead"
208 ),
209 )
211 if id_type in {WmsIdType.GLOBAL, WmsIdType.LOCAL}:
212 if not wms_path.is_dir():
213 return None, None, f"submit directory '{wms_path}' for run id '{wms_workflow_id}' not found."
215 _LOG.info("Restarting workflow from directory '%s'", wms_path)
216 rescue_dags = list(wms_path.glob("*.dag.rescue*"))
217 if not rescue_dags:
218 return None, None, f"HTCondor rescue DAG(s) not found in '{wms_path}'"
220 _LOG.info("Verifying that the workflow is not already in the job queue")
221 schedd_dag_info = condor_q(constraint=f'regexp("dagman$", Cmd) && Iwd == "{wms_path}"')
222 if schedd_dag_info:
223 _, dag_info = schedd_dag_info.popitem()
224 _, dag_ad = dag_info.popitem()
225 id_ = dag_ad["GlobalJobId"]
226 return None, None, f"Workflow already in the job queue (global job id: '{id_}')"
228 _LOG.info("Checking execution status of the workflow")
229 warn = False
230 dag_ad = read_dag_status(str(wms_path))
231 if dag_ad:
232 nodes_total = dag_ad.get("NodesTotal", 0)
233 if nodes_total != 0:
234 nodes_done = dag_ad.get("NodesDone", 0)
235 if nodes_total == nodes_done:
236 return None, None, "All jobs in the workflow finished successfully"
237 else:
238 warn = True
239 else:
240 warn = True
241 if warn:
242 _LOG.warning(
243 "Cannot determine the execution status of the workflow, continuing with restart regardless"
244 )
246 _LOG.info("Backing up select HTCondor files from previous run attempt")
247 htc_backup_files(wms_path, subdir="backups")
249 # For workflow portability, internal paths are all relative. Hence
250 # the DAG needs to be resubmitted to HTCondor from inside the submit
251 # directory.
252 _LOG.info("Adding workflow to the job queue")
253 run_id, run_name, message = None, None, ""
254 with chdir(wms_path):
255 try:
256 dag_path = next(wms_path.glob("*.dag.condor.sub"))
257 except StopIteration:
258 message = f"DAGMan submit description file not found in '{wms_path}'"
259 else:
260 sub = htc_create_submit_from_file(dag_path.name)
261 schedd_dag_info = htc_submit_dag(sub)
263 # Save select information about the DAGMan job to a file. Use
264 # the run name (available in the ClassAd) as the filename.
265 if schedd_dag_info:
266 dag_info = next(iter(schedd_dag_info.values()))
267 dag_ad = next(iter(dag_info.values()))
268 write_dag_info(f"{dag_ad['bps_run']}.info.json", schedd_dag_info)
269 run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}"
270 run_name = dag_ad["bps_run"]
271 else:
272 message = "DAGMan job information unavailable"
274 return run_id, run_name, message
276 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
277 """Query WMS for list of submitted WMS workflows/jobs.
279 This should be a quick lookup function to create a list of jobs for
280 other functions.
282 Parameters
283 ----------
284 wms_id : `int` or `str`, optional
285 Id or path that can be used by WMS service to look up job.
286 user : `str`, optional
287 User whose submitted jobs should be listed.
288 require_bps : `bool`, optional
289 Whether to require jobs returned in list to be bps-submitted jobs.
290 pass_thru : `str`, optional
291 Information to pass through to WMS.
292 is_global : `bool`, optional
293 If set, all job queues (and their histories) will be queried for
294 job information. Defaults to False which means that only the local
295 job queue will be queried.
297 Returns
298 -------
299 job_ids : `list` [`Any`]
300 Only job ids to be used by cancel and other functions. Typically
301 this means top-level jobs (i.e., not child jobs).
302 """
303 _LOG.debug(
304 "list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s",
305 wms_id,
306 user,
307 require_bps,
308 pass_thru,
309 is_global,
310 )
312 # Determine which Schedds will be queried for job information.
313 coll = htcondor.Collector()
315 schedd_ads = []
316 if is_global:
317 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
318 else:
319 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
321 # Construct appropriate constraint expression using provided arguments.
322 constraint = "False"
323 if wms_id is None:
324 if user is not None:
325 constraint = f'(Owner == "{user}")'
326 else:
327 schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id)
328 if cluster_id is not None:
329 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"
331 # If provided id is either a submission path or a global id,
332 # make sure the right Schedd will be queried regardless of
333 # 'is_global' value.
334 if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}:
335 schedd_ads = [schedd_ad]
336 if require_bps:
337 constraint += ' && (bps_isjob == "True")'
338 if pass_thru:
339 if "-forcex" in pass_thru:
340 pass_thru_2 = pass_thru.replace("-forcex", "")
341 if pass_thru_2 and not pass_thru_2.isspace():
342 constraint += f" && ({pass_thru_2})"
343 else:
344 constraint += f" && ({pass_thru})"
346 # Create a list of scheduler daemons which need to be queried.
347 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}
349 _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds))
350 results = condor_q(constraint=constraint, schedds=schedds)
352 # Prune child jobs where DAG job is in queue (i.e., aren't orphans).
353 job_ids = []
354 for schedd_name, job_info in results.items():
355 for job_id, job_ad in job_info.items():
356 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None"))
357 if "DAGManJobId" not in job_ad:
358 job_ids.append(job_ad.get("GlobalJobId", job_id))
359 else:
360 _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0")
361 _LOG.debug("\tin jobs.keys() = %s", job_info.keys())
362 if f"{job_ad['DAGManJobId']}.0" not in job_info: # orphaned job
363 job_ids.append(job_ad.get("GlobalJobId", job_id))
365 _LOG.debug("job_ids = %s", job_ids)
366 return job_ids
368 def report(
369 self,
370 wms_workflow_id=None,
371 user=None,
372 hist=0,
373 pass_thru=None,
374 is_global=False,
375 return_exit_codes=False,
376 ):
377 """Return run information based upon given constraints.
379 Parameters
380 ----------
381 wms_workflow_id : `str`, optional
382 Limit to specific run based on id.
383 user : `str`, optional
384 Limit results to runs for this user.
385 hist : `float`, optional
386 Limit history search to this many days. Defaults to 0.
387 pass_thru : `str`, optional
388 Constraints to pass through to HTCondor.
389 is_global : `bool`, optional
390 If set, all job queues (and their histories) will be queried for
391 job information. Defaults to False which means that only the local
392 job queue will be queried.
393 return_exit_codes : `bool`, optional
394 If set, return exit codes related to jobs with a
395 non-success status. Defaults to False, which means that only
396 the summary state is returned.
398 Only applicable in the context of a WMS with associated
399 handlers to return exit codes from jobs.
401 Returns
402 -------
403 runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
404 Information about runs from given job information.
405 message : `str`
406 Extra message for report command to print. This could be pointers
407 to documentation or to WMS specific commands.
408 """
409 if wms_workflow_id:
410 id_type = _wms_id_type(wms_workflow_id)
411 if id_type == WmsIdType.LOCAL:
412 schedulers = _locate_schedds(locate_all=is_global)
413 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers)
414 elif id_type == WmsIdType.GLOBAL:
415 schedulers = _locate_schedds(locate_all=True)
416 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers)
417 elif id_type == WmsIdType.PATH:
418 run_reports, message = _report_from_path(wms_workflow_id)
419 else:
420 run_reports, message = {}, "Invalid job id"
421 else:
422 schedulers = _locate_schedds(locate_all=is_global)
423 run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers)
424 _LOG.debug("report: %s, %s", run_reports, message)
426 return list(run_reports.values()), message
428 def cancel(self, wms_id, pass_thru=None):
429 """Cancel submitted workflows/jobs.
431 Parameters
432 ----------
433 wms_id : `str`
434 Id or path of job that should be canceled.
435 pass_thru : `str`, optional
436 Information to pass through to WMS.
438 Returns
439 -------
440 deleted : `bool`
441 Whether the deletion was successful. Currently, if there is any doubt
442 or any individual jobs were not deleted, False is returned.
443 message : `str`
444 Any message from WMS (e.g., error details).
445 """
446 _LOG.debug("Canceling wms_id = %s", wms_id)
448 schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id)
450 if cluster_id is None:
451 deleted = False
452 message = "invalid id"
453 else:
454 _LOG.debug(
455 "Canceling job managed by schedd_name = %s with cluster_id = %s",
456 schedd_ad["Name"],
457 cluster_id,
458 )
459 schedd = htcondor.Schedd(schedd_ad)
461 constraint = f"ClusterId == {cluster_id}"
462 if pass_thru is not None and "-forcex" in pass_thru:
463 pass_thru_2 = pass_thru.replace("-forcex", "")
464 if pass_thru_2 and not pass_thru_2.isspace():
465 constraint += f" && ({pass_thru_2})"
466 _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
467 results = schedd.act(htcondor.JobAction.RemoveX, constraint)
468 else:
469 if pass_thru:
470 constraint += f" && ({pass_thru})"
471 _LOG.debug("JobAction.Remove constraint = %s", constraint)
472 results = schedd.act(htcondor.JobAction.Remove, constraint)
473 _LOG.debug("Remove results: %s", results)
475 if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
476 deleted = True
477 message = ""
478 else:
479 deleted = False
480 if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
481 message = "no such bps job in batch queue"
482 else:
483 message = f"unknown problems deleting: {results}"
485 _LOG.debug("deleted: %s; message = %s", deleted, message)
486 return deleted, message
489class HTCondorWorkflow(BaseWmsWorkflow):
490 """Single HTCondor workflow.
492 Parameters
493 ----------
494 name : `str`
495 Unique name for Workflow used when naming files.
496 config : `lsst.ctrl.bps.BpsConfig`
497 BPS configuration that includes necessary submit/runtime information.
498 """
500 def __init__(self, name, config=None):
501 super().__init__(name, config)
502 self.dag = None
504 @classmethod
505 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
506 # Docstring inherited
507 htc_workflow = cls(generic_workflow.name, config)
508 htc_workflow.dag = HTCDag(name=generic_workflow.name)
510 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
511 htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
512 htc_workflow.dag.add_attribs(
513 {
514 "bps_wms_service": service_class,
515 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
516 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
517 "bps_job_summary": create_count_summary(generic_workflow.job_counts),
518 }
519 )
521 _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""})
522 if isinstance(tmp_template, str):
523 subdir_template = defaultdict(lambda: tmp_template)
524 else:
525 subdir_template = tmp_template
527 # Create all DAG jobs
528 site_values = {} # cache compute site specific values to reduce config lookups
529 for job_name in generic_workflow:
530 gwjob = generic_workflow.get_job(job_name)
531 if gwjob.compute_site not in site_values:
532 site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
533 htc_job = _create_job(
534 subdir_template[gwjob.label],
535 site_values[gwjob.compute_site],
536 generic_workflow,
537 gwjob,
538 out_prefix,
539 )
540 htc_workflow.dag.add_job(htc_job)
542 # Add job dependencies to the DAG
543 for job_name in generic_workflow:
544 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))
546 # If final job exists in generic workflow, create DAG final job
547 final = generic_workflow.get_final()
548 if final and isinstance(final, GenericWorkflowJob):
549 if final.compute_site and final.compute_site not in site_values:
550 site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
551 final_htjob = _create_job(
552 subdir_template[final.label],
553 site_values[final.compute_site],
554 generic_workflow,
555 final,
556 out_prefix,
557 )
558 if "post" not in final_htjob.dagcmds:
559 final_htjob.dagcmds["post"] = (
560 f"{os.path.dirname(__file__)}/final_post.sh {final.name} $DAG_STATUS $RETURN"
561 )
562 htc_workflow.dag.add_final_job(final_htjob)
563 elif final and isinstance(final, GenericWorkflow):
564 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
565 elif final:
566 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
568 return htc_workflow
570 def write(self, out_prefix):
571 """Output HTCondor DAGMan files needed for workflow submission.
573 Parameters
574 ----------
575 out_prefix : `str`
576 Directory prefix for HTCondor files.
577 """
578 self.submit_path = out_prefix
579 os.makedirs(out_prefix, exist_ok=True)
581 # Write down the workflow in HTCondor format.
582 self.dag.write(out_prefix, "jobs/{self.label}")
585def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix):
586 """Convert GenericWorkflow job nodes to DAG jobs.
588 Parameters
589 ----------
590 subdir_template : `str`
591 Template for making subdirs.
592 site_values : `dict`
593 Site-specific values.
594 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
595 Generic workflow that is being converted.
596 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
597 The generic job to convert to an HTCondor job.
598 out_prefix : `str`
599 Directory prefix for HTCondor files.
601 Returns
602 -------
603 htc_job : `lsst.ctrl.bps.htcondor.HTCJob`
604 The HTCondor job equivalent to the given generic job.
605 """
606 htc_job = HTCJob(gwjob.name, label=gwjob.label)
608 curvals = defaultdict(str)
609 curvals["label"] = gwjob.label
610 if gwjob.tags:
611 curvals.update(gwjob.tags)
613 subdir = subdir_template.format_map(curvals)
614 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"
616 htc_job_cmds = {
617 "universe": "vanilla",
618 "should_transfer_files": "YES",
619 "when_to_transfer_output": "ON_EXIT_OR_EVICT",
620 "transfer_output_files": '""', # Set to empty string to disable
621 "transfer_executable": "False",
622 "getenv": "True",
623 # Exceeding memory sometimes triggers SIGBUS or SIGSEGV errors. Tell
624 # HTCondor to put on hold any job which exited by a signal.
625 "on_exit_hold": "ExitBySignal == true",
626 "on_exit_hold_reason": 'strcat("Job raised a signal ", string(ExitSignal), ". ", '
627 '"Handling signal as if job has gone over memory limit.")',
628 "on_exit_hold_subcode": "34",
629 }
631 htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob))
633 # job stdout, stderr, htcondor user log.
634 for key in ("output", "error", "log"):
635 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
636 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
638 htc_job_cmds.update(
639 _handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], out_prefix)
640 )
642 # Add the job cmds dict to the job object.
643 htc_job.add_job_cmds(htc_job_cmds)
645 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))
647 # Add job attributes to job.
648 _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
649 htc_job.add_job_attrs(gwjob.attrs)
650 htc_job.add_job_attrs(site_values["attrs"])
651 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
652 htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})
654 return htc_job
657def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
658 """Translate the job data that are one to one mapping
660 Parameters
661 ----------
662 cached_vals : `dict` [`str`, `Any`]
663 Config values common to jobs with same label.
664 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
665 Generic workflow that contains the job being converted.
666 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
667 Generic workflow job to be converted.
669 Returns
670 -------
671 htc_job_commands : `dict` [`str`, `Any`]
672 Contains commands which can appear in the HTCondor submit description
673 file.
674 """
675 # Values in the job script that are just name mappings.
676 job_translation = {
677 "mail_to": "notify_user",
678 "when_to_mail": "notification",
679 "request_cpus": "request_cpus",
680 "priority": "priority",
681 "category": "category",
682 "accounting_group": "accounting_group",
683 "accounting_user": "accounting_group_user",
684 }
686 jobcmds = {}
687 for gwkey, htckey in job_translation.items():
688 jobcmds[htckey] = getattr(gwjob, gwkey, None)
690 # If accounting info was not set explicitly, use site settings if any.
691 if not gwjob.accounting_group:
692 jobcmds["accounting_group"] = cached_vals.get("accountingGroup")
693 if not gwjob.accounting_user:
694 jobcmds["accounting_group_user"] = cached_vals.get("accountingUser")
696 # job commands that need modification
697 if gwjob.number_of_retries:
698 jobcmds["max_retries"] = f"{gwjob.number_of_retries}"
700 if gwjob.retry_unless_exit:
701 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"
703 if gwjob.request_disk:
704 jobcmds["request_disk"] = f"{gwjob.request_disk}MB"
706 if gwjob.request_memory:
707 jobcmds["request_memory"] = f"{gwjob.request_memory}"
709 if gwjob.memory_multiplier:
710 # Do not use try-except! At the moment, BpsConfig returns an empty
711 # string if it does not contain the key.
712 memory_limit = cached_vals["memoryLimit"]
713 if not memory_limit:
714 raise RuntimeError(
715 "Memory autoscaling enabled, but automatic detection of the memory limit "
716 "failed; setting it explicitly with 'memoryLimit' or changing worker node "
717 "search pattern 'executeMachinesPattern' might help."
718 )
720 # Set maximal amount of memory job can ask for.
721 #
722 # The check below assumes that 'memory_limit' was set to a value which
723 # realistically reflects actual physical limitations of a given compute
724 # resource.
725 memory_max = memory_limit
726 if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit:
727 memory_max = gwjob.request_memory_max
729 # Make job ask for more memory each time it failed due to insufficient
730 # memory requirements.
731 jobcmds["request_memory"] = _create_request_memory_expr(
732 gwjob.request_memory, gwjob.memory_multiplier, memory_max
733 )
735 # Periodically release jobs which are being held due to exceeding
736 # memory. Stop doing that (by removing the job from the HTCondor queue)
737 # after the maximal number of retries has been reached or the job was
738 # already run at maximal allowed memory.
739 jobcmds["periodic_release"] = _create_periodic_release_expr(
740 gwjob.request_memory, gwjob.memory_multiplier, memory_max
741 )
742 jobcmds["periodic_remove"] = _create_periodic_remove_expr(
743 gwjob.request_memory, gwjob.memory_multiplier, memory_max
744 )
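# Illustrative arithmetic only (hypothetical values, not from any real
# config): with request_memory=2048 (MB), memory_multiplier=2.0 and
# memory_max=8192, the generated expressions let the job request about
# 2048 MB on its first attempt, 4096 MB after the first memory-related
# hold, and 8192 MB (the cap) on later attempts; once the cap or the
# retry limit is reached, the periodic_remove expression takes the job
# out of the queue. The exact ClassAd expressions are produced by the
# _create_*_expr helpers further down in this module.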
746 # Assume concurrency_limit implemented using HTCondor concurrency limits.
747 # May need to move to special site-specific implementation if sites use
748 # other mechanisms.
749 if gwjob.concurrency_limit:
750 jobcmds["concurrency_limit"] = gwjob.concurrency_limit
752 # Handle command line
753 if gwjob.executable.transfer_executable:
754 jobcmds["transfer_executable"] = "True"
755 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
756 else:
757 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)
759 if gwjob.arguments:
760 arguments = gwjob.arguments
761 arguments = _replace_cmd_vars(arguments, gwjob)
762 arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob)
763 arguments = _fix_env_var_syntax(arguments)
764 jobcmds["arguments"] = arguments
766 # Add extra "pass-thru" job commands
767 if gwjob.profile:
768 for key, val in gwjob.profile.items():
769 jobcmds[key] = htc_escape(val)
770 for key, val in cached_vals["profile"].items():
771 jobcmds[key] = htc_escape(val)
773 return jobcmds
776def _translate_dag_cmds(gwjob):
777 """Translate job values into DAGMan commands.
779 Parameters
780 ----------
781 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
782 Job containing values to be translated.
784 Returns
785 -------
786 dagcmds : `dict` [`str`, `Any`]
787 DAGMan commands for the job.
788 """
789 # Values in the dag script that are just name mappings.
790 dag_translation = {"abort_on_value": "abort_dag_on", "abort_return_value": "abort_exit"}
792 dagcmds = {}
793 for gwkey, htckey in dag_translation.items():
794 dagcmds[htckey] = getattr(gwjob, gwkey, None)
796 # Still to be coded: vars "pre_cmdline", "post_cmdline"
797 return dagcmds
800def _fix_env_var_syntax(oldstr):
801 """Change ENV place holders to HTCondor Env var syntax.
803 Parameters
804 ----------
805 oldstr : `str`
806 String in which environment variable syntax is to be fixed.
808 Returns
809 -------
810 newstr : `str`
811 Given string with environment variable syntax fixed.
812 """
813 newstr = oldstr
814 for key in re.findall(r"<ENV:([^>]+)>", oldstr):
815 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
816 return newstr
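# Minimal doctest-style sketch (hypothetical input string) of the rewrite
# performed by the regex above:
#
#     >>> _fix_env_var_syntax("<ENV:HOME>/repo --log <ENV:LSST_LOG>/run.log")
#     '$ENV(HOME)/repo --log $ENV(LSST_LOG)/run.log'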
819def _replace_file_vars(use_shared, arguments, workflow, gwjob):
820 """Replace file placeholders in command line arguments with correct
821 physical file names.
823 Parameters
824 ----------
825 use_shared : `bool`
826 Whether HTCondor can assume shared filesystem.
827 arguments : `str`
828 Arguments string in which to replace file placeholders.
829 workflow : `lsst.ctrl.bps.GenericWorkflow`
830 Generic workflow that contains file information.
831 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
832 The job corresponding to the arguments.
834 Returns
835 -------
836 arguments : `str`
837 Given arguments string with file placeholders replaced.
838 """
839 # Replace input file placeholders with paths.
840 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
841 if not gwfile.wms_transfer:
842 # Must assume full URI if in command line and told WMS is not
843 # responsible for transferring file.
844 uri = gwfile.src_uri
845 elif use_shared:
846 if gwfile.job_shared:
847 # Have shared filesystems and jobs can share file.
848 uri = gwfile.src_uri
849 else:
850 # Taking advantage of inside knowledge. Not future-proof.
851 # Temporary fix until there is a job wrapper that pulls
852 # files within the job.
853 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml":
854 uri = "butler.yaml"
855 else:
856 uri = os.path.basename(gwfile.src_uri)
857 else: # Using push transfer
858 uri = os.path.basename(gwfile.src_uri)
859 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
861 # Replace output file placeholders with paths.
862 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
863 if not gwfile.wms_transfer:
864 # Must assume full URI if in command line and told WMS is not
865 # responsible for transferring file.
866 uri = gwfile.src_uri
867 elif use_shared:
868 if gwfile.job_shared:
869 # Have shared filesystems and jobs can share file.
870 uri = gwfile.src_uri
871 else:
872 uri = os.path.basename(gwfile.src_uri)
873 else: # Using push transfer
874 uri = os.path.basename(gwfile.src_uri)
875 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
876 return arguments
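# Sketch of the substitution rules above for a hypothetical input file
# named "calexp" with src_uri "/repo/data/calexp.fits":
#
#   wms_transfer is False                -> "<FILE:calexp>" becomes "/repo/data/calexp.fits"
#   shared filesystem and job_shared     -> "/repo/data/calexp.fits"
#   push transfer (or no job sharing)    -> "calexp.fits"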
879def _replace_cmd_vars(arguments, gwjob):
880 """Replace format-style placeholders in arguments.
882 Parameters
883 ----------
884 arguments : `str`
885 Arguments string in which to replace placeholders.
886 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
887 Job containing values to be used to replace placeholders
888 (in particular gwjob.cmdvals).
890 Returns
891 -------
892 arguments : `str`
893 Given arguments string with placeholders replaced.
894 """
895 try:
896 arguments = arguments.format(**gwjob.cmdvals)
897 except (KeyError, TypeError): # TypeError in case None instead of {}
898 _LOG.error(
899 "Could not replace command variables:\narguments: %s\ncmdvals: %s", arguments, gwjob.cmdvals
900 )
901 raise
902 return arguments
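# Minimal sketch of the format-style replacement (hypothetical argument
# string and cmdvals):
#
#     >>> "run --qgraph {qgraphFile} --id {qgraphId}".format(
#     ...     **{"qgraphFile": "a.qgraph", "qgraphId": "1234"}
#     ... )
#     'run --qgraph a.qgraph --id 1234'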
905def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str):
906 """Add job input files from generic workflow to job.
908 Parameters
909 ----------
910 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
911 The generic workflow (e.g., has executable name and arguments).
912 job_name : `str`
913 Unique name for the job.
914 use_shared : `bool`
915 Whether job has access to files via shared filesystem.
916 out_prefix : `str`
917 The root directory into which all WMS-specific files are written.
919 Returns
920 -------
921 htc_commands : `dict` [`str`, `str`]
922 HTCondor commands for the job submission script.
923 """
924 htc_commands = {}
925 inputs = []
926 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
927 _LOG.debug("src_uri=%s", gwf_file.src_uri)
929 uri = Path(gwf_file.src_uri)
931 # Note if use_shared and job_shared, don't need to transfer file.
933 if not use_shared: # Copy file using push to job
934 inputs.append(str(uri.relative_to(out_prefix)))
935 elif not gwf_file.job_shared: # Jobs require own copy
936 # if using shared filesystem, but still need copy in job. Use
937 # HTCondor's curl plugin for a local copy.
939 # Execution butler is represented as a directory which the
940 # curl plugin does not handle. Taking advantage of inside
941 # knowledge for a temporary fix until there is a job wrapper
942 # that pulls files within the job.
943 if gwf_file.name == "butlerConfig":
944 # The execution butler directory doesn't normally exist until
945 # the submit phase so checking for suffix instead of using
946 # is_dir(). If other non-YAML files exist they would have a
947 # different gwf_file.name.
948 if uri.suffix == ".yaml": # Single file, so just copy.
949 inputs.append(f"file://{uri}")
950 else:
951 inputs.append(f"file://{uri / 'butler.yaml'}")
952 inputs.append(f"file://{uri / 'gen3.sqlite3'}")
953 elif uri.is_dir():
954 raise RuntimeError(
955 f"HTCondor plugin cannot transfer directories locally within job {gwf_file.src_uri}"
956 )
957 else:
958 inputs.append(f"file://{uri}")
960 if inputs:
961 htc_commands["transfer_input_files"] = ",".join(inputs)
962 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
963 return htc_commands
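# Sketch of a possible result (hypothetical paths), assuming push
# transfer (use_shared=False) with out_prefix="/submit/run1" and two
# transferable inputs under that prefix:
#
#     {"transfer_input_files": "inputs/a.qgraph,inputs/butler.yaml"}
#
# i.e. paths are made relative to the submit directory before being
# handed to HTCondor's transfer_input_files command.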
966def _report_from_path(wms_path):
967 """Gather run information from a given run directory.
969 Parameters
970 ----------
971 wms_path : `str`
972 The directory containing the submit side files (e.g., HTCondor files).
974 Returns
975 -------
976 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
977 Run information for the detailed report. The key is the HTCondor id
978 and the value is a collection of report information for that run.
979 message : `str`
980 Message to be printed with the summary report.
981 """
982 wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
983 if wms_workflow_id == MISSING_ID:
984 run_reports = {}
985 else:
986 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
987 return run_reports, message
990def _report_from_id(wms_workflow_id, hist, schedds=None):
991 """Gather run information using workflow id.
993 Parameters
994 ----------
995 wms_workflow_id : `str`
996 Limit to specific run based on id.
997 hist : `float`
998 Limit history search to this many days.
999 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
1000 HTCondor schedulers to query for job information. If None
1001 (default), all queries will be run against the local scheduler only.
1003 Returns
1004 -------
1005 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
1006 Run information for the detailed report. The key is the HTCondor id
1007 and the value is a collection of report information for that run.
1008 message : `str`
1009 Message to be printed with the summary report.
1010 """
1011 messages = []
1013 # Collect information about the job by querying HTCondor schedd and
1014 # HTCondor history.
1015 schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds)
1016 if len(schedd_dag_info) == 1:
1017 # Extract the DAG info without altering the results of the query.
1018 schedd_name = next(iter(schedd_dag_info))
1019 dag_id = next(iter(schedd_dag_info[schedd_name]))
1020 dag_ad = schedd_dag_info[schedd_name][dag_id]
1022 # If the provided workflow id does not correspond to the one extracted
1023 # from the DAGMan log file in the submit directory, rerun the query
1024 # with the id found in the file.
1025 #
1026 # This is to cover the situation in which the user provided the old job
1027 # id of a restarted run.
1028 try:
1029 path_dag_id, path_dag_ad = read_dag_log(dag_ad["Iwd"])
1030 except FileNotFoundError as exc:
1031 # At the moment missing DAGMan log is pretty much a fatal error.
1032 # So empty the DAG info to finish early (see the if statement
1033 # below).
1034 schedd_dag_info.clear()
1035 messages.append(f"Cannot create the report for '{dag_id}': {exc}")
1036 else:
1037 if path_dag_id != dag_id:
1038 schedd_dag_info = _get_info_from_schedd(path_dag_id, hist, schedds)
1039 messages.append(
1040 f"WARNING: Found newer workflow executions in same submit directory as id '{dag_id}'. "
1041 "This normally occurs when a run is restarted. The report shown is for the most "
1042 f"recent status with run id '{path_dag_id}'"
1043 )
1045 if len(schedd_dag_info) == 0:
1046 run_reports = {}
1047 elif len(schedd_dag_info) == 1:
1048 _, dag_info = schedd_dag_info.popitem()
1049 dag_id, dag_ad = dag_info.popitem()
1051 # Create a mapping between jobs and their classads. The keys will
1052 # be of format 'ClusterId.ProcId'.
1053 job_info = {dag_id: dag_ad}
1055 # Find jobs (nodes) belonging to that DAGMan job.
1056 job_constraint = f"DAGManJobId == {int(float(dag_id))}"
1057 schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds)
1058 if schedd_job_info:
1059 _, node_info = schedd_job_info.popitem()
1060 job_info.update(node_info)
1062 # Collect additional pieces of information about jobs using HTCondor
1063 # files in the submission directory.
1064 _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"])
1065 _update_jobs(job_info, path_jobs)
1066 if message:
1067 messages.append(message)
1068 run_reports = _create_detailed_report_from_jobs(dag_id, job_info)
1069 else:
1070 ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()]
1071 message = (
1072 f"More than one job matches id '{wms_workflow_id}', "
1073 f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids"
1074 )
1075 messages.append(message)
1076 run_reports = {}
1078 message = "\n".join(messages)
1079 return run_reports, message
1082def _get_info_from_schedd(wms_workflow_id, hist, schedds):
1083 """Gather run information from HTCondor.
1085 Parameters
1086 ----------
1087 wms_workflow_id : `str`
1088 Limit to specific run based on id.
1089 hist : `int`
1090 Limit history search to this many days.
1091 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
1092 HTCondor schedulers to query for job information. If None
1093 (default), all queries will be run against the local scheduler only.
1095 Returns
1096 -------
1097 schedd_dag_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, `Any`]]]
1098 Information about jobs satisfying the search criteria where for each
1099 Scheduler, local HTCondor job ids are mapped to their respective
1100 classads.
1101 """
1102 dag_constraint = 'regexp("dagman$", Cmd)'
1103 try:
1104 cluster_id = int(float(wms_workflow_id))
1105 except ValueError:
1106 dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"'
1107 else:
1108 dag_constraint += f" && ClusterId == {cluster_id}"
1110 # With the current implementation of the condor_* functions the query
1111 # will always return only one match per Scheduler.
1112 #
1113 # Even in the highly unlikely situation where HTCondor history (which
1114 # condor_search queries too) is long enough to have jobs from before
1115 # the cluster ids were rolled over (and as a result there is more than
1116 # one job with the same cluster id) they will not show up in
1117 # the results.
1118 schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds)
1119 return schedd_dag_info
1122def _get_info_from_path(wms_path):
1123 """Gather run information from a given run directory.
1125 Parameters
1126 ----------
1127 wms_path : `str`
1128 Directory containing HTCondor files.
1130 Returns
1131 -------
1132 wms_workflow_id : `str`
1133 The run id, which is a DAGMan job id.
1134 jobs : `dict` [`str`, `dict` [`str`, `Any`]]
1135 Information about jobs read from files in the given directory.
1136 The key is the HTCondor id and the value is a dictionary of HTCondor
1137 keys and values.
1138 message : `str`
1139 Message to be printed with the summary report.
1140 """
1141 messages = []
1142 try:
1143 wms_workflow_id, jobs = read_dag_log(wms_path)
1144 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
1145 _update_jobs(jobs, read_node_status(wms_path))
1146 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)
1148 # Add more info for the DAGMan job
1149 job = jobs[wms_workflow_id]
1150 job.update(read_dag_status(wms_path))
1152 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
1153 if "bps_run" not in job:
1154 _add_run_info(wms_path, job)
1156 message = htc_check_dagman_output(wms_path)
1157 if message:
1158 messages.append(message)
1159 _LOG.debug(
1160 "_get_info: id = %s, total_jobs = %s", wms_workflow_id, jobs[wms_workflow_id]["total_jobs"]
1161 )
1163 # Add extra pieces of information which cannot be found in HTCondor
1164 # generated files like 'GlobalJobId'.
1165 #
1166 # Do not treat absence of this file as a serious error. Neither runs
1167 # submitted with earlier versions of the plugin nor the runs submitted
1168 # with the Pegasus plugin will have it at the moment. However, once enough
1169 # time passes and the Pegasus plugin has its own report() method
1170 # (instead of sneakily using HTCondor's one), the lack of that file
1171 # should be treated as seriously as lack of any other file.
1172 try:
1173 job_info = read_dag_info(wms_path)
1174 except FileNotFoundError as exc:
1175 message = f"Warn: Some information may not be available: {exc}"
1176 messages.append(message)
1177 else:
1178 schedd_name = next(iter(job_info))
1179 job_ad = next(iter(job_info[schedd_name].values()))
1180 job.update(job_ad)
1181 except FileNotFoundError:
1182 message = f"Could not find HTCondor files in '{wms_path}'"
1183 _LOG.warning(message)
1184 messages.append(message)
1185 wms_workflow_id = MISSING_ID
1186 jobs = {}
1188 message = "\n".join([msg for msg in messages if msg])
1189 return wms_workflow_id, jobs, message
1192def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
1193 """Gather run information to be used in generating summary reports.
1195 Parameters
1196 ----------
1197 wms_workflow_id : `str`
1198 The run id to create the report for.
1199 jobs : `dict` [`str`, `dict` [`str`, Any]]
1200 Mapping HTCondor job id to job information.
1202 Returns
1203 -------
1204 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
1205 Run information for the detailed report. The key is the given HTCondor
1206 id and the value is a collection of report information for that run.
1207 """
1208 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
1209 dag_job = jobs.pop(wms_workflow_id)
1210 report = WmsRunReport(
1211 wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}",
1212 global_wms_id=dag_job.get("GlobalJobId", "MISS"),
1213 path=dag_job["Iwd"],
1214 label=dag_job.get("bps_job_label", "MISS"),
1215 run=dag_job.get("bps_run", "MISS"),
1216 project=dag_job.get("bps_project", "MISS"),
1217 campaign=dag_job.get("bps_campaign", "MISS"),
1218 payload=dag_job.get("bps_payload", "MISS"),
1219 operator=_get_owner(dag_job),
1220 run_summary=_get_run_summary(dag_job),
1221 state=_htc_status_to_wms_state(dag_job),
1222 jobs=[],
1223 total_number_jobs=dag_job["total_jobs"],
1224 job_state_counts=dag_job["state_counts"],
1225 exit_code_summary=_get_exit_code_summary(jobs),
1226 )
1228 for job_id, job_info in jobs.items():
1229 try:
1230 job_report = WmsJobReport(
1231 wms_id=job_id,
1232 name=job_info.get("DAGNodeName", job_id),
1233 label=job_info.get("bps_job_label", pegasus_name_to_label(job_info["DAGNodeName"])),
1234 state=_htc_status_to_wms_state(job_info),
1235 )
1236 if job_report.label == "init":
1237 job_report.label = "pipetaskInit"
1238 report.jobs.append(job_report)
1239 except KeyError as ex:
1240 _LOG.error("Job missing key '%s': %s", str(ex), job_info)
1241 raise
1243 # Add the removed entry to restore the original content of the dictionary.
1244 # The ordering of keys will be changed permanently though.
1245 jobs.update({wms_workflow_id: dag_job})
1247 run_reports = {report.wms_id: report}
1248 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
1249 return run_reports
1252def _summary_report(user, hist, pass_thru, schedds=None):
1253 """Gather run information to be used in generating summary reports.
1255 Parameters
1256 ----------
1257 user : `str`
1258 Run lookup restricted to given user.
1259 hist : `float`
1260 How many previous days to search for run information.
1261 pass_thru : `str`
1262 Advanced users can define the HTCondor constraint to be used
1263 when searching queue and history.
1265 Returns
1266 -------
1267 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
1268 Run information for the summary report. The keys are HTCondor ids and
1269 the values are collections of report information for each run.
1270 message : `str`
1271 Message to be printed with the summary report.
1272 """
1273 # only doing summary report so only look for dagman jobs
1274 if pass_thru:
1275 constraint = pass_thru
1276 else:
1277 # Notes:
1278 # * bps_isjob == 'True' isn't getting set for DAG jobs that are
1279 # manually restarted.
1280 # * Any job with DAGManJobID isn't a DAG job
1281 constraint = 'bps_isjob == "True" && JobUniverse == 7'
1282 if user:
1283 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'
1285 job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds)
1287 # Have list of DAGMan jobs, need to get run_report info.
1288 run_reports = {}
1289 for jobs in job_info.values():
1290 for job_id, job in jobs.items():
1291 total_jobs, state_counts = _get_state_counts_from_dag_job(job)
1292 # If didn't get from queue information (e.g., Kerberos bug),
1293 # try reading from file.
1294 if total_jobs == 0:
1295 try:
1296 job.update(read_dag_status(job["Iwd"]))
1297 total_jobs, state_counts = _get_state_counts_from_dag_job(job)
1298 except StopIteration:
1299 pass # don't kill report if can't find HTCondor files
1301 if "bps_run" not in job:
1302 _add_run_info(job["Iwd"], job)
1303 report = WmsRunReport(
1304 wms_id=job_id,
1305 global_wms_id=job["GlobalJobId"],
1306 path=job["Iwd"],
1307 label=job.get("bps_job_label", "MISS"),
1308 run=job.get("bps_run", "MISS"),
1309 project=job.get("bps_project", "MISS"),
1310 campaign=job.get("bps_campaign", "MISS"),
1311 payload=job.get("bps_payload", "MISS"),
1312 operator=_get_owner(job),
1313 run_summary=_get_run_summary(job),
1314 state=_htc_status_to_wms_state(job),
1315 jobs=[],
1316 total_number_jobs=total_jobs,
1317 job_state_counts=state_counts,
1318 )
1319 run_reports[report.global_wms_id] = report
1321 return run_reports, ""
1324def _add_run_info(wms_path, job):
1325 """Find BPS run information elsewhere for runs without bps attributes.
1327 Parameters
1328 ----------
1329 wms_path : `str`
1330 Path to submit files for the run.
1331 job : `dict` [`str`, `Any`]
1332 HTCondor dag job information.
1334 Raises
1335 ------
1336 StopIteration
1337 Raised if the file it is looking for cannot be found. Permission
1338 errors are caught and the job's run is marked with an error.
1339 """
1340 path = Path(wms_path) / "jobs"
1341 try:
1342 subfile = next(path.glob("**/*.sub"))
1343 except (StopIteration, PermissionError):
1344 job["bps_run"] = "Unavailable"
1345 else:
1346 _LOG.debug("_add_run_info: subfile = %s", subfile)
1347 try:
1348 with open(subfile, encoding="utf-8") as fh:
1349 for line in fh:
1350 if line.startswith("+bps_"):
1351 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
1352 if m:
1353 _LOG.debug("Matching line: %s", line)
1354 job[m.group(1)] = m.group(2).replace('"', "")
1355 else:
1356 _LOG.debug("Could not parse attribute: %s", line)
1357 except PermissionError:
1358 job["bps_run"] = "PermissionError"
1359 _LOG.debug("After adding job = %s", job)
1362def _get_owner(job):
1363 """Get the owner of a dag job.
1365 Parameters
1366 ----------
1367 job : `dict` [`str`, `Any`]
1368 HTCondor dag job information.
1370 Returns
1371 -------
1372 owner : `str`
1373 Owner of the dag job.
1374 """
1375 owner = job.get("bps_operator", None)
1376 if not owner:
1377 owner = job.get("Owner", None)
1378 if not owner:
1379 _LOG.warning("Could not get Owner from htcondor job: %s", job)
1380 owner = "MISS"
1381 return owner
1384def _get_run_summary(job):
1385 """Get the run summary for a job.
1387 Parameters
1388 ----------
1389 job : `dict` [`str`, `Any`]
1390 HTCondor dag job information.
1392 Returns
1393 -------
1394 summary : `str`
1395 Number of jobs per PipelineTask label in approximate pipeline order.
1396 Format: <label>:<count>[;<label>:<count>]+
1397 """
1398 summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
1399 if not summary:
1400 summary, _ = summary_from_dag(job["Iwd"])
1401 if not summary:
1402 _LOG.warning("Could not get run summary for htcondor job: %s", job)
1403 _LOG.debug("_get_run_summary: summary=%s", summary)
1405 # Workaround: sometimes "init" is used instead of "pipetaskInit".
1406 summary = summary.replace("init:", "pipetaskInit:")
1408 if "pegasus_version" in job and "pegasus" not in summary:
1409 summary += ";pegasus:0"
1411 return summary
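# Example of the summary format described above (hypothetical labels and
# counts): "pipetaskInit:1;isr:100;characterizeImage:100;finalJob:1"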
1414def _get_exit_code_summary(jobs):
1415 """Get the exit code summary for a run.
1417 Parameters
1418 ----------
1419 jobs : `dict` [`str`, `dict` [`str`, Any]]
1420 Mapping HTCondor job id to job information.
1422 Returns
1423 -------
1424 summary : `dict` [`str`, `list` [`int`]]
1425 Jobs' exit codes per job label.
1426 """
1427 summary = {}
1428 for job_id, job_ad in jobs.items():
1429 job_label = job_ad["bps_job_label"]
1430 summary.setdefault(job_label, [])
1431 try:
1432 exit_code = 0
1433 job_status = job_ad["JobStatus"]
1434 match job_status:
1435 case JobStatus.COMPLETED:
1436 exit_code = job_ad["ExitSignal"] if job_ad["ExitBySignal"] else job_ad["ExitCode"]
1437 case JobStatus.HELD:
1438 exit_code = job_ad["ExitSignal"] if job_ad["ExitBySignal"] else job_ad["HoldReasonCode"]
1439 case (
1440 JobStatus.IDLE
1441 | JobStatus.RUNNING
1442 | JobStatus.REMOVED
1443 | JobStatus.TRANSFERRING_OUTPUT
1444 | JobStatus.SUSPENDED
1445 ):
1446 pass
1447 case _:
1448 _LOG.debug("Unknown 'JobStatus' value ('%s') in classad for job '%s'", job_status, job_id)
1449 if exit_code != 0:
1450 summary[job_label].append(exit_code)
1451 except KeyError as ex:
1452 _LOG.debug("Attribute '%s' not found in the classad for job '%s'", ex, job_id)
1453 return summary
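# Sketch of the returned mapping (hypothetical labels and codes): only
# non-zero exit codes (or hold reason codes for held jobs) are recorded,
# grouped per job label, e.g. {"isr": [1, 1, 34], "characterizeImage": []}.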
1456def _get_state_counts_from_jobs(wms_workflow_id, jobs):
1457 """Count number of jobs per WMS state.
1459 Parameters
1460 ----------
1461 wms_workflow_id : `str`
1462 HTCondor job id.
1463 jobs : `dict` [`str`, `Any`]
1464 HTCondor dag job information.
1466 Returns
1467 -------
1468 total_count : `int`
1469 Total number of dag nodes.
1470 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
1471 Keys are the different WMS states and values are counts of jobs
1472 that are in that WMS state.
1473 """
1474 state_counts = dict.fromkeys(WmsStates, 0)
1476 for jid, jinfo in jobs.items():
1477 if jid != wms_workflow_id:
1478 state_counts[_htc_status_to_wms_state(jinfo)] += 1
1480 total_counted = sum(state_counts.values())
1481 if "NodesTotal" in jobs[wms_workflow_id]:
1482 total_count = jobs[wms_workflow_id]["NodesTotal"]
1483 else:
1484 total_count = total_counted
1486 state_counts[WmsStates.UNREADY] += total_count - total_counted
1488 return total_count, state_counts
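# Worked example with hypothetical numbers: if the DAG ad reports
# NodesTotal=10 but only 7 node jobs appear in the logs (say 5 SUCCEEDED,
# 1 RUNNING, 1 FAILED), the remaining 10 - 7 = 3 jobs are added to the
# UNREADY count so the totals stay consistent.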
1491def _get_state_counts_from_dag_job(job):
1492 """Count number of jobs per WMS state.
1494 Parameters
1495 ----------
1496 job : `dict` [`str`, `Any`]
1497 HTCondor dag job information.
1499 Returns
1500 -------
1501 total_count : `int`
1502 Total number of dag nodes.
1503 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
1504 Keys are the different WMS states and values are counts of jobs
1505 that are in that WMS state.
1506 """
1507 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
1508 state_counts = dict.fromkeys(WmsStates, 0)
1509 if "DAG_NodesReady" in job:
1510 state_counts = {
1511 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
1512 WmsStates.READY: job.get("DAG_NodesReady", 0),
1513 WmsStates.HELD: job.get("JobProcsHeld", 0),
1514 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
1515 WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
1516 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0),
1517 }
1518 total_jobs = job.get("DAG_NodesTotal")
1519 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
1520 elif "NodesFailed" in job:
1521 state_counts = {
1522 WmsStates.UNREADY: job.get("NodesUnready", 0),
1523 WmsStates.READY: job.get("NodesReady", 0),
1524 WmsStates.HELD: job.get("JobProcsHeld", 0),
1525 WmsStates.SUCCEEDED: job.get("NodesDone", 0),
1526 WmsStates.FAILED: job.get("NodesFailed", 0),
1527 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0),
1528 }
1529 try:
1530 total_jobs = job.get("NodesTotal")
1531 except KeyError as ex:
1532 _LOG.error("Job missing %s. job = %s", str(ex), job)
1533 raise
1534 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
1535 else:
1536 # With Kerberos job auth and the Kerberos bug, a warning here would be
1537 # printed for every DAG, so log at debug level instead.
1538 _LOG.debug("Can't get job state counts %s", job["Iwd"])
1539 total_jobs = 0
1541 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
1542 return total_jobs, state_counts
1545def _htc_status_to_wms_state(job):
1546 """Convert HTCondor job status to generic wms state.
1548 Parameters
1549 ----------
1550 job : `dict` [`str`, `Any`]
1551 HTCondor job information.
1553 Returns
1554 -------
1555 wms_state : `WmsStates`
1556 The WmsState equivalent to the given job's status.
1557 """
1558 wms_state = WmsStates.MISFIT
1559 if "JobStatus" in job:
1560 wms_state = _htc_job_status_to_wms_state(job)
1561 elif "NodeStatus" in job:
1562 wms_state = _htc_node_status_to_wms_state(job)
1563 return wms_state
1566def _htc_job_status_to_wms_state(job):
1567 """Convert HTCondor job status to generic wms state.
1569 Parameters
1570 ----------
1571 job : `dict` [`str`, `Any`]
1572 HTCondor job information.
1574 Returns
1575 -------
1576 wms_state : `lsst.ctrl.bps.WmsStates`
1577 The WmsState equivalent to the given job's status.
1578 """
1579 _LOG.debug(
1580 "htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], type(job["JobStatus"])
1581 )
1582 job_status = int(job["JobStatus"])
1583 wms_state = WmsStates.MISFIT
1585 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
1586 if job_status == JobStatus.IDLE:
1587 wms_state = WmsStates.PENDING
1588 elif job_status == JobStatus.RUNNING:
1589 wms_state = WmsStates.RUNNING
1590 elif job_status == JobStatus.REMOVED:
1591 wms_state = WmsStates.DELETED
1592 elif job_status == JobStatus.COMPLETED:
1593 if (
1594 job.get("ExitBySignal", False)
1595 or job.get("ExitCode", 0)
1596 or job.get("ExitSignal", 0)
1597 or job.get("DAG_Status", 0)
1598 or job.get("ReturnValue", 0)
1599 ):
1600 wms_state = WmsStates.FAILED
1601 else:
1602 wms_state = WmsStates.SUCCEEDED
1603 elif job_status == JobStatus.HELD:
1604 wms_state = WmsStates.HELD
1606 return wms_state
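# Sketch of the completed-job branch above (hypothetical classads): a job
# with JobStatus == JobStatus.COMPLETED, ExitCode == 0, no exit signal and
# DAG_Status == 0 maps to WmsStates.SUCCEEDED, while the same status with
# any non-zero ExitCode, ExitSignal, DAG_Status or ReturnValue maps to
# WmsStates.FAILED.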
1609def _htc_node_status_to_wms_state(job):
1610 """Convert HTCondor status to generic wms state.
1612 Parameters
1613 ----------
1614 job : `dict` [`str`, `Any`]
1615 HTCondor job information.
1617 Returns
1618 -------
1619 wms_state : `lsst.ctrl.bps.WmsStates`
1620 The WmsState equivalent to the given node's status.
1621 """
1622 wms_state = WmsStates.MISFIT
1624 status = job["NodeStatus"]
1625 if status == NodeStatus.NOT_READY:
1626 wms_state = WmsStates.UNREADY
1627 elif status == NodeStatus.READY:
1628 wms_state = WmsStates.READY
1629 elif status == NodeStatus.PRERUN:
1630 wms_state = WmsStates.MISFIT
1631 elif status == NodeStatus.SUBMITTED:
1632 if job["JobProcsHeld"]:
1633 wms_state = WmsStates.HELD
1634 elif job["StatusDetails"] == "not_idle":
1635 wms_state = WmsStates.RUNNING
1636 elif job["JobProcsQueued"]:
1637 wms_state = WmsStates.PENDING
1638 elif status == NodeStatus.POSTRUN:
1639 wms_state = WmsStates.MISFIT
1640 elif status == NodeStatus.DONE:
1641 wms_state = WmsStates.SUCCEEDED
1642 elif status == NodeStatus.ERROR:
1643 # Use job exit status instead of post script exit status
1644 if "DAGMAN error 0" in job["StatusDetails"]:
1645 wms_state = WmsStates.SUCCEEDED
1646 else:
1647 wms_state = WmsStates.FAILED
1649 return wms_state
1652def _update_jobs(jobs1, jobs2):
1653 """Update jobs1 with info in jobs2.
1655 (Basically an update for nested dictionaries.)
1657 Parameters
1658 ----------
1659 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
1660 HTCondor job information to be updated.
1661 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
1662 Additional HTCondor job information.
1663 """
1664 for jid, jinfo in jobs2.items():
1665 if jid in jobs1:
1666 jobs1[jid].update(jinfo)
1667 else:
1668 jobs1[jid] = jinfo
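# Minimal sketch (hypothetical ids and keys) of the nested update:
#
#     >>> jobs1 = {"1.0": {"JobStatus": 2}}
#     >>> _update_jobs(jobs1, {"1.0": {"bps_run": "u/test"}, "2.0": {"JobStatus": 1}})
#     >>> jobs1
#     {'1.0': {'JobStatus': 2, 'bps_run': 'u/test'}, '2.0': {'JobStatus': 1}}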
1671def _wms_id_type(wms_id):
1672 """Determine the type of the WMS id.
1674 Parameters
1675 ----------
1676 wms_id : `str`
1677 WMS id identifying a job.
1679 Returns
1680 -------
1681 id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
1682 Type of WMS id.
1683 """
1684 try:
1685 int(float(wms_id))
1686 except ValueError:
1687 wms_path = Path(wms_id)
1688 if wms_path.is_dir():
1689 id_type = WmsIdType.PATH
1690 else:
1691 id_type = WmsIdType.GLOBAL
1692 except TypeError:
1693 id_type = WmsIdType.UNKNOWN
1694 else:
1695 id_type = WmsIdType.LOCAL
1696 return id_type
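# A few illustrative inputs and the id types they resolve to (all values
# below are made up):
#
#     _wms_id_type("1234")      -> WmsIdType.LOCAL    (ClusterId)
#     _wms_id_type("1234.0")    -> WmsIdType.LOCAL    (ClusterId.ProcId)
#     _wms_id_type("sched1.example.com#1234.0#1711500000")
#                               -> WmsIdType.GLOBAL   (non-numeric, not an
#                                                      existing directory)
#     _wms_id_type("/path/to/submit/run")
#                               -> WmsIdType.PATH     (only if the directory
#                                                      exists)
#     _wms_id_type(None)        -> WmsIdType.UNKNOWN  (int(float(None)) raises
#                                                      TypeError)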
1699def _wms_id_to_cluster(wms_id):
1700 """Convert WMS id to cluster id.
1702 Parameters
1703 ----------
1704 wms_id : `int` or `float` or `str`
1705 HTCondor job id or path.
1707 Returns
1708 -------
1709 schedd_ad : `classad.ClassAd`
1710 ClassAd describing the scheduler managing the job with the given id.
1711 cluster_id : `int`
1712 HTCondor cluster id.
1713 id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
1714 The type of the provided id.
1715 """
1716 coll = htcondor.Collector()
1718 schedd_ad = None
1719 cluster_id = None
1720 id_type = _wms_id_type(wms_id)
1721 if id_type == WmsIdType.LOCAL:
1722 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
1723 cluster_id = int(float(wms_id))
1724 elif id_type == WmsIdType.GLOBAL:
1725 constraint = f'GlobalJobId == "{wms_id}"'
1726 schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)}
1727 schedds = {name: htcondor.Schedd(ad) for name, ad in schedd_ads.items()}
1728 job_info = condor_q(constraint=constraint, schedds=schedds)
1729 if job_info:
1730 schedd_name, job_rec = job_info.popitem()
1731 job_id, _ = job_rec.popitem()
1732 schedd_ad = schedd_ads[schedd_name]
1733 cluster_id = int(float(job_id))
1734 elif id_type == WmsIdType.PATH:
1735 try:
1736 job_info = read_dag_info(wms_id)
1737 except (FileNotFoundError, PermissionError, OSError):
1738 pass
1739 else:
1740 schedd_name, job_rec = job_info.popitem()
1741 job_id, _ = job_rec.popitem()
1742 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name)
1743 cluster_id = int(float(job_id))
1744 else:
1745 pass
1746 return schedd_ad, cluster_id, id_type
1749def _wms_id_to_dir(wms_id):
1750 """Convert WMS id to a submit directory candidate.
1752 The function does not check if the directory exists or if it is a valid
1753 BPS submit directory.
1755 Parameters
1756 ----------
1757 wms_id : `int` or `float` or `str`
1758 HTCondor job id or path.
1760 Returns
1761 -------
1762 wms_path : `pathlib.Path` or `None`
1763 Submit directory candidate for the run with the given job id. If no
1764 directory can be associated with the provided WMS id, it will be set
1765 to None.
1766 id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
1767 The type of the provided id.
1769 Raises
1770 ------
1771 TypeError
1772 Raised if provided WMS id has invalid type.
1773 """
1774 coll = htcondor.Collector()
1775 schedd_ads = []
1777 constraint = None
1778 wms_path = None
1779 id_type = _wms_id_type(wms_id)
1780 match id_type:
1781 case WmsIdType.LOCAL:
1782 constraint = f"ClusterId == {int(float(wms_id))}"
1783 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
1784 case WmsIdType.GLOBAL:
1785 constraint = f'GlobalJobId == "{wms_id}"'
1786 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
1787 case WmsIdType.PATH:
1788 wms_path = Path(wms_id)
1789 case WmsIdType.UNKNOWN:
1790 raise TypeError(f"Invalid job id type: {wms_id}")
1791 if constraint is not None:
1792 schedds = {ad["name"]: htcondor.Schedd(ad) for ad in schedd_ads}
1793 job_info = condor_history(constraint=constraint, schedds=schedds, projection=["Iwd"])
1794 if job_info:
1795 _, job_rec = job_info.popitem()
1796 _, job_ad = job_rec.popitem()
1797 wms_path = Path(job_ad["Iwd"])
1798 return wms_path, id_type
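# Example usage (the id and the resulting path are hypothetical); for a
# LOCAL or GLOBAL id the path comes from the job's Iwd attribute found in
# the schedulers' history:
#
#     wms_path, id_type = _wms_id_to_dir("1234")
#     # e.g. wms_path == Path("/home/user/submit/u/user/my_pipeline/20240327T000000Z")
#     #      id_type == WmsIdType.LOCAL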
1801def _create_periodic_release_expr(memory, multiplier, limit):
1802 """Construct an HTCondorAd expression for releasing held jobs.
1804 The expression instruct HTCondor to release any job which was put on hold
1805 due to exceeding memory requirements back to the job queue providing it
1806 satisfies all of the conditions below:
1808 * number of run attempts did not reach allowable number of retries,
1809 * the memory requirements in the last failed run attempt did not reach
1810 the specified memory limit.
1812 Parameters
1813 ----------
1814 memory : `int`
1815 Requested memory in MB.
1816 multiplier : `float`
1817 Memory growth rate between retries.
1818 limit : `int`
1819 Memory limit in MB.
1821 Returns
1822 -------
1823 expr : `str`
1824 A string representing an HTCondor ClassAd expression for releasing jobs
1825 which have been held due to exceeding the memory requirements.
1826 """
1827 is_retry_allowed = "NumJobStarts <= JobMaxRetries"
1828 was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
1830 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
1831 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
1832 # The special comparison operators ensure that all comparisons below will
1833 # evaluate to FALSE in this case.
1834 #
1835 # Note:
1836 # May not be strictly necessary. Operators '&&' and '||' are not strict so
1837 # the entire expression should evaluate to FALSE when the job is not HELD.
1838 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
1839 # but better safe than sorry.
1840 was_mem_exceeded = (
1841 "JobStatus == 5 "
1842 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
1843 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
1844 )
1846 expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}"
1847 return expr
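# For example, assuming memory=2048, multiplier=2.0, and limit=8192,
# _create_periodic_release_expr(2048, 2.0, 8192) returns the following
# expression (wrapped here for readability; the actual return value is a
# single-line string):
#
#     JobStatus == 5
#     && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#         || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#     && NumJobStarts <= JobMaxRetries
#     && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) < 8192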
1850def _create_periodic_remove_expr(memory, multiplier, limit):
1851 """Construct an HTCondorAd expression for removing jobs from the queue.
1853 The expression instruct HTCondor to remove any job which was put on hold
1854 due to exceeding memory requirements from the job queue providing it
1855 satisfies any of the conditions below:
1857 * allowable number of retries was reached,
1858 * the memory requirements during the last failed run attempt reached
1859 the specified memory limit.
1861 Parameters
1862 ----------
1863 memory : `int`
1864 Requested memory in MB.
1865 multiplier : `float`
1866 Memory growth rate between retries.
1867 limit : `int`
1868 Memory limit in MB.
1870 Returns
1871 -------
1872 expr : `str`
1873 A string representing an HTCondor ClassAd expression for removing jobs
1874 which were run at the maximal allowable memory and still exceeded
1875 the memory requirements.
1876 """
1877 is_retry_disallowed = "NumJobStarts > JobMaxRetries"
1878 was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
1880 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
1881 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
1882 # The special comparison operators ensure that all comparisons below will
1883 # evaluate to FALSE in this case.
1884 #
1885 # Note:
1886 # May not be strictly necessary. Operators '&&' and '||' are not strict so
1887 # the entire expression should evaluate to FALSE when the job is not HELD.
1888 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
1889 # but better safe than sorry.
1890 was_mem_exceeded = (
1891 "JobStatus == 5 "
1892 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
1893 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
1894 )
1896 expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})"
1897 return expr
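# With the same assumed values (memory=2048, multiplier=2.0, limit=8192),
# _create_periodic_remove_expr(2048, 2.0, 8192) returns (wrapped here for
# readability; the actual return value is a single-line string):
#
#     JobStatus == 5
#     && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#         || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#     && (NumJobStarts > JobMaxRetries
#         || min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) == 8192)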
1900def _create_request_memory_expr(memory, multiplier, limit):
1901 """Construct an HTCondor ClassAd expression for safe memory scaling.
1903 Parameters
1904 ----------
1905 memory : `int`
1906 Requested memory in MB.
1907 multiplier : `float`
1908 Memory growth rate between retries.
1909 limit : `int`
1910 Memory limit in MB.
1912 Returns
1913 -------
1914 expr : `str`
1915 A string representing an HTCondor ClassAd expression enabling safe
1916 memory scaling between job retries.
1917 """
1918 # The check whether the job was held due to exceeding memory requirements
1919 # will be made *after* the job was released back to the job queue (i.e.,
1920 # when it is in the IDLE state), hence the need to use `Last*` job ClassAds
1921 # instead of the ones describing the job's current state.
1922 #
1923 # Also, 'Last*' job ClassAds attributes are UNDEFINED when a job is
1924 # initially put in the job queue. The special comparison operators ensure
1925 # that all comparisons below will evaluate to FALSE in this case.
1926 was_mem_exceeded = (
1927 "LastJobStatus =?= 5 "
1928 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 "
1929 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"
1930 )
1932 # If job runs the first time or was held for reasons other than exceeding
1933 # the memory, set the required memory to the requested value or use
1934 # the memory value measured by HTCondor (MemoryUsage) depending on
1935 # whichever is greater.
1936 expr = (
1937 f"({was_mem_exceeded}) "
1938 f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) "
1939 f": max({{{memory}, MemoryUsage ?: 0}})"
1940 )
1941 return expr
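# A minimal, self-contained sketch of where the three expressions above
# would typically be placed in an HTCondor submit description. This is not
# how this module assembles its submit files; the executable, arguments,
# and memory values below are placeholders, and max_retries is assumed here
# to populate the JobMaxRetries attribute referenced by the periodic
# expressions:
#
#     mem, mult, lim = 2048, 2.0, 8192
#     sub = htcondor.Submit(
#         {
#             "executable": "/bin/sleep",
#             "arguments": "60",
#             "max_retries": "3",
#             "request_memory": _create_request_memory_expr(mem, mult, lim),
#             "periodic_release": _create_periodic_release_expr(mem, mult, lim),
#             "periodic_remove": _create_periodic_remove_expr(mem, mult, lim),
#         }
#     )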
1944def _locate_schedds(locate_all=False):
1945 """Find out Scheduler daemons in an HTCondor pool.
1947 Parameters
1948 ----------
1949 locate_all : `bool`, optional
1950 If True, all available schedulers in the HTCondor pool will be located.
1951 By default (False), the search is limited to the Scheduler running on
1952 the local host.
1954 Returns
1955 -------
1956 schedds : `dict` [`str`, `htcondor.Schedd`]
1957 A mapping between Scheduler names and Python objects used to interact
1958 with them.
1959 """
1960 coll = htcondor.Collector()
1962 schedd_ads = []
1963 if locate_all:
1964 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
1965 else:
1966 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
1967 return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}
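# Typical usage, mirroring how scheduler queries are issued elsewhere in
# this module (the constraint below is a made-up example):
#
#     schedds = _locate_schedds(locate_all=True)
#     job_info = condor_q(constraint="JobUniverse == 7", schedds=schedds)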
1970def _gather_site_values(config, compute_site):
1971 """Gather values specific to given site.
1973 Parameters
1974 ----------
1975 config : `lsst.ctrl.bps.BpsConfig`
1976 BPS configuration that includes necessary submit/runtime
1977 information.
1978 compute_site : `str`
1979 Compute site name.
1981 Returns
1982 -------
1983 site_values : `dict` [`str`, `Any`]
1984 Values specific to the given site.
1985 """
1986 site_values = {"attrs": {}, "profile": {}}
1987 search_opts = {}
1988 if compute_site:
1989 search_opts["curvals"] = {"curr_site": compute_site}
1991 # Determine the hard limit for the memory requirement.
1992 found, limit = config.search("memoryLimit", opt=search_opts)
1993 if not found:
1994 search_opts["default"] = DEFAULT_HTC_EXEC_PATT
1995 _, patt = config.search("executeMachinesPattern", opt=search_opts)
1996 del search_opts["default"]
1998 # To reduce the amount of data, ignore dynamic slots (if any) as,
1999 # by definition, they cannot have more memory than
2000 # the partitionable slot they are the part of.
2001 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)'
2002 pool_info = condor_status(constraint=constraint)
2003 try:
2004 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values())
2005 except ValueError:
2006 _LOG.debug("No execute machine in the pool matches %s", patt)
2007 if limit:
2008 config[".bps_defined.memory_limit"] = limit
2010 _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False})
2011 site_values["memoryLimit"] = limit
2013 found, value = config.search("accountingGroup", opt=search_opts)
2014 if found:
2015 site_values["accountingGroup"] = value
2016 found, value = config.search("accountingUser", opt=search_opts)
2017 if found:
2018 site_values["accountingUser"] = value
2020 key = f".site.{compute_site}.profile.condor"
2021 if key in config:
2022 for key, val in config[key].items():
2023 if key.startswith("+"):
2024 site_values["attrs"][key[1:]] = val
2025 else:
2026 site_values["profile"][key] = val
2028 return site_values
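# Illustration of how a site-specific 'profile.condor' section is folded
# into the returned dictionary (the site name, keys, and values below are
# hypothetical). Given a config containing:
#
#     site:
#       mysite:
#         profile:
#           condor:
#             +JobPrio: "10"
#             requirements: 'OpSysAndVer == "CentOS7"'
#
# _gather_site_values(config, "mysite") would include, among its other
# entries:
#
#     site_values["attrs"]   == {"JobPrio": "10"}
#     site_values["profile"] == {"requirements": 'OpSysAndVer == "CentOS7"'}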