Coverage for python/lsst/ctrl/bps/wms/htcondor/htcondor_service.py: 1%
520 statements
1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
22"""Interface between generic workflow to HTCondor workflow system.
23"""
25__all__ = ["HTCondorService", "HTCondorWorkflow"]
28import os
29import re
30import logging
31from datetime import datetime, timedelta
32from pathlib import Path
33from collections import defaultdict
35import htcondor
37from lsst.daf.butler.core.utils import time_this
38from ... import (
39 BaseWmsWorkflow,
40 BaseWmsService,
41 GenericWorkflow,
42 GenericWorkflowJob,
43 WmsRunReport,
44 WmsJobReport,
45 WmsStates
46)
47from ...bps_utils import (
48 chdir,
49 create_count_summary
50)
51from .lssthtc import (
52 HTCDag,
53 HTCJob,
54 MISSING_ID,
55 JobStatus,
56 NodeStatus,
57 htc_check_dagman_output,
58 htc_escape,
59 htc_submit_dag,
60 read_dag_log,
61 read_dag_status,
62 read_node_status,
63 condor_history,
64 condor_q,
65 condor_status,
66 pegasus_name_to_label,
67 summary_from_dag,
68)
71DEFAULT_HTC_EXEC_PATT = ".*worker.*"
72"""Default pattern for searching execute machines in an HTCondor pool.
73"""
75_LOG = logging.getLogger(__name__)
78class HTCondorService(BaseWmsService):
79 """HTCondor version of WMS service.
80 """
81 def prepare(self, config, generic_workflow, out_prefix=None):
82 """Convert generic workflow to an HTCondor DAG ready for submission.
84 Parameters
85 ----------
86 config : `lsst.ctrl.bps.BpsConfig`
87 BPS configuration that includes necessary submit/runtime
88 information.
89 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
90 The generic workflow (e.g., has executable name and arguments).
91 out_prefix : `str`
92 The root directory into which all WMS-specific files are written.
94 Returns
95 -------
96 workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow`
97 HTCondor workflow ready to be run.
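        Examples
        --------
        A minimal, hypothetical usage sketch; ``config`` and ``gw`` are
        assumed to be valid `lsst.ctrl.bps.BpsConfig` and
        `lsst.ctrl.bps.GenericWorkflow` instances::

            service = HTCondorService(config)
            htc_workflow = service.prepare(config, gw, out_prefix="submit/run1")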
98 """
99 _LOG.debug("out_prefix = '%s'", out_prefix)
100 with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed HTCondor workflow creation"):
101 workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix,
102 f"{self.__class__.__module__}."
103 f"{self.__class__.__name__}")
105 with time_this(log=_LOG, level=logging.INFO, prefix=None,
106 msg="Completed writing out HTCondor workflow"):
107 workflow.write(out_prefix)
108 return workflow
110 def submit(self, workflow):
111 """Submit a single HTCondor workflow.
113 Parameters
114 ----------
115 workflow : `lsst.ctrl.bps.BaseWorkflow`
116 A single HTCondor workflow to submit. run_id is updated after
117 successful submission to WMS.
118 """
119 # For workflow portability, internal paths are all relative. Hence
120 # the DAG needs to be submitted to HTCondor from inside the submit
121 # directory.
122 with chdir(workflow.submit_path):
123 _LOG.info("Submitting from directory: %s", os.getcwd())
124 htc_submit_dag(workflow.dag, {})
125 workflow.run_id = workflow.dag.run_id
127 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None):
128 """Query WMS for list of submitted WMS workflows/jobs.
130 This should be a quick lookup function to create a list of jobs for
131 other functions.
133 Parameters
134 ----------
135 wms_id : `int` or `str`, optional
136 Id or path that can be used by WMS service to look up job.
137 user : `str`, optional
138 User whose submitted jobs should be listed.
139 require_bps : `bool`, optional
140 Whether to require jobs returned in list to be bps-submitted jobs.
141 pass_thru : `str`, optional
142 Information to pass through to WMS.
144 Returns
145 -------
146 job_ids : `list` [`Any`]
147 Only job ids to be used by cancel and other functions. Typically
148 this means top-level jobs (i.e., not children jobs).
149 """
150 _LOG.debug("list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s",
151 wms_id, user, require_bps, pass_thru)
152 constraint = ""
154 if wms_id is None:
155 if user is not None:
156 constraint = f'(Owner == "{user}")'
157 else:
158 cluster_id = _wms_id_to_cluster(wms_id)
159 if cluster_id != 0:
160 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"
162 if require_bps:
163 constraint += ' && (bps_isjob == "True")'
165 if pass_thru:
166 if "-forcex" in pass_thru:
167 pass_thru_2 = pass_thru.replace("-forcex", "")
168 if pass_thru_2 and not pass_thru_2.isspace():
169 constraint += f"&& ({pass_thru_2})"
170 else:
171 constraint += f" && ({pass_thru})"
173 _LOG.debug("constraint = %s", constraint)
174 jobs = condor_q(constraint)
176 # Prune child jobs where DAG job is in queue (i.e., aren't orphans).
177 job_ids = []
178 for job_id, job_info in jobs.items():
179 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_info.get("DAGManJobId", "None"))
180 if "DAGManJobId" not in job_info: # orphaned job
181 job_ids.append(job_id)
182 else:
183 _LOG.debug("Looking for %s", f"{job_info['DAGManJobId']}.0")
184 _LOG.debug("\tin jobs.keys() = %s", jobs.keys())
185 if f"{job_info['DAGManJobId']}.0" not in jobs:
186 job_ids.append(job_id)
188 _LOG.debug("job_ids = %s", job_ids)
189 return job_ids
191 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None):
192 """Return run information based upon given constraints.
194 Parameters
195 ----------
196 wms_workflow_id : `str`
197 Limit to specific run based on id.
198 user : `str`
199 Limit results to runs for this user.
200 hist : `float`
201 Limit history search to this many days.
202 pass_thru : `str`
203 Constraints to pass through to HTCondor.
205 Returns
206 -------
207 runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
208 Information about runs from given job information.
209 message : `str`
210 Extra message for report command to print. This could be pointers
211 to documentation or to WMS specific commands.
212 """
213 message = ""
215 if wms_workflow_id:
216 # Explicitly checking if wms_workflow_id can be converted to a
217 # float instead of using try/except to avoid catching a different
218 # ValueError from _report_from_id
219 try:
220 float(wms_workflow_id)
221 is_float = True
222 except ValueError: # Don't need TypeError here as None goes to else branch.
223 is_float = False
225 if is_float:
226 run_reports, message = _report_from_id(float(wms_workflow_id), hist)
227 else:
228 run_reports, message = _report_from_path(wms_workflow_id)
229 else:
230 run_reports, message = _summary_report(user, hist, pass_thru)
231 _LOG.debug("report: %s, %s", run_reports, message)
233 return list(run_reports.values()), message
235 def cancel(self, wms_id, pass_thru=None):
236 """Cancel submitted workflows/jobs.
238 Parameters
239 ----------
240 wms_id : `str`
241 Id or path of job that should be canceled.
242 pass_thru : `str`, optional
243 Information to pass through to WMS.
245 Returns
246 -------
247 deleted : `bool`
248 Whether the deletion was successful. Currently returns False if
249 there is any doubt or if any individual job was not deleted.
250 message : `str`
251 Any message from WMS (e.g., error details).
252 """
253 _LOG.debug("Canceling wms_id = %s", wms_id)
255 cluster_id = _wms_id_to_cluster(wms_id)
256 if cluster_id == 0:
257 deleted = False
258 message = "Invalid id"
259 else:
260 _LOG.debug("Canceling cluster_id = %s", cluster_id)
261 schedd = htcondor.Schedd()
262 constraint = f"ClusterId == {cluster_id}"
263 if pass_thru is not None and "-forcex" in pass_thru:
264 pass_thru_2 = pass_thru.replace("-forcex", "")
265 if pass_thru_2 and not pass_thru_2.isspace():
266 constraint += f"&& ({pass_thru_2})"
267 _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
268 results = schedd.act(htcondor.JobAction.RemoveX, constraint)
269 else:
270 if pass_thru:
271 constraint += f"&& ({pass_thru})"
272 _LOG.debug("JobAction.Remove constraint = %s", constraint)
273 results = schedd.act(htcondor.JobAction.Remove, constraint)
274 _LOG.debug("Remove results: %s", results)
276 if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
277 deleted = True
278 message = ""
279 else:
280 deleted = False
281 if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
282 message = "no such bps job in batch queue"
283 else:
284 message = f"unknown problems deleting: {results}"
286 _LOG.debug("deleted: %s; message = %s", deleted, message)
287 return deleted, message
290class HTCondorWorkflow(BaseWmsWorkflow):
291 """Single HTCondor workflow.
293 Parameters
294 ----------
295 name : `str`
296 Unique name for Workflow used when naming files.
297 config : `lsst.ctrl.bps.BpsConfig`
298 BPS configuration that includes necessary submit/runtime information.
299 """
300 def __init__(self, name, config=None):
301 super().__init__(name, config)
302 self.dag = None
304 @classmethod
305 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
306 # Docstring inherited
307 htc_workflow = cls(generic_workflow.name, config)
308 htc_workflow.dag = HTCDag(name=generic_workflow.name)
310 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
311 htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
312 htc_workflow.dag.add_attribs({"bps_wms_service": service_class,
313 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
314 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
315 "bps_job_summary": create_count_summary(generic_workflow.job_counts)})
317 _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""})
318 if isinstance(tmp_template, str):
319 subdir_template = defaultdict(lambda: tmp_template)
320 else:
321 subdir_template = tmp_template
323 # Create all DAG jobs
324 site_values = {} # cache compute site specific values to reduce config lookups
325 for job_name in generic_workflow:
326 gwjob = generic_workflow.get_job(job_name)
327 if gwjob.compute_site not in site_values:
328 site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
329 htc_job = _create_job(subdir_template[gwjob.label], site_values[gwjob.compute_site],
330 generic_workflow, gwjob, out_prefix)
331 htc_workflow.dag.add_job(htc_job)
333 # Add job dependencies to the DAG
334 for job_name in generic_workflow:
335 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))
337 # If final job exists in generic workflow, create DAG final job
338 final = generic_workflow.get_final()
339 if final and isinstance(final, GenericWorkflowJob):
340 if final.compute_site and final.compute_site not in site_values:
341 site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
342 final_htjob = _create_job(subdir_template[final.label], site_values[final.compute_site],
343 generic_workflow, final, out_prefix)
344 if "post" not in final_htjob.dagcmds:
345 final_htjob.dagcmds["post"] = f"{os.path.dirname(__file__)}/final_post.sh" \
346 f" {final.name} $DAG_STATUS $RETURN"
347 htc_workflow.dag.add_final_job(final_htjob)
348 elif final and isinstance(final, GenericWorkflow):
349 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
350 elif final:
351 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
353 return htc_workflow
355 def write(self, out_prefix):
356 """Output HTCondor DAGMan files needed for workflow submission.
358 Parameters
359 ----------
360 out_prefix : `str`
361 Directory prefix for HTCondor files.
362 """
363 self.submit_path = out_prefix
364 os.makedirs(out_prefix, exist_ok=True)
366 # Write down the workflow in HTCondor format.
367 self.dag.write(out_prefix, "jobs/{self.label}")
370def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix):
371 """Convert GenericWorkflow job nodes to DAG jobs.
373 Parameters
374 ----------
375 subdir_template : `str`
376 Template for making subdirs.
377 site_values : `dict`
378 Site specific values
379 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
380 Generic workflow that is being converted.
381 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
382 The generic job to convert to a HTCondor job.
383 out_prefix : `str`
384 Directory prefix for HTCondor files.
386 Returns
387 -------
388 htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
389 The HTCondor job equivalent to the given generic job.
390 """
391 htc_job = HTCJob(gwjob.name, label=gwjob.label)
393 curvals = defaultdict(str)
394 curvals["label"] = gwjob.label
395 if gwjob.tags:
396 curvals.update(gwjob.tags)
398 subdir = subdir_template.format_map(curvals)
399 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"
401 htc_job_cmds = {
402 "universe": "vanilla",
403 "should_transfer_files": "YES",
404 "when_to_transfer_output": "ON_EXIT_OR_EVICT",
405 "transfer_output_files": '""', # Set to empty string to disable
406 "transfer_executable": "False",
407 "getenv": "True",
409 # Exceeding memory sometimes triggers a SIGBUS error. Tell HTCondor
410 # to put SIGBUS jobs on hold.
411 "on_exit_hold": "(ExitBySignal == true) && (ExitSignal == 7)",
412 "on_exit_hold_reason": '"Job raised a signal 7. Usually means job has gone over memory limit."',
413 "on_exit_hold_subcode": "34"
414 }
416 htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob))
418 # job stdout, stderr, htcondor user log.
419 for key in ("output", "error", "log"):
420 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
421 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
423 htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"],
424 out_prefix))
426 # Add the job cmds dict to the job object.
427 htc_job.add_job_cmds(htc_job_cmds)
429 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))
431 # Add job attributes to job.
432 _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
433 htc_job.add_job_attrs(gwjob.attrs)
434 htc_job.add_job_attrs(site_values["attrs"])
435 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
436 htc_job.add_job_attrs({"bps_job_name": gwjob.name,
437 "bps_job_label": gwjob.label})
439 return htc_job
442def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
443 """Translate the job data that are one to one mapping
445 Parameters
446 ----------
447 cached_vals : `dict` [`str`, `Any`]
448 Config values common to jobs with same label.
449 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
450 Generic workflow that contains the job being converted.
451 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
452 Generic workflow job to be converted.
454 Returns
455 -------
456 htc_job_commands : `dict` [`str`, `Any`]
457 Contains commands which can appear in the HTCondor submit description
458 file.
459 """
460 # Values in the job script that are just name mappings.
461 job_translation = {"mail_to": "notify_user",
462 "when_to_mail": "notification",
463 "request_cpus": "request_cpus",
464 "priority": "priority",
465 "category": "category"}
467 jobcmds = {}
468 for gwkey, htckey in job_translation.items():
469 jobcmds[htckey] = getattr(gwjob, gwkey, None)
471 # job commands that need modification
472 if gwjob.number_of_retries:
473 jobcmds["max_retries"] = f"{gwjob.number_of_retries}"
475 if gwjob.retry_unless_exit:
476 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"
478 if gwjob.request_disk:
479 jobcmds["request_disk"] = f"{gwjob.request_disk}MB"
481 if gwjob.request_memory:
482 jobcmds["request_memory"] = f"{gwjob.request_memory}"
484 if gwjob.memory_multiplier:
485 # Do not use try-except! At the moment, BpsConfig returns an empty
486 # string if it does not contain the key.
487 memory_limit = cached_vals["memoryLimit"]
488 if not memory_limit:
489 raise RuntimeError("Memory autoscaling enabled, but automatic detection of the memory limit "
490 "failed; setting it explicitly with 'memoryLimit' or changing worker node "
491 "search pattern 'executeMachinesPattern' might help.")
492 jobcmds["request_memory"] = _create_request_memory_expr(gwjob.request_memory, gwjob.memory_multiplier)
494 # Periodically release jobs which are being held due to exceeding
495 # memory. Stop doing that (by removing the job from the HTCondor queue)
496 # after the maximal number of retries has been reached or the memory
497 # requirements cannot be satisfied.
498 jobcmds["periodic_release"] = \
499 "NumJobStarts <= JobMaxRetries && (HoldReasonCode == 34 || HoldReasonSubCode == 34)"
500 jobcmds["periodic_remove"] = \
501 f"JobStatus == 1 && RequestMemory > {memory_limit} || " \
502 f"JobStatus == 5 && NumJobStarts > JobMaxRetries"
504 # Assume concurrency_limit implemented using HTCondor concurrency limits.
505 # May need to move to special site-specific implementation if sites use
506 # other mechanisms.
507 if gwjob.concurrency_limit:
508 jobcmds["concurrency_limit"] = gwjob.concurrency_limit
510 # Handle command line
511 if gwjob.executable.transfer_executable:
512 jobcmds["transfer_executable"] = "True"
513 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
514 else:
515 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)
517 if gwjob.arguments:
518 arguments = gwjob.arguments
519 arguments = _replace_cmd_vars(arguments, gwjob)
520 arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob)
521 arguments = _fix_env_var_syntax(arguments)
522 jobcmds["arguments"] = arguments
524 # Add extra "pass-thru" job commands
525 if gwjob.profile:
526 for key, val in gwjob.profile.items():
527 jobcmds[key] = htc_escape(val)
528 for key, val in cached_vals["profile"].items():
529 jobcmds[key] = htc_escape(val)
531 return jobcmds
534def _translate_dag_cmds(gwjob):
535 """Translate job values into DAGMan commands.
537 Parameters
538 ----------
539 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
540 Job containing values to be translated.
542 Returns
543 -------
544 dagcmds : `dict` [`str`, `Any`]
545 DAGMan commands for the job.
546 """
547 # Values in the dag script that are just name mappings.
548 dag_translation = {"abort_on_value": "abort_dag_on",
549 "abort_return_value": "abort_exit"}
551 dagcmds = {}
552 for gwkey, htckey in dag_translation.items():
553 dagcmds[htckey] = getattr(gwjob, gwkey, None)
555 # Still to be coded: vars "pre_cmdline", "post_cmdline"
556 return dagcmds
559def _fix_env_var_syntax(oldstr):
560 """Change ENV place holders to HTCondor Env var syntax.
562 Parameters
563 ----------
564 oldstr : `str`
565 String in which environment variable syntax is to be fixed.
567 Returns
568 -------
569 newstr : `str`
570 Given string with environment variable syntax fixed.
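    Examples
    --------
    A small illustration with a made-up command string:

    >>> _fix_env_var_syntax("run_task --butler <ENV:HOME>/butler.yaml")
    'run_task --butler $ENV(HOME)/butler.yaml'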
571 """
572 newstr = oldstr
573 for key in re.findall(r"<ENV:([^>]+)>", oldstr):
574 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
575 return newstr
578def _replace_file_vars(use_shared, arguments, workflow, gwjob):
579 """Replace file placeholders in command line arguments with correct
580 physical file names.
582 Parameters
583 ----------
584 use_shared : `bool`
585 Whether HTCondor can assume shared filesystem.
586 arguments : `str`
587 Arguments string in which to replace file placeholders.
588 workflow : `lsst.ctrl.bps.GenericWorkflow`
589 Generic workflow that contains file information.
590 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
591 The job corresponding to the arguments.
593 Returns
594 -------
595 arguments : `str`
596 Given arguments string with file placeholders replaced.
597 """
598 # Replace input file placeholders with paths.
599 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
600 if not gwfile.wms_transfer:
601 # Must assume full URI if in command line and told WMS is not
602 # responsible for transferring file.
603 uri = gwfile.src_uri
604 elif use_shared:
605 if gwfile.job_shared:
606 # Have shared filesystems and jobs can share file.
607 uri = gwfile.src_uri
608 else:
609 # Taking advantage of inside knowledge. Not future-proof.
610 # Temporary fix until have job wrapper that pulls files
611 # within job.
612 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml":
613 uri = "butler.yaml"
614 else:
615 uri = os.path.basename(gwfile.src_uri)
616 else: # Using push transfer
617 uri = os.path.basename(gwfile.src_uri)
618 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
620 # Replace output file placeholders with paths.
621 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
622 if not gwfile.wms_transfer:
623 # Must assume full URI if in command line and told WMS is not
624 # responsible for transferring file.
625 uri = gwfile.src_uri
626 elif use_shared:
627 if gwfile.job_shared:
628 # Have shared filesystems and jobs can share file.
629 uri = gwfile.src_uri
630 else:
631 uri = os.path.basename(gwfile.src_uri)
632 else: # Using push transfer
633 uri = os.path.basename(gwfile.src_uri)
634 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
635 return arguments
638def _replace_cmd_vars(arguments, gwjob):
639 """Replace format-style placeholders in arguments.
641 Parameters
642 ----------
643 arguments : `str`
644 Arguments string in which to replace placeholders.
645 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
646 Job containing values to be used to replace placeholders
647 (in particular gwjob.cmdvals).
649 Returns
650 -------
651 arguments : `str`
652 Given arguments string with placeholders replaced.
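    Examples
    --------
    A hypothetical illustration using a simple stand-in for the job object:

    >>> from types import SimpleNamespace
    >>> fake_job = SimpleNamespace(cmdvals={"qgraphFile": "run.qgraph"})
    >>> _replace_cmd_vars("pipetask run -g {qgraphFile}", fake_job)
    'pipetask run -g run.qgraph'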
653 """
654 try:
655 arguments = arguments.format(**gwjob.cmdvals)
656 except (KeyError, TypeError): # TypeError in case None instead of {}
657 _LOG.error("Could not replace command variables:\n"
658 "arguments: %s\n"
659 "cmdvals: %s", arguments, gwjob.cmdvals)
660 raise
661 return arguments
664def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str):
665 """Add job input files from generic workflow to job.
667 Parameters
668 ----------
669 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
670 The generic workflow (e.g., has executable name and arguments).
671 job_name : `str`
672 Unique name for the job.
673 use_shared : `bool`
674 Whether job has access to files via shared filesystem.
675 out_prefix : `str`
676 The root directory into which all WMS-specific files are written.
678 Returns
679 -------
680 htc_commands : `dict` [`str`, `str`]
681 HTCondor commands for the job submission script.
682 """
683 htc_commands = {}
684 inputs = []
685 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
686 _LOG.debug("src_uri=%s", gwf_file.src_uri)
688 uri = Path(gwf_file.src_uri)
690 # Note if use_shared and job_shared, don't need to transfer file.
692 if not use_shared: # Copy file using push to job
693 inputs.append(str(uri.relative_to(out_prefix)))
694 elif not gwf_file.job_shared: # Jobs require own copy
696 # if using shared filesystem, but still need copy in job. Use
697 # HTCondor's curl plugin for a local copy.
699 # Execution butler is represented as a directory which the
700 # curl plugin does not handle. Taking advantage of inside
701 # knowledge for temporary fix until have job wrapper that pulls
702 # files within job.
703 if gwf_file.name == "butlerConfig":
704 # The execution butler directory doesn't normally exist until
705 # the submit phase so checking for suffix instead of using
706 # is_dir(). If another non-YAML file existed, it would have a
707 # different gwf_file.name.
708 if uri.suffix == ".yaml": # Single file, so just copy.
709 inputs.append(f"file://{uri}")
710 else:
711 inputs.append(f"file://{uri / 'butler.yaml'}")
712 inputs.append(f"file://{uri / 'gen3.sqlite3'}")
713 elif uri.is_dir():
714 raise RuntimeError("HTCondor plugin cannot transfer directories locally within job "
715 f"{gwf_file.src_uri}")
716 else:
717 inputs.append(f"file://{uri}")
719 if inputs:
720 htc_commands["transfer_input_files"] = ",".join(inputs)
721 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
722 return htc_commands
725def _report_from_path(wms_path):
726 """Gather run information from a given run directory.
728 Parameters
729 ----------
730 wms_path : `str`
731 The directory containing the submit side files (e.g., HTCondor files).
733 Returns
734 -------
735 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
736 Run information for the detailed report. The key is the HTCondor id
737 and the value is a collection of report information for that run.
738 message : `str`
739 Message to be printed with the summary report.
740 """
741 wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
742 if wms_workflow_id == MISSING_ID:
743 run_reports = {}
744 else:
745 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
746 return run_reports, message
749def _report_from_id(wms_workflow_id, hist):
750 """Gather run information from a given run directory.
752 Parameters
753 ----------
754 wms_workflow_id : `int` or `str`
755 Limit to specific run based on id.
756 hist : `float`
757 Limit history search to this many days.
759 Returns
760 -------
761 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
762 Run information for the detailed report. The key is the HTCondor id
763 and the value is a collection of report information for that run.
764 message : `str`
765 Message to be printed with the summary report.
766 """
767 constraint = f"(DAGManJobId == {int(float(wms_workflow_id))} || ClusterId == " \
768 f"{int(float(wms_workflow_id))})"
769 jobs = condor_q(constraint)
770 if hist:
771 epoch = (datetime.now() - timedelta(days=hist)).timestamp()
772 constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
773 hist_jobs = condor_history(constraint)
774 _update_jobs(jobs, hist_jobs)
776 # keys in dictionary will be strings of format "ClusterId.ProcId"
777 wms_workflow_id = str(wms_workflow_id)
778 if not wms_workflow_id.endswith(".0"):
779 wms_workflow_id += ".0"
781 if wms_workflow_id in jobs:
782 _, path_jobs, message = _get_info_from_path(jobs[wms_workflow_id]["Iwd"])
783 _update_jobs(jobs, path_jobs)
784 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
785 else:
786 run_reports = {}
787 message = f"Found 0 records for run id {wms_workflow_id}"
788 return run_reports, message
791def _get_info_from_path(wms_path):
792 """Gather run information from a given run directory.
794 Parameters
795 ----------
796 wms_path : `str`
797 Directory containing HTCondor files.
799 Returns
800 -------
801 wms_workflow_id : `str`
802 The run id which is a DAGman job id.
803 jobs : `dict` [`str`, `dict` [`str`, `Any`]]
804 Information about jobs read from files in the given directory.
805 The key is the HTCondor id and the value is a dictionary of HTCondor
806 keys and values.
807 message : `str`
808 Message to be printed with the summary report.
809 """
810 try:
811 wms_workflow_id, jobs = read_dag_log(wms_path)
812 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
813 _update_jobs(jobs, read_node_status(wms_path))
814 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)
816 # Add more info for DAGman job
817 job = jobs[wms_workflow_id]
818 job.update(read_dag_status(wms_path))
819 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
820 if "bps_run" not in job:
821 _add_run_info(wms_path, job)
823 message = htc_check_dagman_output(wms_path)
824 _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id,
825 jobs[wms_workflow_id]["total_jobs"])
826 except StopIteration:
827 message = f"Could not find HTCondor files in {wms_path}"
828 _LOG.warning(message)
829 wms_workflow_id = MISSING_ID
830 jobs = {}
832 return wms_workflow_id, jobs, message
835def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
836 """Gather run information to be used in generating summary reports.
838 Parameters
839 ----------
840 wms_workflow_id : `str`
841 HTCondor id of the DAGMan job for the run.
842 jobs : `dict` [`str`, `dict` [`str`, `Any`]]
843 Mapping of HTCondor job id to job information for the run.
845 Returns
846 -------
847 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
848 Run information for the detailed report. The key is the given HTCondor
849 id and the value is a collection of report information for that run.
850 """
851 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
852 dag_job = jobs[wms_workflow_id]
853 if "total_jobs" not in dag_job or "DAGNodeName" in dag_job:
854 _LOG.error("Job ID %s is not a DAG job.", wms_workflow_id)
855 return {}
856 report = WmsRunReport(wms_id=wms_workflow_id,
857 path=dag_job["Iwd"],
858 label=dag_job.get("bps_job_label", "MISS"),
859 run=dag_job.get("bps_run", "MISS"),
860 project=dag_job.get("bps_project", "MISS"),
861 campaign=dag_job.get("bps_campaign", "MISS"),
862 payload=dag_job.get("bps_payload", "MISS"),
863 operator=_get_owner(dag_job),
864 run_summary=_get_run_summary(dag_job),
865 state=_htc_status_to_wms_state(dag_job),
866 jobs=[],
867 total_number_jobs=dag_job["total_jobs"],
868 job_state_counts=dag_job["state_counts"])
870 try:
871 for job in jobs.values():
872 if job["ClusterId"] != int(float(wms_workflow_id)):
873 job_report = WmsJobReport(wms_id=job["ClusterId"],
874 name=job.get("DAGNodeName", str(job["ClusterId"])),
875 label=job.get("bps_job_label",
876 pegasus_name_to_label(job["DAGNodeName"])),
877 state=_htc_status_to_wms_state(job))
878 if job_report.label == "init":
879 job_report.label = "pipetaskInit"
880 report.jobs.append(job_report)
881 except KeyError as ex:
882 _LOG.error("Job missing key '%s': %s", str(ex), job)
883 raise
885 run_reports = {report.wms_id: report}
886 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
887 return run_reports
890def _summary_report(user, hist, pass_thru):
891 """Gather run information to be used in generating summary reports.
893 Parameters
894 ----------
895 user : `str`
896 Run lookup restricted to given user.
897 hist : `float`
898 How many previous days to search for run information.
899 pass_thru : `str`
900 Advanced users can define the HTCondor constraint to be used
901 when searching queue and history.
903 Returns
904 -------
905 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
906 Run information for the summary report. The keys are HTCondor ids and
907 the values are collections of report information for each run.
908 message : `str`
909 Message to be printed with the summary report.
910 """
911 # only doing summary report so only look for dagman jobs
912 if pass_thru:
913 constraint = pass_thru
914 else:
915 # Notes:
916 # * bps_isjob == 'True' isn't getting set for DAG jobs that are
917 # manually restarted.
918 # * Any job with DAGManJobID isn't a DAG job
919 constraint = 'bps_isjob == "True" && JobUniverse == 7'
920 if user:
921 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'
923 # Check runs in queue.
924 jobs = condor_q(constraint)
926 if hist:
927 epoch = (datetime.now() - timedelta(days=hist)).timestamp()
928 constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
929 hist_jobs = condor_history(constraint)
930 _update_jobs(jobs, hist_jobs)
932 _LOG.debug("Job ids from queue and history %s", jobs.keys())
934 # Have list of DAGMan jobs, need to get run_report info.
935 run_reports = {}
936 for job in jobs.values():
937 total_jobs, state_counts = _get_state_counts_from_dag_job(job)
938 # If the info wasn't available from the queue (e.g., Kerberos bug),
939 # try reading from file.
940 if total_jobs == 0:
941 try:
942 job.update(read_dag_status(job["Iwd"]))
943 total_jobs, state_counts = _get_state_counts_from_dag_job(job)
944 except StopIteration:
945 pass  # Don't kill the report if the HTCondor files can't be found.
947 if "bps_run" not in job:
948 _add_run_info(job["Iwd"], job)
949 report = WmsRunReport(wms_id=str(job.get("ClusterId", MISSING_ID)),
950 path=job["Iwd"],
951 label=job.get("bps_job_label", "MISS"),
952 run=job.get("bps_run", "MISS"),
953 project=job.get("bps_project", "MISS"),
954 campaign=job.get("bps_campaign", "MISS"),
955 payload=job.get("bps_payload", "MISS"),
956 operator=_get_owner(job),
957 run_summary=_get_run_summary(job),
958 state=_htc_status_to_wms_state(job),
959 jobs=[],
960 total_number_jobs=total_jobs,
961 job_state_counts=state_counts)
963 run_reports[report.wms_id] = report
965 return run_reports, ""
968def _add_run_info(wms_path, job):
969 """Find BPS run information elsewhere for runs without bps attributes.
971 Parameters
972 ----------
973 wms_path : `str`
974 Path to submit files for the run.
975 job : `dict` [`str`, `Any`]
976 HTCondor dag job information.
978 Raises
979 ------
980 StopIteration
981 If the submit file it is looking for cannot be found. Permission
982 errors are caught and the job's run is marked accordingly.
983 """
984 path = Path(wms_path) / "jobs"
985 try:
986 subfile = next(path.glob("**/*.sub"))
987 except (StopIteration, PermissionError):
988 job["bps_run"] = "Unavailable"
989 else:
990 _LOG.debug("_add_run_info: subfile = %s", subfile)
991 try:
992 with open(subfile, "r", encoding='utf-8') as fh:
993 for line in fh:
994 if line.startswith("+bps_"):
995 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
996 if m:
997 _LOG.debug("Matching line: %s", line)
998 job[m.group(1)] = m.group(2).replace('"', "")
999 else:
1000 _LOG.debug("Could not parse attribute: %s", line)
1001 except PermissionError:
1002 job["bps_run"] = "PermissionError"
1003 _LOG.debug("After adding job = %s", job)
1006def _get_owner(job):
1007 """Get the owner of a dag job.
1009 Parameters
1010 ----------
1011 job : `dict` [`str`, `Any`]
1012 HTCondor dag job information.
1014 Returns
1015 -------
1016 owner : `str`
1017 Owner of the dag job.
1018 """
1019 owner = job.get("bps_operator", None)
1020 if not owner:
1021 owner = job.get("Owner", None)
1022 if not owner:
1023 _LOG.warning("Could not get Owner from htcondor job: %s", job)
1024 owner = "MISS"
1025 return owner
1028def _get_run_summary(job):
1029 """Get the run summary for a job.
1031 Parameters
1032 ----------
1033 job : `dict` [`str`, `Any`]
1034 HTCondor dag job information.
1036 Returns
1037 -------
1038 summary : `str`
1039 Number of jobs per PipelineTask label in approximate pipeline order.
1040 Format: <label>:<count>[;<label>:<count>]+
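        A hypothetical example value: ``pipetaskInit:1;isr:20;finalJob:1``.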
1041 """
1042 summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
1043 if not summary:
1044 summary, _ = summary_from_dag(job["Iwd"])
1045 if not summary:
1046 _LOG.warning("Could not get run summary for htcondor job: %s", job)
1047 _LOG.debug("_get_run_summary: summary=%s", summary)
1049 # Workaround: sometimes "init" is used instead of "pipetaskInit".
1050 summary = summary.replace("init:", "pipetaskInit:")
1052 if "pegasus_version" in job and "pegasus" not in summary:
1053 summary += ";pegasus:0"
1055 return summary
1058def _get_state_counts_from_jobs(wms_workflow_id, jobs):
1059 """Count number of jobs per WMS state.
1061 Parameters
1062 ----------
1063 wms_workflow_id : `str`
1064 HTCondor job id.
1065 jobs : `dict` [`str`, `Any`]
1066 HTCondor dag job information.
1068 Returns
1069 -------
1070 total_count : `int`
1071 Total number of dag nodes.
1072 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
1073 Keys are the different WMS states and values are counts of jobs
1074 that are in that WMS state.
1075 """
1076 state_counts = dict.fromkeys(WmsStates, 0)
1078 for jid, jinfo in jobs.items():
1079 if jid != wms_workflow_id:
1080 state_counts[_htc_status_to_wms_state(jinfo)] += 1
1082 total_counted = sum(state_counts.values())
1083 if "NodesTotal" in jobs[wms_workflow_id]:
1084 total_count = jobs[wms_workflow_id]["NodesTotal"]
1085 else:
1086 total_count = total_counted
1088 state_counts[WmsStates.UNREADY] += total_count - total_counted
1090 return total_count, state_counts
1093def _get_state_counts_from_dag_job(job):
1094 """Count number of jobs per WMS state.
1096 Parameters
1097 ----------
1098 job : `dict` [`str`, `Any`]
1099 HTCondor dag job information.
1101 Returns
1102 -------
1103 total_count : `int`
1104 Total number of dag nodes.
1105 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
1106 Keys are the different WMS states and values are counts of jobs
1107 that are in that WMS state.
1108 """
1109 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
1110 state_counts = dict.fromkeys(WmsStates, 0)
1111 if "DAG_NodesReady" in job:
1112 state_counts = {
1113 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
1114 WmsStates.READY: job.get("DAG_NodesReady", 0),
1115 WmsStates.HELD: job.get("JobProcsHeld", 0),
1116 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
1117 WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
1118 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)}
1119 total_jobs = job.get("DAG_NodesTotal")
1120 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
1121 elif "NodesFailed" in job:
1122 state_counts = {
1123 WmsStates.UNREADY: job.get("NodesUnready", 0),
1124 WmsStates.READY: job.get("NodesReady", 0),
1125 WmsStates.HELD: job.get("JobProcsHeld", 0),
1126 WmsStates.SUCCEEDED: job.get("NodesDone", 0),
1127 WmsStates.FAILED: job.get("NodesFailed", 0),
1128 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)}
1129 try:
1130 total_jobs = job.get("NodesTotal")
1131 except KeyError as ex:
1132 _LOG.error("Job missing %s. job = %s", str(ex), job)
1133 raise
1134 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
1135 else:
1136 # With Kerberos job auth and the Kerberos bug, a warning here would be
1137 # printed for every DAG, so log at debug level instead.
1138 _LOG.debug("Can't get job state counts %s", job["Iwd"])
1139 total_jobs = 0
1141 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
1142 return total_jobs, state_counts
1145def _htc_status_to_wms_state(job):
1146 """Convert HTCondor job status to generic wms state.
1148 Parameters
1149 ----------
1150 job : `dict` [`str`, `Any`]
1151 HTCondor job information.
1153 Returns
1154 -------
1155 wms_state : `WmsStates`
1156 The equivalent WmsState to given job's status.
1157 """
1158 wms_state = WmsStates.MISFIT
1159 if "JobStatus" in job:
1160 wms_state = _htc_job_status_to_wms_state(job)
1161 elif "NodeStatus" in job:
1162 wms_state = _htc_node_status_to_wms_state(job)
1163 return wms_state
1166def _htc_job_status_to_wms_state(job):
1167 """Convert HTCondor job status to generic wms state.
1169 Parameters
1170 ----------
1171 job : `dict` [`str`, `Any`]
1172 HTCondor job information.
1174 Returns
1175 -------
1176 wms_state : `lsst.ctrl.bps.WmsStates`
1177 The equivalent WmsState to given job's status.
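    Examples
    --------
    A sketch assuming `lssthtc.JobStatus` follows HTCondor's standard status
    codes (5 means held, as also assumed by the hold/release expressions
    elsewhere in this module):

    >>> _htc_job_status_to_wms_state({"ClusterId": 1, "JobStatus": 5}) == WmsStates.HELD
    True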
1178 """
1179 _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"],
1180 type(job["JobStatus"]))
1181 job_status = int(job["JobStatus"])
1182 wms_state = WmsStates.MISFIT
1184 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
1185 if job_status == JobStatus.IDLE:
1186 wms_state = WmsStates.PENDING
1187 elif job_status == JobStatus.RUNNING:
1188 wms_state = WmsStates.RUNNING
1189 elif job_status == JobStatus.REMOVED:
1190 wms_state = WmsStates.DELETED
1191 elif job_status == JobStatus.COMPLETED:
1192 if job.get("ExitBySignal", False) or job.get("ExitCode", 0) or \
1193 job.get("ExitSignal", 0) or job.get("DAG_Status", 0) or \
1194 job.get("ReturnValue", 0):
1195 wms_state = WmsStates.FAILED
1196 else:
1197 wms_state = WmsStates.SUCCEEDED
1198 elif job_status == JobStatus.HELD:
1199 wms_state = WmsStates.HELD
1201 return wms_state
1204def _htc_node_status_to_wms_state(job):
1205 """Convert HTCondor status to generic wms state.
1207 Parameters
1208 ----------
1209 job : `dict` [`str`, `Any`]
1210 HTCondor job information.
1212 Returns
1213 -------
1214 wms_state : `lsst.ctrl.bps.WmsStates`
1215 The equivalent WmsState to given node's status.
1216 """
1217 wms_state = WmsStates.MISFIT
1219 status = job["NodeStatus"]
1220 if status == NodeStatus.NOT_READY:
1221 wms_state = WmsStates.UNREADY
1222 elif status == NodeStatus.READY:
1223 wms_state = WmsStates.READY
1224 elif status == NodeStatus.PRERUN:
1225 wms_state = WmsStates.MISFIT
1226 elif status == NodeStatus.SUBMITTED:
1227 if job["JobProcsHeld"]:
1228 wms_state = WmsStates.HELD
1229 elif job["StatusDetails"] == "not_idle":
1230 wms_state = WmsStates.RUNNING
1231 elif job["JobProcsQueued"]:
1232 wms_state = WmsStates.PENDING
1233 elif status == NodeStatus.POSTRUN:
1234 wms_state = WmsStates.MISFIT
1235 elif status == NodeStatus.DONE:
1236 wms_state = WmsStates.SUCCEEDED
1237 elif status == NodeStatus.ERROR:
1238 # Use the job's exit status instead of the post script's exit status.
1239 if "DAGMAN error 0" in job["StatusDetails"]:
1240 wms_state = WmsStates.SUCCEEDED
1241 else:
1242 wms_state = WmsStates.FAILED
1244 return wms_state
1247def _update_jobs(jobs1, jobs2):
1248 """Update jobs1 with info in jobs2.
1250 (Basically an update for nested dictionaries.)
1252 Parameters
1253 ----------
1254 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
1255 HTCondor job information to be updated.
1256 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
1257 Additional HTCondor job information.
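    Examples
    --------
    A minimal illustration of the nested update (ids are made up):

    >>> jobs = {"1234.0": {"JobStatus": 1}}
    >>> _update_jobs(jobs, {"1234.0": {"NodeStatus": 5}, "1235.0": {"JobStatus": 2}})
    >>> sorted(jobs), jobs["1234.0"]["JobStatus"], jobs["1234.0"]["NodeStatus"]
    (['1234.0', '1235.0'], 1, 5)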
1258 """
1259 for jid, jinfo in jobs2.items():
1260 if jid in jobs1:
1261 jobs1[jid].update(jinfo)
1262 else:
1263 jobs1[jid] = jinfo
1266def _wms_id_to_cluster(wms_id):
1267 """Convert WMS ID to cluster ID.
1269 Parameters
1270 ----------
1271 wms_id : `int` or `float` or `str`
1272 HTCondor job id or path.
1274 Returns
1275 -------
1276 cluster_id : `int`
1277 HTCondor cluster id.
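    Examples
    --------
    A numeric id is simply cast (a path argument would instead be resolved
    by reading the DAG log):

    >>> _wms_id_to_cluster("1234.0")
    1234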
1278 """
1279 # If wms_id represents path, get numeric id.
1280 try:
1281 cluster_id = int(float(wms_id))
1282 except ValueError:
1283 wms_path = Path(wms_id)
1284 if wms_path.exists():
1285 try:
1286 cluster_id, _ = read_dag_log(wms_id)
1287 cluster_id = int(float(cluster_id))
1288 except StopIteration:
1289 cluster_id = 0
1290 else:
1291 cluster_id = 0
1292 return cluster_id
1295def _create_request_memory_expr(memory, multiplier):
1296 """Construct an HTCondor ClassAd expression for safe memory scaling.
1298 Parameters
1299 ----------
1300 memory : `int`
1301 Requested memory in MB.
1302 multiplier : `float`
1303 Memory growth rate between retries.
1305 Returns
1306 -------
1307 ad : `str`
1308 A string representing an HTCondor ClassAd expression enabling safe
1309 memory scaling between job retries.
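    Examples
    --------
    For ``memory=2048`` and ``multiplier=2.0`` the returned expression reads
    roughly (abbreviated and wrapped here for readability)::

        (LastJobStatus =?= 5 && (...))
        ? int(2048 * pow(2.0, NumJobStarts))
        : max({2048, MemoryUsage ?: 0})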
1310 """
1311 # ClassAds 'Last*' are UNDEFINED when a job is put in the job queue.
1312 # The special comparison operators ensure that all comparisons below will
1313 # evaluate to FALSE in this case.
1314 was_mem_exceeded = "LastJobStatus =?= 5 " \
1315 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " \
1316 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"
1318 # If the job runs for the first time or was held for reasons other than
1319 # exceeding the memory, set the required memory to the requested value or
1320 # the memory value measured by HTCondor (MemoryUsage), whichever is
1321 # greater.
1322 ad = f"({was_mem_exceeded}) " \
1323 f"? int({memory} * pow({multiplier}, NumJobStarts)) " \
1324 f": max({{{memory}, MemoryUsage ?: 0}}))"
1325 return ad
1328def _gather_site_values(config, compute_site):
1329 """Gather values specific to given site.
1331 Parameters
1332 ----------
1333 config : `lsst.ctrl.bps.BpsConfig`
1334 BPS configuration that includes necessary submit/runtime
1335 information.
1336 compute_site : `str`
1337 Compute site name.
1339 Returns
1340 -------
1341 site_values : `dict` [`str`, `Any`]
1342 Values specific to the given site.
1343 """
1344 site_values = {"attrs": {}, "profile": {}}
1345 search_opts = {}
1346 if compute_site:
1347 search_opts["curvals"] = {"curr_site": compute_site}
1349 # Determine the hard limit for the memory requirement.
1350 found, limit = config.search('memoryLimit', opt=search_opts)
1351 if not found:
1352 search_opts["default"] = DEFAULT_HTC_EXEC_PATT
1353 _, patt = config.search("executeMachinesPattern", opt=search_opts)
1354 del search_opts["default"]
1356 # To reduce the amount of data, ignore dynamic slots (if any) as,
1357 # by definition, they cannot have more memory than
1358 # the partitionable slot they are part of.
1359 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)'
1360 pool_info = condor_status(constraint=constraint)
1361 try:
1362 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values())
1363 except ValueError:
1364 _LOG.debug("No execute machine in the pool matches %s", patt)
1365 if limit:
1366 config[".bps_defined.memory_limit"] = limit
1368 _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False})
1369 site_values["memoryLimit"] = limit
1371 key = f".site.{compute_site}.profile.condor"
1372 if key in config:
1373 for key, val in config[key].items():
1374 if key.startswith("+"):
1375 site_values["attrs"][key[1:]] = val
1376 else:
1377 site_values["profile"][key] = val
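    # Illustrative, hypothetical config snippet and the resulting split:
    #   site.mysite.profile.condor:
    #     +JOB_TAG: bps         -> goes to site_values["attrs"] (job ClassAd attribute)
    #     requirements: HasSsd  -> goes to site_values["profile"] (submit command)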
1379 return site_values