Coverage for python/lsst/ctrl/bps/htcondor/htcondor_service.py: 7%
741 statements
coverage.py v7.4.4, created at 2024-04-10 03:42 -0700
1# This file is part of ctrl_bps_htcondor.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <https://www.gnu.org/licenses/>.
28"""Interface between generic workflow to HTCondor workflow system.
29"""
31__all__ = ["HTCondorService", "HTCondorWorkflow"]
34import logging
35import os
36import re
37from collections import defaultdict
38from enum import IntEnum, auto
39from pathlib import Path
41import htcondor
42from lsst.ctrl.bps import (
43 BaseWmsService,
44 BaseWmsWorkflow,
45 GenericWorkflow,
46 GenericWorkflowJob,
47 WmsJobReport,
48 WmsRunReport,
49 WmsStates,
50)
51from lsst.ctrl.bps.bps_utils import chdir, create_count_summary
52from lsst.utils.timer import time_this
53from packaging import version
55from .lssthtc import (
56 MISSING_ID,
57 HTCDag,
58 HTCJob,
59 JobStatus,
60 NodeStatus,
61 condor_history,
62 condor_q,
63 condor_search,
64 condor_status,
65 htc_backup_files,
66 htc_check_dagman_output,
67 htc_create_submit_from_cmd,
68 htc_create_submit_from_dag,
69 htc_create_submit_from_file,
70 htc_escape,
71 htc_submit_dag,
72 htc_version,
73 pegasus_name_to_label,
74 read_dag_info,
75 read_dag_log,
76 read_dag_status,
77 read_node_status,
78 summary_from_dag,
79 write_dag_info,
80)
83class WmsIdType(IntEnum):
84 """Type of valid WMS ids."""
86 UNKNOWN = auto()
87 """The type of id cannot be determined.
88 """
90 LOCAL = auto()
91 """The id is HTCondor job's ClusterId (with optional '.ProcId').
92 """
94 GLOBAL = auto()
95 """Id is a HTCondor's global job id.
96 """
98 PATH = auto()
99 """Id is a submission path.
100 """
103DEFAULT_HTC_EXEC_PATT = ".*worker.*"
104"""Default pattern for searching execute machines in an HTCondor pool.
105"""
107_LOG = logging.getLogger(__name__)
110class HTCondorService(BaseWmsService):
111 """HTCondor version of WMS service."""
113 def prepare(self, config, generic_workflow, out_prefix=None):
114 """Convert generic workflow to an HTCondor DAG ready for submission.
116 Parameters
117 ----------
118 config : `lsst.ctrl.bps.BpsConfig`
119 BPS configuration that includes necessary submit/runtime
120 information.
121 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
122 The generic workflow (e.g., has executable name and arguments).
123 out_prefix : `str`
124 The root directory into which all WMS-specific files are written.
126 Returns
127 -------
128 workflow : `lsst.ctrl.bps.htcondor.HTCondorWorkflow`
129 HTCondor workflow ready to be run.
130 """
131 _LOG.debug("out_prefix = '%s'", out_prefix)
132 with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed HTCondor workflow creation"):
133 workflow = HTCondorWorkflow.from_generic_workflow(
134 config,
135 generic_workflow,
136 out_prefix,
137 f"{self.__class__.__module__}.{self.__class__.__name__}",
138 )
140 with time_this(
141 log=_LOG, level=logging.INFO, prefix=None, msg="Completed writing out HTCondor workflow"
142 ):
143 workflow.write(out_prefix)
144 return workflow
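    # Illustrative usage sketch (not part of the original module): how a caller
    # might drive prepare() and submit(). The names 'config', 'generic_workflow',
    # and 'out_prefix' are assumed to be provided by the standard BPS driver layer,
    # and the sketch assumes BaseWmsService takes the BPS config in its constructor.
    #
    #     service = HTCondorService(config)
    #     workflow = service.prepare(config, generic_workflow, out_prefix="submit/run1")
    #     service.submit(workflow)
    #     print(workflow.run_id)  # e.g. "1234.0" after a successful submission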
146 def submit(self, workflow, **kwargs):
147 """Submit a single HTCondor workflow.
149 Parameters
150 ----------
151 workflow : `lsst.ctrl.bps.BaseWorkflow`
152 A single HTCondor workflow to submit. run_id is updated after
153 successful submission to WMS.
154 **kwargs : `~typing.Any`
155 """
156 dag = workflow.dag
158 ver = version.parse(htc_version())
159 if ver >= version.parse("8.9.3"):
160 sub = htc_create_submit_from_dag(dag.graph["dag_filename"], {})
161 else:
162 sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], {})
164 # For workflow portability, internal paths are all relative. Hence
165 # the DAG needs to be submitted to HTCondor from inside the submit
166 # directory.
167 with chdir(workflow.submit_path):
168 _LOG.info("Submitting from directory: %s", os.getcwd())
169 schedd_dag_info = htc_submit_dag(sub)
170 if schedd_dag_info:
171 write_dag_info(f"{dag.name}.info.json", schedd_dag_info)
173 _, dag_info = schedd_dag_info.popitem()
174 _, dag_ad = dag_info.popitem()
176 dag.run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}"
177 workflow.run_id = dag.run_id
178 else:
179 raise RuntimeError("Submission failed: unable to retrieve DAGMan job information")
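    # Note (assumption inferred from the unpacking above, not from upstream
    # documentation): htc_submit_dag() appears to return a nested mapping of
    # the form
    #
    #     {schedd_name: {job_id: dag_classad}}
    #
    # e.g. {"sched01.example.com": {"1234.0": {"ClusterId": 1234, "ProcId": 0, ...}}},
    # which is why two popitem() calls are needed to reach the DAGMan job's ClassAd.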
181 def restart(self, wms_workflow_id):
182 """Restart a failed DAGMan workflow.
184 Parameters
185 ----------
186 wms_workflow_id : `str`
187 The directory with HTCondor files.
189 Returns
190 -------
191 run_id : `str`
192 HTCondor id of the restarted DAGMan job. If restart failed, it will
193 be set to None.
194 run_name : `str`
195 Name of the restarted workflow. If restart failed, it will be set
196 to None.
197 message : `str`
198 A message describing any issues encountered during the restart.
199 If there were no issues, an empty string is returned.
200 """
201 wms_path, id_type = _wms_id_to_dir(wms_workflow_id)
202 if wms_path is None:
203 return (
204 None,
205 None,
206 (
207 f"workflow with run id '{wms_workflow_id}' not found. "
208 f"Hint: use run's submit directory as the id instead"
209 ),
210 )
212 if id_type in {WmsIdType.GLOBAL, WmsIdType.LOCAL}:
213 if not wms_path.is_dir():
214 return None, None, f"submit directory '{wms_path}' for run id '{wms_workflow_id}' not found."
216 _LOG.info("Restarting workflow from directory '%s'", wms_path)
217 rescue_dags = list(wms_path.glob("*.dag.rescue*"))
218 if not rescue_dags:
219 return None, None, f"HTCondor rescue DAG(s) not found in '{wms_path}'"
221 _LOG.info("Verifying that the workflow is not already in the job queue")
222 schedd_dag_info = condor_q(constraint=f'regexp("dagman$", Cmd) && Iwd == "{wms_path}"')
223 if schedd_dag_info:
224 _, dag_info = schedd_dag_info.popitem()
225 _, dag_ad = dag_info.popitem()
226 id_ = dag_ad["GlobalJobId"]
227 return None, None, f"Workflow already in the job queue (global job id: '{id_}')"
229 _LOG.info("Checking execution status of the workflow")
230 warn = False
231 dag_ad = read_dag_status(str(wms_path))
232 if dag_ad:
233 nodes_total = dag_ad.get("NodesTotal", 0)
234 if nodes_total != 0:
235 nodes_done = dag_ad.get("NodesDone", 0)
236 if nodes_total == nodes_done:
237 return None, None, "All jobs in the workflow finished successfully"
238 else:
239 warn = True
240 else:
241 warn = True
242 if warn:
243 _LOG.warning(
244 "Cannot determine the execution status of the workflow, continuing with restart regardless"
245 )
247 _LOG.info("Backing up select HTCondor files from previous run attempt")
248 htc_backup_files(wms_path, subdir="backups")
250 # For workflow portability, internal paths are all relative. Hence
251 # the DAG needs to be resubmitted to HTCondor from inside the submit
252 # directory.
253 _LOG.info("Adding workflow to the job queue")
254 run_id, run_name, message = None, None, ""
255 with chdir(wms_path):
256 try:
257 dag_path = next(wms_path.glob("*.dag.condor.sub"))
258 except StopIteration:
259 message = f"DAGMan submit description file not found in '{wms_path}'"
260 else:
261 sub = htc_create_submit_from_file(dag_path.name)
262 schedd_dag_info = htc_submit_dag(sub)
264 # Save select information about the DAGMan job to a file. Use
265 # the run name (available in the ClassAd) as the filename.
266 if schedd_dag_info:
267 dag_info = next(iter(schedd_dag_info.values()))
268 dag_ad = next(iter(dag_info.values()))
269 write_dag_info(f"{dag_ad['bps_run']}.info.json", schedd_dag_info)
270 run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}"
271 run_name = dag_ad["bps_run"]
272 else:
273 message = "DAGMan job information unavailable"
275 return run_id, run_name, message
277 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
278 """Query WMS for list of submitted WMS workflows/jobs.
280 This should be a quick lookup function to create list of jobs for
281 other functions.
283 Parameters
284 ----------
285 wms_id : `int` or `str`, optional
286 Id or path that can be used by WMS service to look up job.
287 user : `str`, optional
288 User whose submitted jobs should be listed.
289 require_bps : `bool`, optional
290 Whether to require jobs returned in list to be bps-submitted jobs.
291 pass_thru : `str`, optional
292 Information to pass through to WMS.
293 is_global : `bool`, optional
294 If set, all job queues (and their histories) will be queried for
295 job information. Defaults to False which means that only the local
296 job queue will be queried.
298 Returns
299 -------
300 job_ids : `list` [`Any`]
301 Only job ids to be used by cancel and other functions. Typically
302 this means top-level jobs (i.e., not child jobs).
303 """
304 _LOG.debug(
305 "list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s",
306 wms_id,
307 user,
308 require_bps,
309 pass_thru,
310 is_global,
311 )
313 # Determine which Schedds will be queried for job information.
314 coll = htcondor.Collector()
316 schedd_ads = []
317 if is_global:
318 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
319 else:
320 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
322 # Construct appropriate constraint expression using provided arguments.
323 constraint = "False"
324 if wms_id is None:
325 if user is not None:
326 constraint = f'(Owner == "{user}")'
327 else:
328 schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id)
329 if cluster_id is not None:
330 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"
332 # If provided id is either a submission path or a global id,
333 # make sure the right Schedd will be queried regardless of
334 # 'is_global' value.
335 if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}:
336 schedd_ads = [schedd_ad]
337 if require_bps:
338 constraint += ' && (bps_isjob == "True")'
339 if pass_thru:
340 if "-forcex" in pass_thru:
341 pass_thru_2 = pass_thru.replace("-forcex", "")
342 if pass_thru_2 and not pass_thru_2.isspace():
343 constraint += f" && ({pass_thru_2})"
344 else:
345 constraint += f" && ({pass_thru})"
347 # Create a list of scheduler daemons which need to be queried.
348 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}
350 _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds))
351 results = condor_q(constraint=constraint, schedds=schedds)
353 # Prune child jobs where DAG job is in queue (i.e., aren't orphans).
354 job_ids = []
355 for schedd_name, job_info in results.items():
356 for job_id, job_ad in job_info.items():
357 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None"))
358 if "DAGManJobId" not in job_ad:
359 job_ids.append(job_ad.get("GlobalJobId", job_id))
360 else:
361 _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0")
362 _LOG.debug("\tin jobs.keys() = %s", job_info.keys())
363 if f"{job_ad['DAGManJobId']}.0" not in job_info: # orphaned job
364 job_ids.append(job_ad.get("GlobalJobId", job_id))
366 _LOG.debug("job_ids = %s", job_ids)
367 return job_ids
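    # Illustrative sketch of the constraint built above (values are made up):
    # for wms_id=1234 with require_bps=True and no pass_thru, the query becomes
    #
    #     (DAGManJobId == 1234 || ClusterId == 1234) && (bps_isjob == "True")
    #
    # which matches both the DAGMan job itself and any of its node jobs.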
369 def report(
370 self,
371 wms_workflow_id=None,
372 user=None,
373 hist=0,
374 pass_thru=None,
375 is_global=False,
376 return_exit_codes=False,
377 ):
378 """Return run information based upon given constraints.
380 Parameters
381 ----------
382 wms_workflow_id : `str`, optional
383 Limit to specific run based on id.
384 user : `str`, optional
385 Limit results to runs for this user.
386 hist : `float`, optional
387 Limit history search to this many days. Defaults to 0.
388 pass_thru : `str`, optional
389 Constraints to pass through to HTCondor.
390 is_global : `bool`, optional
391 If set, all job queues (and their histories) will be queried for
392 job information. Defaults to False which means that only the local
393 job queue will be queried.
394 return_exit_codes : `bool`, optional
395 If set, return exit codes related to jobs with a
396 non-success status. Defaults to False, which means that only
397 the summary state is returned.
399 Only applicable in the context of a WMS with associated
400 handlers to return exit codes from jobs.
402 Returns
403 -------
404 runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
405 Information about runs from given job information.
406 message : `str`
407 Extra message for report command to print. This could be pointers
408 to documentation or to WMS specific commands.
409 """
410 if wms_workflow_id:
411 id_type = _wms_id_type(wms_workflow_id)
412 if id_type == WmsIdType.LOCAL:
413 schedulers = _locate_schedds(locate_all=is_global)
414 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers)
415 elif id_type == WmsIdType.GLOBAL:
416 schedulers = _locate_schedds(locate_all=True)
417 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers)
418 elif id_type == WmsIdType.PATH:
419 run_reports, message = _report_from_path(wms_workflow_id)
420 else:
421 run_reports, message = {}, "Invalid job id"
422 else:
423 schedulers = _locate_schedds(locate_all=is_global)
424 run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers)
425 _LOG.debug("report: %s, %s", run_reports, message)
427 return list(run_reports.values()), message
429 def cancel(self, wms_id, pass_thru=None):
430 """Cancel submitted workflows/jobs.
432 Parameters
433 ----------
434 wms_id : `str`
435 Id or path of job that should be canceled.
436 pass_thru : `str`, optional
437 Information to pass through to WMS.
439 Returns
440 -------
441 deleted : `bool`
442 Whether the deletion was successful. Currently, False is returned if
443 there is any doubt or if any individual job was not deleted.
444 message : `str`
445 Any message from WMS (e.g., error details).
446 """
447 _LOG.debug("Canceling wms_id = %s", wms_id)
449 schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id)
451 if cluster_id is None:
452 deleted = False
453 message = "invalid id"
454 else:
455 _LOG.debug(
456 "Canceling job managed by schedd_name = %s with cluster_id = %s",
457 schedd_ad["Name"],
458 cluster_id,
459 )
460 schedd = htcondor.Schedd(schedd_ad)
462 constraint = f"ClusterId == {cluster_id}"
463 if pass_thru is not None and "-forcex" in pass_thru:
464 pass_thru_2 = pass_thru.replace("-forcex", "")
465 if pass_thru_2 and not pass_thru_2.isspace():
466 constraint += f"&& ({pass_thru_2})"
467 _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
468 results = schedd.act(htcondor.JobAction.RemoveX, constraint)
469 else:
470 if pass_thru:
471 constraint += f"&& ({pass_thru})"
472 _LOG.debug("JobAction.Remove constraint = %s", constraint)
473 results = schedd.act(htcondor.JobAction.Remove, constraint)
474 _LOG.debug("Remove results: %s", results)
476 if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
477 deleted = True
478 message = ""
479 else:
480 deleted = False
481 if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
482 message = "no such bps job in batch queue"
483 else:
484 message = f"unknown problems deleting: {results}"
486 _LOG.debug("deleted: %s; message = %s", deleted, message)
487 return deleted, message
490class HTCondorWorkflow(BaseWmsWorkflow):
491 """Single HTCondor workflow.
493 Parameters
494 ----------
495 name : `str`
496 Unique name for Workflow used when naming files.
497 config : `lsst.ctrl.bps.BpsConfig`
498 BPS configuration that includes necessary submit/runtime information.
499 """
501 def __init__(self, name, config=None):
502 super().__init__(name, config)
503 self.dag = None
505 @classmethod
506 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
507 # Docstring inherited
508 htc_workflow = cls(generic_workflow.name, config)
509 htc_workflow.dag = HTCDag(name=generic_workflow.name)
511 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
512 htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
513 htc_workflow.dag.add_attribs(
514 {
515 "bps_wms_service": service_class,
516 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
517 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
518 "bps_job_summary": create_count_summary(generic_workflow.job_counts),
519 }
520 )
522 _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""})
523 if isinstance(tmp_template, str):
524 subdir_template = defaultdict(lambda: tmp_template)
525 else:
526 subdir_template = tmp_template
528 # Create all DAG jobs
529 site_values = {} # cache compute site specific values to reduce config lookups
530 for job_name in generic_workflow:
531 gwjob = generic_workflow.get_job(job_name)
532 if gwjob.compute_site not in site_values:
533 site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
534 htc_job = _create_job(
535 subdir_template[gwjob.label],
536 site_values[gwjob.compute_site],
537 generic_workflow,
538 gwjob,
539 out_prefix,
540 )
541 htc_workflow.dag.add_job(htc_job)
543 # Add job dependencies to the DAG
544 for job_name in generic_workflow:
545 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))
547 # If final job exists in generic workflow, create DAG final job
548 final = generic_workflow.get_final()
549 if final and isinstance(final, GenericWorkflowJob):
550 if final.compute_site and final.compute_site not in site_values:
551 site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
552 final_htjob = _create_job(
553 subdir_template[final.label],
554 site_values[final.compute_site],
555 generic_workflow,
556 final,
557 out_prefix,
558 )
559 if "post" not in final_htjob.dagcmds:
560 final_htjob.dagcmds["post"] = (
561 f"{os.path.dirname(__file__)}/final_post.sh {final.name} $DAG_STATUS $RETURN"
562 )
563 htc_workflow.dag.add_final_job(final_htjob)
564 elif final and isinstance(final, GenericWorkflow):
565 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
566 elif final:
567 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
569 return htc_workflow
571 def write(self, out_prefix):
572 """Output HTCondor DAGMan files needed for workflow submission.
574 Parameters
575 ----------
576 out_prefix : `str`
577 Directory prefix for HTCondor files.
578 """
579 self.submit_path = out_prefix
580 os.makedirs(out_prefix, exist_ok=True)
582 # Write down the workflow in HTCondor format.
583 self.dag.write(out_prefix, "jobs/{self.label}")
586def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix):
587 """Convert GenericWorkflow job nodes to DAG jobs.
589 Parameters
590 ----------
591 subdir_template : `str`
592 Template for making subdirs.
593 site_values : `dict`
594 Site specific values.
595 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
596 Generic workflow that is being converted.
597 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
598 The generic job to convert to a HTCondor job.
599 out_prefix : `str`
600 Directory prefix for HTCondor files.
602 Returns
603 -------
604 htc_job : `lsst.ctrl.bps.htcondor.HTCJob`
605 The HTCondor job equivalent to the given generic job.
606 """
607 htc_job = HTCJob(gwjob.name, label=gwjob.label)
609 curvals = defaultdict(str)
610 curvals["label"] = gwjob.label
611 if gwjob.tags:
612 curvals.update(gwjob.tags)
614 subdir = subdir_template.format_map(curvals)
615 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"
617 htc_job_cmds = {
618 "universe": "vanilla",
619 "should_transfer_files": "YES",
620 "when_to_transfer_output": "ON_EXIT_OR_EVICT",
621 "transfer_output_files": '""', # Set to empty string to disable
622 "transfer_executable": "False",
623 "getenv": "True",
624 # Exceeding memory sometimes triggers SIGBUS or SIGSEGV errors. Tell
625 # HTCondor to put on hold any job which exited via a signal.
626 "on_exit_hold": "ExitBySignal == true",
627 "on_exit_hold_reason": 'strcat("Job raised a signal ", string(ExitSignal), ". ", '
628 '"Handling signal as if job has gone over memory limit.")',
629 "on_exit_hold_subcode": "34",
630 }
632 htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob))
634 # job stdout, stderr, htcondor user log.
635 for key in ("output", "error", "log"):
636 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
637 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
639 htc_job_cmds.update(
640 _handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], out_prefix)
641 )
643 # Add the job cmds dict to the job object.
644 htc_job.add_job_cmds(htc_job_cmds)
646 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))
648 # Add job attributes to job.
649 _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
650 htc_job.add_job_attrs(gwjob.attrs)
651 htc_job.add_job_attrs(site_values["attrs"])
652 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
653 htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})
655 return htc_job
658def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
659 """Translate the job data that are one to one mapping
661 Parameters
662 ----------
663 cached_vals : `dict` [`str`, `Any`]
664 Config values common to jobs with same label.
665 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
666 Generic workflow that contains the job being converted.
667 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
668 Generic workflow job to be converted.
670 Returns
671 -------
672 htc_job_commands : `dict` [`str`, `Any`]
673 Contains commands which can appear in the HTCondor submit description
674 file.
675 """
676 # Values in the job script that just are name mappings.
677 job_translation = {
678 "mail_to": "notify_user",
679 "when_to_mail": "notification",
680 "request_cpus": "request_cpus",
681 "priority": "priority",
682 "category": "category",
683 "accounting_group": "accounting_group",
684 "accounting_user": "accounting_group_user",
685 }
687 jobcmds = {}
688 for gwkey, htckey in job_translation.items():
689 jobcmds[htckey] = getattr(gwjob, gwkey, None)
691 # If accounting info was not set explicitly, use site settings if any.
692 if not gwjob.accounting_group:
693 jobcmds["accounting_group"] = cached_vals.get("accountingGroup")
694 if not gwjob.accounting_user:
695 jobcmds["accounting_group_user"] = cached_vals.get("accountingUser")
697 # job commands that need modification
698 if gwjob.number_of_retries:
699 jobcmds["max_retries"] = f"{gwjob.number_of_retries}"
701 if gwjob.retry_unless_exit:
702 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"
704 if gwjob.request_disk:
705 jobcmds["request_disk"] = f"{gwjob.request_disk}MB"
707 if gwjob.request_memory:
708 jobcmds["request_memory"] = f"{gwjob.request_memory}"
710 if gwjob.memory_multiplier:
711 # Do not use try-except! At the moment, BpsConfig returns an empty
712 # string if it does not contain the key.
713 memory_limit = cached_vals["memoryLimit"]
714 if not memory_limit:
715 raise RuntimeError(
716 "Memory autoscaling enabled, but automatic detection of the memory limit "
717 "failed; setting it explicitly with 'memoryLimit' or changing worker node "
718 "search pattern 'executeMachinesPattern' might help."
719 )
721 # Set maximal amount of memory job can ask for.
722 #
723 # The check below assumes that 'memory_limit' was set to a value which
724 # realistically reflects actual physical limitations of a given compute
725 # resource.
726 memory_max = memory_limit
727 if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit:
728 memory_max = gwjob.request_memory_max
730 # Make job ask for more memory each time it failed due to insufficient
731 # memory requirements.
732 jobcmds["request_memory"] = _create_request_memory_expr(
733 gwjob.request_memory, gwjob.memory_multiplier, memory_max
734 )
736 # Periodically release jobs which are being held due to exceeding
737 # memory. Stop doing that (by removing the job from the HTCondor queue)
738 # after the maximal number of retries has been reached or the job was
739 # already run at maximal allowed memory.
740 jobcmds["periodic_release"] = _create_periodic_release_expr(
741 gwjob.request_memory, gwjob.memory_multiplier, memory_max
742 )
743 jobcmds["periodic_remove"] = _create_periodic_remove_expr(
744 gwjob.request_memory, gwjob.memory_multiplier, memory_max
745 )
747 # Assume concurrency_limit implemented using HTCondor concurrency limits.
748 # May need to move to special site-specific implementation if sites use
749 # other mechanisms.
750 if gwjob.concurrency_limit:
751 jobcmds["concurrency_limit"] = gwjob.concurrency_limit
753 # Handle command line
754 if gwjob.executable.transfer_executable:
755 jobcmds["transfer_executable"] = "True"
756 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
757 else:
758 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)
760 if gwjob.arguments:
761 arguments = gwjob.arguments
762 arguments = _replace_cmd_vars(arguments, gwjob)
763 arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob)
764 arguments = _fix_env_var_syntax(arguments)
765 jobcmds["arguments"] = arguments
767 # Add extra "pass-thru" job commands
768 if gwjob.profile:
769 for key, val in gwjob.profile.items():
770 jobcmds[key] = htc_escape(val)
771 for key, val in cached_vals["profile"].items():
772 jobcmds[key] = htc_escape(val)
774 return jobcmds
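# Illustrative sketch (hypothetical values): for a generic job with
# request_memory=2048, request_disk=3000, and number_of_retries=3, the
# translation above yields, among others,
#
#     {"request_memory": "2048", "request_disk": "3000MB", "max_retries": "3"}
#
# plus the simple name-mapped keys such as 'priority' and 'category'.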
777def _translate_dag_cmds(gwjob):
778 """Translate job values into DAGMan commands.
780 Parameters
781 ----------
782 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
783 Job containing values to be translated.
785 Returns
786 -------
787 dagcmds : `dict` [`str`, `Any`]
788 DAGMan commands for the job.
789 """
790 # Values in the dag script that just are name mappings.
791 dag_translation = {"abort_on_value": "abort_dag_on", "abort_return_value": "abort_exit"}
793 dagcmds = {}
794 for gwkey, htckey in dag_translation.items():
795 dagcmds[htckey] = getattr(gwjob, gwkey, None)
797 # Still to be coded: vars "pre_cmdline", "post_cmdline"
798 return dagcmds
801def _fix_env_var_syntax(oldstr):
802 """Change ENV place holders to HTCondor Env var syntax.
804 Parameters
805 ----------
806 oldstr : `str`
807 String in which environment variable syntax is to be fixed.
809 Returns
810 -------
811 newstr : `str`
812 Given string with environment variable syntax fixed.
813 """
814 newstr = oldstr
815 for key in re.findall(r"<ENV:([^>]+)>", oldstr):
816 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
817 return newstr
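# Example (illustrative): the <ENV:...> placeholders used in generic workflows
# are rewritten to HTCondor's $ENV() syntax, e.g.
#
#     _fix_env_var_syntax("<ENV:HOME>/bin/run.sh --tmp <ENV:TMPDIR>")
#     == "$ENV(HOME)/bin/run.sh --tmp $ENV(TMPDIR)"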
820def _replace_file_vars(use_shared, arguments, workflow, gwjob):
821 """Replace file placeholders in command line arguments with correct
822 physical file names.
824 Parameters
825 ----------
826 use_shared : `bool`
827 Whether HTCondor can assume shared filesystem.
828 arguments : `str`
829 Arguments string in which to replace file placeholders.
830 workflow : `lsst.ctrl.bps.GenericWorkflow`
831 Generic workflow that contains file information.
832 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
833 The job corresponding to the arguments.
835 Returns
836 -------
837 arguments : `str`
838 Given arguments string with file placeholders replaced.
839 """
840 # Replace input file placeholders with paths.
841 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
842 if not gwfile.wms_transfer:
843 # Must assume full URI if in command line and told WMS is not
844 # responsible for transferring file.
845 uri = gwfile.src_uri
846 elif use_shared:
847 if gwfile.job_shared:
848 # Have shared filesystems and jobs can share file.
849 uri = gwfile.src_uri
850 else:
851 # Taking advantage of inside knowledge. Not future-proof.
852 # Temporary fix until there is a job wrapper that pulls files
853 # within the job.
854 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml":
855 uri = "butler.yaml"
856 else:
857 uri = os.path.basename(gwfile.src_uri)
858 else: # Using push transfer
859 uri = os.path.basename(gwfile.src_uri)
860 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
862 # Replace output file placeholders with paths.
863 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
864 if not gwfile.wms_transfer:
865 # Must assume full URI if in command line and told WMS is not
866 # responsible for transferring file.
867 uri = gwfile.src_uri
868 elif use_shared:
869 if gwfile.job_shared:
870 # Have shared filesystems and jobs can share file.
871 uri = gwfile.src_uri
872 else:
873 uri = os.path.basename(gwfile.src_uri)
874 else: # Using push transfer
875 uri = os.path.basename(gwfile.src_uri)
876 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
877 return arguments
880def _replace_cmd_vars(arguments, gwjob):
881 """Replace format-style placeholders in arguments.
883 Parameters
884 ----------
885 arguments : `str`
886 Arguments string in which to replace placeholders.
887 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
888 Job containing values to be used to replace placeholders
889 (in particular gwjob.cmdvals).
891 Returns
892 -------
893 arguments : `str`
894 Given arguments string with placeholders replaced.
895 """
896 try:
897 arguments = arguments.format(**gwjob.cmdvals)
898 except (KeyError, TypeError): # TypeError in case None instead of {}
899 _LOG.error(
900 "Could not replace command variables:\narguments: %s\ncmdvals: %s", arguments, gwjob.cmdvals
901 )
902 raise
903 return arguments
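# Example (illustrative, hypothetical cmdvals): the placeholders are ordinary
# str.format() fields filled from gwjob.cmdvals, e.g. an arguments string of
#
#     "{cmd} {opts}"
#
# with gwjob.cmdvals == {"cmd": "pipetask", "opts": "--long-log"} becomes
# "pipetask --long-log". Missing keys (or cmdvals being None) are logged and
# re-raised.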
906def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str):
907 """Add job input files from generic workflow to job.
909 Parameters
910 ----------
911 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
912 The generic workflow (e.g., has executable name and arguments).
913 job_name : `str`
914 Unique name for the job.
915 use_shared : `bool`
916 Whether job has access to files via shared filesystem.
917 out_prefix : `str`
918 The root directory into which all WMS-specific files are written.
920 Returns
921 -------
922 htc_commands : `dict` [`str`, `str`]
923 HTCondor commands for the job submission script.
924 """
925 htc_commands = {}
926 inputs = []
927 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
928 _LOG.debug("src_uri=%s", gwf_file.src_uri)
930 uri = Path(gwf_file.src_uri)
932 # Note if use_shared and job_shared, don't need to transfer file.
934 if not use_shared: # Copy file using push to job
935 inputs.append(str(uri.relative_to(out_prefix)))
936 elif not gwf_file.job_shared: # Jobs require own copy
937 # if using shared filesystem, but still need copy in job. Use
938 # HTCondor's curl plugin for a local copy.
940 # Execution butler is represented as a directory which the
941 # curl plugin does not handle. Taking advantage of inside
942 # knowledge for a temporary fix until there is a job wrapper that
943 # pulls files within the job.
944 if gwf_file.name == "butlerConfig":
945 # The execution butler directory doesn't normally exist until
946 # the submit phase so checking for suffix instead of using
947 # is_dir(). If another non-yaml file existed, it would have a
948 # different gwf_file.name.
949 if uri.suffix == ".yaml": # Single file, so just copy.
950 inputs.append(f"file://{uri}")
951 else:
952 inputs.append(f"file://{uri / 'butler.yaml'}")
953 inputs.append(f"file://{uri / 'gen3.sqlite3'}")
954 elif uri.is_dir():
955 raise RuntimeError(
956 f"HTCondor plugin cannot transfer directories locally within job {gwf_file.src_uri}"
957 )
958 else:
959 inputs.append(f"file://{uri}")
961 if inputs:
962 htc_commands["transfer_input_files"] = ",".join(inputs)
963 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
964 return htc_commands
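# Illustrative result (hypothetical paths): with push transfer (use_shared=False)
# the returned command is a comma-separated list of paths relative to out_prefix,
# e.g.
#
#     {"transfer_input_files": "inputs/a.qgraph,inputs/butler.yaml"}
#
# while the shared-filesystem curl-plugin branch produces "file://..." URIs instead.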
967def _report_from_path(wms_path):
968 """Gather run information from a given run directory.
970 Parameters
971 ----------
972 wms_path : `str`
973 The directory containing the submit side files (e.g., HTCondor files).
975 Returns
976 -------
977 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
978 Run information for the detailed report. The key is the HTCondor id
979 and the value is a collection of report information for that run.
980 message : `str`
981 Message to be printed with the summary report.
982 """
983 wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
984 if wms_workflow_id == MISSING_ID:
985 run_reports = {}
986 else:
987 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
988 return run_reports, message
991def _report_from_id(wms_workflow_id, hist, schedds=None):
992 """Gather run information using workflow id.
994 Parameters
995 ----------
996 wms_workflow_id : `str`
997 Limit to specific run based on id.
998 hist : `float`
999 Limit history search to this many days.
1000 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
1001 HTCondor schedulers to query for job information. If None
1002 (default), all queries will be run against the local scheduler only.
1004 Returns
1005 -------
1006 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
1007 Run information for the detailed report. The key is the HTCondor id
1008 and the value is a collection of report information for that run.
1009 message : `str`
1010 Message to be printed with the summary report.
1011 """
1012 messages = []
1014 # Collect information about the job by querying HTCondor schedd and
1015 # HTCondor history.
1016 schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds)
1017 if len(schedd_dag_info) == 1:
1018 # Extract the DAG info without altering the results of the query.
1019 schedd_name = next(iter(schedd_dag_info))
1020 dag_id = next(iter(schedd_dag_info[schedd_name]))
1021 dag_ad = schedd_dag_info[schedd_name][dag_id]
1023 # If the provided workflow id does not correspond to the one extracted
1024 # from the DAGMan log file in the submit directory, rerun the query
1025 # with the id found in the file.
1026 #
1027 # This is to cover the situation in which the user provided the old job
1028 # id of a restarted run.
1029 try:
1030 path_dag_id, path_dag_ad = read_dag_log(dag_ad["Iwd"])
1031 except FileNotFoundError as exc:
1032 # At the moment missing DAGMan log is pretty much a fatal error.
1033 # So empty the DAG info to finish early (see the if statement
1034 # below).
1035 schedd_dag_info.clear()
1036 messages.append(f"Cannot create the report for '{dag_id}': {exc}")
1037 else:
1038 if path_dag_id != dag_id:
1039 schedd_dag_info = _get_info_from_schedd(path_dag_id, hist, schedds)
1040 messages.append(
1041 f"WARNING: Found newer workflow executions in same submit directory as id '{dag_id}'. "
1042 "This normally occurs when a run is restarted. The report shown is for the most "
1043 f"recent status with run id '{path_dag_id}'"
1044 )
1046 if len(schedd_dag_info) == 0:
1047 run_reports = {}
1048 elif len(schedd_dag_info) == 1:
1049 _, dag_info = schedd_dag_info.popitem()
1050 dag_id, dag_ad = dag_info.popitem()
1052 # Create a mapping between jobs and their classads. The keys will
1053 # be of format 'ClusterId.ProcId'.
1054 job_info = {dag_id: dag_ad}
1056 # Find jobs (nodes) belonging to that DAGMan job.
1057 job_constraint = f"DAGManJobId == {int(float(dag_id))}"
1058 schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds)
1059 if schedd_job_info:
1060 _, node_info = schedd_job_info.popitem()
1061 job_info.update(node_info)
1063 # Collect additional pieces of information about jobs using HTCondor
1064 # files in the submission directory.
1065 _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"])
1066 _update_jobs(job_info, path_jobs)
1067 if message:
1068 messages.append(message)
1069 run_reports = _create_detailed_report_from_jobs(dag_id, job_info)
1070 else:
1071 ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()]
1072 message = (
1073 f"More than one job matches id '{wms_workflow_id}', "
1074 f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids"
1075 )
1076 messages.append(message)
1077 run_reports = {}
1079 message = "\n".join(messages)
1080 return run_reports, message
1083def _get_info_from_schedd(wms_workflow_id, hist, schedds):
1084 """Gather run information from HTCondor.
1086 Parameters
1087 ----------
1088 wms_workflow_id : `str`
1089 Limit to specific run based on id.
1090 hist : `float`
1091 Limit history search to this many days.
1092 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
1093 HTCondor schedulers to query for job information. If None
1094 (default), all queries will be run against the local scheduler only.
1096 Returns
1097 -------
1098 schedd_dag_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, `Any`]]]
1099 Information about jobs satisfying the search criteria where for each
1100 Scheduler, local HTCondor job ids are mapped to their respective
1101 classads.
1102 """
1103 dag_constraint = 'regexp("dagman$", Cmd)'
1104 try:
1105 cluster_id = int(float(wms_workflow_id))
1106 except ValueError:
1107 dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"'
1108 else:
1109 dag_constraint += f" && ClusterId == {cluster_id}"
1111 # With the current implementation of the condor_* functions the query
1112 # will always return only one match per Scheduler.
1113 #
1114 # Even in the highly unlikely situation where HTCondor history (which
1115 # condor_search queries too) is long enough to have jobs from before
1116 # the cluster ids were rolled over (and as a result there is more than
1117 # one job with the same cluster id) they will not show up in
1118 # the results.
1119 schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds)
1120 return schedd_dag_info
1123def _get_info_from_path(wms_path):
1124 """Gather run information from a given run directory.
1126 Parameters
1127 ----------
1128 wms_path : `str`
1129 Directory containing HTCondor files.
1131 Returns
1132 -------
1133 wms_workflow_id : `str`
1134 The run id which is a DAGman job id.
1135 jobs : `dict` [`str`, `dict` [`str`, `Any`]]
1136 Information about jobs read from files in the given directory.
1137 The key is the HTCondor id and the value is a dictionary of HTCondor
1138 keys and values.
1139 message : `str`
1140 Message to be printed with the summary report.
1141 """
1142 messages = []
1143 try:
1144 wms_workflow_id, jobs = read_dag_log(wms_path)
1145 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
1146 _update_jobs(jobs, read_node_status(wms_path))
1147 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)
1149 # Add more info for DAGman job
1150 job = jobs[wms_workflow_id]
1151 job.update(read_dag_status(wms_path))
1153 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
1154 if "bps_run" not in job:
1155 _add_run_info(wms_path, job)
1157 message = htc_check_dagman_output(wms_path)
1158 if message:
1159 messages.append(message)
1160 _LOG.debug(
1161 "_get_info: id = %s, total_jobs = %s", wms_workflow_id, jobs[wms_workflow_id]["total_jobs"]
1162 )
1164 # Add extra pieces of information which cannot be found in HTCondor
1165 # generated files like 'GlobalJobId'.
1166 #
1167 # Do not treat absence of this file as a serious error. Neither runs
1168 # submitted with earlier versions of the plugin nor the runs submitted
1169 # with the Pegasus plugin will have it at the moment. However, once enough
1170 # time passes and the Pegasus plugin has its own report() method
1171 # (instead of sneakily using HTCondor's), the lack of that file
1172 # should be treated as seriously as the lack of any other file.
1173 try:
1174 job_info = read_dag_info(wms_path)
1175 except FileNotFoundError as exc:
1176 message = f"Warn: Some information may not be available: {exc}"
1177 messages.append(message)
1178 else:
1179 schedd_name = next(iter(job_info))
1180 job_ad = next(iter(job_info[schedd_name].values()))
1181 job.update(job_ad)
1182 except FileNotFoundError:
1183 message = f"Could not find HTCondor files in '{wms_path}'"
1184 _LOG.warning(message)
1185 messages.append(message)
1186 wms_workflow_id = MISSING_ID
1187 jobs = {}
1189 message = "\n".join([msg for msg in messages if msg])
1190 return wms_workflow_id, jobs, message
1193def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
1194 """Gather run information to be used in generating summary reports.
1196 Parameters
1197 ----------
1198 wms_workflow_id : `str`
1199 The run id to create the report for.
1200 jobs : `dict` [`str`, `dict` [`str`, Any]]
1201 Mapping HTCondor job id to job information.
1203 Returns
1204 -------
1205 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
1206 Run information for the detailed report. The key is the given HTCondor
1207 id and the value is a collection of report information for that run.
1208 """
1209 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
1210 dag_job = jobs.pop(wms_workflow_id)
1211 report = WmsRunReport(
1212 wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}",
1213 global_wms_id=dag_job.get("GlobalJobId", "MISS"),
1214 path=dag_job["Iwd"],
1215 label=dag_job.get("bps_job_label", "MISS"),
1216 run=dag_job.get("bps_run", "MISS"),
1217 project=dag_job.get("bps_project", "MISS"),
1218 campaign=dag_job.get("bps_campaign", "MISS"),
1219 payload=dag_job.get("bps_payload", "MISS"),
1220 operator=_get_owner(dag_job),
1221 run_summary=_get_run_summary(dag_job),
1222 state=_htc_status_to_wms_state(dag_job),
1223 jobs=[],
1224 total_number_jobs=dag_job["total_jobs"],
1225 job_state_counts=dag_job["state_counts"],
1226 exit_code_summary=_get_exit_code_summary(jobs),
1227 )
1229 for job_id, job_info in jobs.items():
1230 try:
1231 job_report = WmsJobReport(
1232 wms_id=job_id,
1233 name=job_info.get("DAGNodeName", job_id),
1234 label=job_info.get("bps_job_label", pegasus_name_to_label(job_info["DAGNodeName"])),
1235 state=_htc_status_to_wms_state(job_info),
1236 )
1237 if job_report.label == "init":
1238 job_report.label = "pipetaskInit"
1239 report.jobs.append(job_report)
1240 except KeyError as ex:
1241 _LOG.error("Job missing key '%s': %s", str(ex), job_info)
1242 raise
1244 # Add the removed entry to restore the original content of the dictionary.
1245 # The ordering of keys will be changed permanently, though.
1246 jobs.update({wms_workflow_id: dag_job})
1248 run_reports = {report.wms_id: report}
1249 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
1250 return run_reports
1253def _summary_report(user, hist, pass_thru, schedds=None):
1254 """Gather run information to be used in generating summary reports.
1256 Parameters
1257 ----------
1258 user : `str`
1259 Run lookup restricted to given user.
1260 hist : `float`
1261 How many previous days to search for run information.
1262 pass_thru : `str`
1263 Advanced users can define the HTCondor constraint to be used
1264 when searching queue and history.
1266 Returns
1267 -------
1268 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
1269 Run information for the summary report. The keys are HTCondor ids and
1270 the values are collections of report information for each run.
1271 message : `str`
1272 Message to be printed with the summary report.
1273 """
1274 # only doing summary report so only look for dagman jobs
1275 if pass_thru:
1276 constraint = pass_thru
1277 else:
1278 # Notes:
1279 # * bps_isjob == 'True' isn't getting set for DAG jobs that are
1280 # manually restarted.
1281 # * Any job with DAGManJobID isn't a DAG job
1282 constraint = 'bps_isjob == "True" && JobUniverse == 7'
1283 if user:
1284 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'
1286 job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds)
1288 # Have list of DAGMan jobs, need to get run_report info.
1289 run_reports = {}
1290 for jobs in job_info.values():
1291 for job_id, job in jobs.items():
1292 total_jobs, state_counts = _get_state_counts_from_dag_job(job)
1293 # If didn't get from queue information (e.g., Kerberos bug),
1294 # try reading from file.
1295 if total_jobs == 0:
1296 try:
1297 job.update(read_dag_status(job["Iwd"]))
1298 total_jobs, state_counts = _get_state_counts_from_dag_job(job)
1299 except StopIteration:
1300 pass  # Don't kill the report if HTCondor files can't be found.
1302 if "bps_run" not in job:
1303 _add_run_info(job["Iwd"], job)
1304 report = WmsRunReport(
1305 wms_id=job_id,
1306 global_wms_id=job["GlobalJobId"],
1307 path=job["Iwd"],
1308 label=job.get("bps_job_label", "MISS"),
1309 run=job.get("bps_run", "MISS"),
1310 project=job.get("bps_project", "MISS"),
1311 campaign=job.get("bps_campaign", "MISS"),
1312 payload=job.get("bps_payload", "MISS"),
1313 operator=_get_owner(job),
1314 run_summary=_get_run_summary(job),
1315 state=_htc_status_to_wms_state(job),
1316 jobs=[],
1317 total_number_jobs=total_jobs,
1318 job_state_counts=state_counts,
1319 )
1320 run_reports[report.global_wms_id] = report
1322 return run_reports, ""
1325def _add_run_info(wms_path, job):
1326 """Find BPS run information elsewhere for runs without bps attributes.
1328 Parameters
1329 ----------
1330 wms_path : `str`
1331 Path to submit files for the run.
1332 job : `dict` [`str`, `Any`]
1333 HTCondor dag job information.
1335 Raises
1336 ------
1337 StopIteration
1338 Raised if the file it is looking for cannot be found. Permission
1339 errors are caught and the job's run is marked with an error.
1340 """
1341 path = Path(wms_path) / "jobs"
1342 try:
1343 subfile = next(path.glob("**/*.sub"))
1344 except (StopIteration, PermissionError):
1345 job["bps_run"] = "Unavailable"
1346 else:
1347 _LOG.debug("_add_run_info: subfile = %s", subfile)
1348 try:
1349 with open(subfile, encoding="utf-8") as fh:
1350 for line in fh:
1351 if line.startswith("+bps_"):
1352 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
1353 if m:
1354 _LOG.debug("Matching line: %s", line)
1355 job[m.group(1)] = m.group(2).replace('"', "")
1356 else:
1357 _LOG.debug("Could not parse attribute: %s", line)
1358 except PermissionError:
1359 job["bps_run"] = "PermissionError"
1360 _LOG.debug("After adding job = %s", job)
1363def _get_owner(job):
1364 """Get the owner of a dag job.
1366 Parameters
1367 ----------
1368 job : `dict` [`str`, `Any`]
1369 HTCondor dag job information.
1371 Returns
1372 -------
1373 owner : `str`
1374 Owner of the dag job.
1375 """
1376 owner = job.get("bps_operator", None)
1377 if not owner:
1378 owner = job.get("Owner", None)
1379 if not owner:
1380 _LOG.warning("Could not get Owner from htcondor job: %s", job)
1381 owner = "MISS"
1382 return owner
1385def _get_run_summary(job):
1386 """Get the run summary for a job.
1388 Parameters
1389 ----------
1390 job : `dict` [`str`, `Any`]
1391 HTCondor dag job information.
1393 Returns
1394 -------
1395 summary : `str`
1396 Number of jobs per PipelineTask label in approximate pipeline order.
1397 Format: <label>:<count>[;<label>:<count>]+
1398 """
1399 summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
1400 if not summary:
1401 summary, _ = summary_from_dag(job["Iwd"])
1402 if not summary:
1403 _LOG.warning("Could not get run summary for htcondor job: %s", job)
1404 _LOG.debug("_get_run_summary: summary=%s", summary)
1406 # Workaround: sometimes 'init' is used instead of 'pipetaskInit'.
1407 summary = summary.replace("init:", "pipetaskInit:")
1409 if "pegasus_version" in job and "pegasus" not in summary:
1410 summary += ";pegasus:0"
1412 return summary
1415def _get_exit_code_summary(jobs):
1416 """Get the exit code summary for a run.
1418 Parameters
1419 ----------
1420 jobs : `dict` [`str`, `dict` [`str`, Any]]
1421 Mapping HTCondor job id to job information.
1423 Returns
1424 -------
1425 summary : `dict` [`str`, `list` [`int`]]
1426 Jobs' exit codes per job label.
1427 """
1428 summary = {}
1429 for job_id, job_ad in jobs.items():
1430 job_label = job_ad["bps_job_label"]
1431 summary.setdefault(job_label, [])
1432 try:
1433 exit_code = 0
1434 job_status = job_ad["JobStatus"]
1435 match job_status:
1436 case JobStatus.COMPLETED | JobStatus.HELD:
1437 exit_code = job_ad["ExitSignal"] if job_ad["ExitBySignal"] else job_ad["ExitCode"]
1438 case (
1439 JobStatus.IDLE
1440 | JobStatus.RUNNING
1441 | JobStatus.REMOVED
1442 | JobStatus.TRANSFERRING_OUTPUT
1443 | JobStatus.SUSPENDED
1444 ):
1445 pass
1446 case _:
1447 _LOG.debug("Unknown 'JobStatus' value ('%d') in classad for job '%s'", job_status, job_id)
1448 if exit_code != 0:
1449 summary[job_label].append(exit_code)
1450 except KeyError as ex:
1451 _LOG.debug("Attribute '%s' not found in the classad for job '%s'", ex, job_id)
1452 return summary
1455def _get_state_counts_from_jobs(wms_workflow_id, jobs):
1456 """Count number of jobs per WMS state.
1458 Parameters
1459 ----------
1460 wms_workflow_id : `str`
1461 HTCondor job id.
1462 jobs : `dict` [`str`, `Any`]
1463 HTCondor dag job information.
1465 Returns
1466 -------
1467 total_count : `int`
1468 Total number of dag nodes.
1469 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
1470 Keys are the different WMS states and values are counts of jobs
1471 that are in that WMS state.
1472 """
1473 state_counts = dict.fromkeys(WmsStates, 0)
1475 for jid, jinfo in jobs.items():
1476 if jid != wms_workflow_id:
1477 state_counts[_htc_status_to_wms_state(jinfo)] += 1
1479 total_counted = sum(state_counts.values())
1480 if "NodesTotal" in jobs[wms_workflow_id]:
1481 total_count = jobs[wms_workflow_id]["NodesTotal"]
1482 else:
1483 total_count = total_counted
1485 state_counts[WmsStates.UNREADY] += total_count - total_counted
1487 return total_count, state_counts
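# Worked example (hypothetical numbers): for a DAG classad reporting
# NodesTotal=5 with two node classads mapping to RUNNING and SUCCEEDED,
# total_counted == 2, so 5 - 2 == 3 additional jobs are attributed to
# WmsStates.UNREADY, giving {RUNNING: 1, SUCCEEDED: 1, UNREADY: 3, ...}
# with all other states at 0.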
1490def _get_state_counts_from_dag_job(job):
1491 """Count number of jobs per WMS state.
1493 Parameters
1494 ----------
1495 job : `dict` [`str`, `Any`]
1496 HTCondor dag job information.
1498 Returns
1499 -------
1500 total_count : `int`
1501 Total number of dag nodes.
1502 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
1503 Keys are the different WMS states and values are counts of jobs
1504 that are in that WMS state.
1505 """
1506 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
1507 state_counts = dict.fromkeys(WmsStates, 0)
1508 if "DAG_NodesReady" in job:
1509 state_counts = {
1510 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
1511 WmsStates.READY: job.get("DAG_NodesReady", 0),
1512 WmsStates.HELD: job.get("JobProcsHeld", 0),
1513 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
1514 WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
1515 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0),
1516 }
1517 total_jobs = job.get("DAG_NodesTotal")
1518 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
1519 elif "NodesFailed" in job:
1520 state_counts = {
1521 WmsStates.UNREADY: job.get("NodesUnready", 0),
1522 WmsStates.READY: job.get("NodesReady", 0),
1523 WmsStates.HELD: job.get("JobProcsHeld", 0),
1524 WmsStates.SUCCEEDED: job.get("NodesDone", 0),
1525 WmsStates.FAILED: job.get("NodesFailed", 0),
1526 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0),
1527 }
1528 try:
1529 total_jobs = job.get("NodesTotal")
1530 except KeyError as ex:
1531 _LOG.error("Job missing %s. job = %s", str(ex), job)
1532 raise
1533 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
1534 else:
1535 # With Kerberos job auth and the Kerberos bug, a warning here would be
1536 # printed for every DAG, so only log at debug level.
1537 _LOG.debug("Can't get job state counts %s", job["Iwd"])
1538 total_jobs = 0
1540 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
1541 return total_jobs, state_counts
1544def _htc_status_to_wms_state(job):
1545 """Convert HTCondor job status to generic wms state.
1547 Parameters
1548 ----------
1549 job : `dict` [`str`, `Any`]
1550 HTCondor job information.
1552 Returns
1553 -------
1554 wms_state : `WmsStates`
1555 The equivalent WmsState to given job's status.
1556 """
1557 wms_state = WmsStates.MISFIT
1558 if "JobStatus" in job:
1559 wms_state = _htc_job_status_to_wms_state(job)
1560 elif "NodeStatus" in job:
1561 wms_state = _htc_node_status_to_wms_state(job)
1562 return wms_state
1565def _htc_job_status_to_wms_state(job):
1566 """Convert HTCondor job status to generic wms state.
1568 Parameters
1569 ----------
1570 job : `dict` [`str`, `Any`]
1571 HTCondor job information.
1573 Returns
1574 -------
1575 wms_state : `lsst.ctrl.bps.WmsStates`
1576 The equivalent WmsState to given job's status.
1577 """
1578 _LOG.debug(
1579 "htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], type(job["JobStatus"])
1580 )
1581 job_status = int(job["JobStatus"])
1582 wms_state = WmsStates.MISFIT
1584 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
1585 if job_status == JobStatus.IDLE:
1586 wms_state = WmsStates.PENDING
1587 elif job_status == JobStatus.RUNNING:
1588 wms_state = WmsStates.RUNNING
1589 elif job_status == JobStatus.REMOVED:
1590 wms_state = WmsStates.DELETED
1591 elif job_status == JobStatus.COMPLETED:
1592 if (
1593 job.get("ExitBySignal", False)
1594 or job.get("ExitCode", 0)
1595 or job.get("ExitSignal", 0)
1596 or job.get("DAG_Status", 0)
1597 ):
1598 wms_state = WmsStates.FAILED
1599 else:
1600 wms_state = WmsStates.SUCCEEDED
1601 elif job_status == JobStatus.HELD:
1602 wms_state = WmsStates.HELD
1604 return wms_state
1607def _htc_node_status_to_wms_state(job):
1608 """Convert HTCondor status to generic wms state.
1610 Parameters
1611 ----------
1612 job : `dict` [`str`, `Any`]
1613 HTCondor job information.
1615 Returns
1616 -------
1617 wms_state : `lsst.ctrl.bps.WmsStates`
1618 The equivalent WmsState to given node's status.
1619 """
1620 wms_state = WmsStates.MISFIT
1622 status = job["NodeStatus"]
1623 if status == NodeStatus.NOT_READY:
1624 wms_state = WmsStates.UNREADY
1625 elif status == NodeStatus.READY:
1626 wms_state = WmsStates.READY
1627 elif status == NodeStatus.PRERUN:
1628 wms_state = WmsStates.MISFIT
1629 elif status == NodeStatus.SUBMITTED:
1630 if job["JobProcsHeld"]:
1631 wms_state = WmsStates.HELD
1632 elif job["StatusDetails"] == "not_idle":
1633 wms_state = WmsStates.RUNNING
1634 elif job["JobProcsQueued"]:
1635 wms_state = WmsStates.PENDING
1636 elif status == NodeStatus.POSTRUN:
1637 wms_state = WmsStates.MISFIT
1638 elif status == NodeStatus.DONE:
1639 wms_state = WmsStates.SUCCEEDED
1640 elif status == NodeStatus.ERROR:
1641 # Use the job's exit status instead of the post script's exit status.
1642 if "DAGMAN error 0" in job["StatusDetails"]:
1643 wms_state = WmsStates.SUCCEEDED
1644 else:
1645 wms_state = WmsStates.FAILED
1647 return wms_state
1650def _update_jobs(jobs1, jobs2):
1651 """Update jobs1 with info in jobs2.
1653 (Basically an update for nested dictionaries.)
1655 Parameters
1656 ----------
1657 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
1658 HTCondor job information to be updated.
1659 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
1660 Additional HTCondor job information.
1661 """
1662 for jid, jinfo in jobs2.items():
1663 if jid in jobs1:
1664 jobs1[jid].update(jinfo)
1665 else:
1666 jobs1[jid] = jinfo
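# Example (illustrative): a one-level-deep merge of per-job dictionaries, e.g.
#
#     jobs1 = {"1.0": {"JobStatus": 2}}
#     jobs2 = {"1.0": {"NodeStatus": 5}, "2.0": {"JobStatus": 1}}
#     _update_jobs(jobs1, jobs2)
#     # jobs1 == {"1.0": {"JobStatus": 2, "NodeStatus": 5}, "2.0": {"JobStatus": 1}}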
1669def _wms_id_type(wms_id):
1670 """Determine the type of the WMS id.
1672 Parameters
1673 ----------
1674 wms_id : `str`
1675 WMS id identifying a job.
1677 Returns
1678 -------
1679 id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
1680 Type of WMS id.
1681 """
1682 try:
1683 int(float(wms_id))
1684 except ValueError:
1685 wms_path = Path(wms_id)
1686 if wms_path.is_dir():
1687 id_type = WmsIdType.PATH
1688 else:
1689 id_type = WmsIdType.GLOBAL
1690 except TypeError:
1691 id_type = WmsIdType.UNKNOWN
1692 else:
1693 id_type = WmsIdType.LOCAL
1694 return id_type
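# Illustrative classification examples (the global id and the path are
# hypothetical): a value convertible to a number is treated as LOCAL, an
# existing directory as PATH, and any other string as GLOBAL.
#
#     _wms_id_type("1234")      # -> WmsIdType.LOCAL
#     _wms_id_type("1234.0")    # -> WmsIdType.LOCAL
#     _wms_id_type("schedd01.example.com#1234.0#1700000000")
#                               # -> WmsIdType.GLOBAL
#     _wms_id_type("/path/to/submit/run1")
#                               # -> WmsIdType.PATH, if the directory exists
#     _wms_id_type(None)        # -> WmsIdType.UNKNOWN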
1697def _wms_id_to_cluster(wms_id):
1698 """Convert WMS id to cluster id.
1700 Parameters
1701 ----------
1702 wms_id : `int` or `float` or `str`
1703 HTCondor job id or path.
1705 Returns
1706 -------
1707 schedd_ad : `classad.ClassAd`
1708 ClassAd describing the scheduler managing the job with the given id.
1709 cluster_id : `int`
1710 HTCondor cluster id.
1711 id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
1712 The type of the provided id.
1713 """
1714 coll = htcondor.Collector()
1716 schedd_ad = None
1717 cluster_id = None
1718 id_type = _wms_id_type(wms_id)
1719 if id_type == WmsIdType.LOCAL:
1720 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
1721 cluster_id = int(float(wms_id))
1722 elif id_type == WmsIdType.GLOBAL:
1723 constraint = f'GlobalJobId == "{wms_id}"'
1724 schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)}
1725 schedds = {name: htcondor.Schedd(ad) for name, ad in schedd_ads.items()}
1726 job_info = condor_q(constraint=constraint, schedds=schedds)
1727 if job_info:
1728 schedd_name, job_rec = job_info.popitem()
1729 job_id, _ = job_rec.popitem()
1730 schedd_ad = schedd_ads[schedd_name]
1731 cluster_id = int(float(job_id))
1732 elif id_type == WmsIdType.PATH:
1733 try:
1734 job_info = read_dag_info(wms_id)
1735 except OSError:
1736 pass
1737 else:
1738 schedd_name, job_rec = job_info.popitem()
1739 job_id, _ = job_rec.popitem()
1740 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name)
1741 cluster_id = int(float(job_id))
1742 else:
1743 pass
1744 return schedd_ad, cluster_id, id_type
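# Illustrative usage sketch (assumes a reachable HTCondor pool; the id is made
# up): for a plain cluster id only the local Schedd is consulted.
#
#     schedd_ad, cluster_id, id_type = _wms_id_to_cluster("1234")
#     # -> (ClassAd of the local Schedd, 1234, WmsIdType.LOCAL)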
1747def _wms_id_to_dir(wms_id):
1748 """Convert WMS id to a submit directory candidate.
1750 The function does not check if the directory exists or if it is a valid
1751 BPS submit directory.
1753 Parameters
1754 ----------
1755 wms_id : `int` or `float` or `str`
1756 HTCondor job id or path.
1758 Returns
1759 -------
1760 wms_path : `pathlib.Path` or `None`
1761 Submit directory candidate for the run with the given job id. If no
1762 directory can be associated with the provided WMS id, it will be set
1763 to None.
1764 id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
1765 The type of the provided id.
1767 Raises
1768 ------
1769 TypeError
1770 Raised if the provided WMS id has an invalid type.
1771 """
1772 coll = htcondor.Collector()
1773 schedd_ads = []
1775 constraint = None
1776 wms_path = None
1777 id_type = _wms_id_type(wms_id)
1778 match id_type:
1779 case WmsIdType.LOCAL:
1780 constraint = f"ClusterId == {int(float(wms_id))}"
1781 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
1782 case WmsIdType.GLOBAL:
1783 constraint = f'GlobalJobId == "{wms_id}"'
1784 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
1785 case WmsIdType.PATH:
1786 wms_path = Path(wms_id)
1787 case WmsIdType.UNKNOWN:
1788 raise TypeError(f"Invalid job id type: {wms_id}")
1789 if constraint is not None:
1790 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}
1791 job_info = condor_history(constraint=constraint, schedds=schedds, projection=["Iwd"])
1792 if job_info:
1793 _, job_rec = job_info.popitem()
1794 _, job_ad = job_rec.popitem()
1795 wms_path = Path(job_ad["Iwd"])
1796 return wms_path, id_type
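# Illustrative usage sketch (the submit directory is hypothetical): for a path
# id no HTCondor queries are made and the path is returned as-is.
#
#     wms_path, id_type = _wms_id_to_dir("/path/to/submit/run1")
#     # -> (PosixPath("/path/to/submit/run1"), WmsIdType.PATH), assuming the
#     #    directory exists; otherwise the id is treated as GLOBAL and looked
#     #    up via condor_history.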
1799def _create_periodic_release_expr(memory, multiplier, limit):
1800 """Construct an HTCondorAd expression for releasing held jobs.
1802 The expression instruct HTCondor to release any job which was put on hold
1803 due to exceeding memory requirements back to the job queue providing it
1804 satisfies all of the conditions below:
1806 * number of run attempts did not reach allowable number of retries,
1807 * the memory requirements in the last failed run attempt did not reach
1808 the specified memory limit.
1810 Parameters
1811 ----------
1812 memory : `int`
1813 Requested memory in MB.
1814 multiplier : `float`
1815 Memory growth rate between retries.
1816 limit : `int`
1817 Memory limit.
1819 Returns
1820 -------
1821 expr : `str`
1822 A string representing an HTCondor ClassAd expression for releasing jobs
1823 which have been held due to exceeding the memory requirements.
1824 """
1825 is_retry_allowed = "NumJobStarts <= JobMaxRetries"
1826 was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
1828 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
1829 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
1830 # The special comparison operators ensure that all comparisons below will
1831 # evaluate to FALSE in this case.
1832 #
1833 # Note:
1834 # May not be strictly necessary. Operators '&&' and '||' are not strict so
1835 # the entire expression should evaluate to FALSE when the job is not HELD.
1836 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
1837 # but better safe than sorry.
1838 was_mem_exceeded = (
1839 "JobStatus == 5 "
1840 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
1841 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
1842 )
1844 expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}"
1845 return expr
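# For example (illustrative values, not taken from any actual config), with
# memory=2048, multiplier=2.0, and limit=8192 the returned expression is,
# with whitespace added for readability:
#
#     JobStatus == 5
#     && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#         || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#     && NumJobStarts <= JobMaxRetries
#     && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) < 8192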
1848def _create_periodic_remove_expr(memory, multiplier, limit):
1849 """Construct an HTCondorAd expression for removing jobs from the queue.
1851 The expression instruct HTCondor to remove any job which was put on hold
1852 due to exceeding memory requirements from the job queue providing it
1853 satisfies any of the conditions below:
1855 * allowable number of retries was reached,
1856 * the memory requirements during the last failed run attempt reached
1857 the specified memory limit.
1859 Parameters
1860 ----------
1861 memory : `int`
1862 Requested memory in MB.
1863 multiplier : `float`
1864 Memory growth rate between retires.
1865 limit : `int`
1866 Memory limit.
1868 Returns
1869 -------
1870 expr : `str`
1871 A string representing an HTCondor ClassAd expression for removing jobs
1872 which were run at the maximal allowable memory and still exceeded
1873 the memory requirements.
1874 """
1875 is_retry_disallowed = "NumJobStarts > JobMaxRetries"
1876 was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
1878 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
1879 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
1880 # The special comparison operators ensure that all comparisons below will
1881 # evaluate to FALSE in this case.
1882 #
1883 # Note:
1884 # May not be strictly necessary. Operators '&&' and '||' are not strict so
1885 # the entire expression should evaluate to FALSE when the job is not HELD.
1886 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
1887 # but better safe than sorry.
1888 was_mem_exceeded = (
1889 "JobStatus == 5 "
1890 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
1891 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
1892 )
1894 expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})"
1895 return expr
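# With the same illustrative values (memory=2048, multiplier=2.0, limit=8192)
# the returned expression is, with whitespace added for readability:
#
#     JobStatus == 5
#     && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#         || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#     && (NumJobStarts > JobMaxRetries
#         || min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) == 8192)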
1898def _create_request_memory_expr(memory, multiplier, limit):
1899 """Construct an HTCondor ClassAd expression for safe memory scaling.
1901 Parameters
1902 ----------
1903 memory : `int`
1904 Requested memory in MB.
1905 multiplier : `float`
1906 Memory growth rate between retries.
1907 limit : `int`
1908 Memory limit.
1910 Returns
1911 -------
1912 expr : `str`
1913 A string representing an HTCondor ClassAd expression enabling safe
1914 memory scaling between job retries.
1915 """
1916 # The check whether the job was held due to exceeding memory requirements
1917 # will be made *after* the job was released back to the job queue (i.e.,
1918 # when it is in the IDLE state), hence the need to use 'Last*' job ClassAds
1919 # instead of the ones describing the job's current state.
1920 #
1921 # Also, 'Last*' job ClassAds attributes are UNDEFINED when a job is
1922 # initially put in the job queue. The special comparison operators ensure
1923 # that all comparisons below will evaluate to FALSE in this case.
1924 was_mem_exceeded = (
1925 "LastJobStatus =?= 5 "
1926 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 "
1927 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"
1928 )
1930 # If the job runs for the first time or was held for reasons other than
1931 # exceeding the memory, set the required memory to the requested value or
1932 # to the memory value measured by HTCondor (MemoryUsage), whichever is
1933 # greater.
1934 expr = (
1935 f"({was_mem_exceeded}) "
1936 f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) "
1937 f": max({{{memory}, MemoryUsage ?: 0}})"
1938 )
1939 return expr
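# With the same illustrative values (memory=2048, multiplier=2.0, limit=8192)
# the returned expression is, with whitespace added for readability:
#
#     (LastJobStatus =?= 5
#      && (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0
#          || LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34))
#     ? min({int(2048 * pow(2.0, NumJobStarts)), 8192})
#     : max({2048, MemoryUsage ?: 0})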
1942def _locate_schedds(locate_all=False):
1943 """Find out Scheduler daemons in an HTCondor pool.
1945 Parameters
1946 ----------
1947 locate_all : `bool`, optional
1948 If True, all available Schedulers in the HTCondor pool will be located.
1949 By default False, which means that the search will be limited to
1950 the Scheduler running on the local host.
1952 Returns
1953 -------
1954 schedds : `dict` [`str`, `htcondor.Schedd`]
1955 A mapping between Scheduler names and Python objects allowing for
1956 interaction with them.
1957 """
1958 coll = htcondor.Collector()
1960 schedd_ads = []
1961 if locate_all:
1962 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
1963 else:
1964 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
1965 return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}
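# Illustrative usage sketch (the constraint is made up): the resulting mapping
# can be passed directly to the condor_q/condor_history helpers used above.
#
#     schedds = _locate_schedds(locate_all=True)
#     job_info = condor_q(constraint="ClusterId == 1234", schedds=schedds)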
1968def _gather_site_values(config, compute_site):
1969 """Gather values specific to given site.
1971 Parameters
1972 ----------
1973 config : `lsst.ctrl.bps.BpsConfig`
1974 BPS configuration that includes necessary submit/runtime
1975 information.
1976 compute_site : `str`
1977 Compute site name.
1979 Returns
1980 -------
1981 site_values : `dict` [`str`, `Any`]
1982 Values specific to the given site.
1983 """
1984 site_values = {"attrs": {}, "profile": {}}
1985 search_opts = {}
1986 if compute_site:
1987 search_opts["curvals"] = {"curr_site": compute_site}
1989 # Determine the hard limit for the memory requirement.
1990 found, limit = config.search("memoryLimit", opt=search_opts)
1991 if not found:
1992 search_opts["default"] = DEFAULT_HTC_EXEC_PATT
1993 _, patt = config.search("executeMachinesPattern", opt=search_opts)
1994 del search_opts["default"]
1996 # To reduce the amount of data, ignore dynamic slots (if any) as,
1997 # by definition, they cannot have more memory than
1998 # the partitionable slot they are part of.
1999 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)'
2000 pool_info = condor_status(constraint=constraint)
2001 try:
2002 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values())
2003 except ValueError:
2004 _LOG.debug("No execute machine in the pool matches %s", patt)
2005 if limit:
2006 config[".bps_defined.memory_limit"] = limit
2008 _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False})
2009 site_values["memoryLimit"] = limit
2011 found, value = config.search("accountingGroup", opt=search_opts)
2012 if found:
2013 site_values["accountingGroup"] = value
2014 found, value = config.search("accountingUser", opt=search_opts)
2015 if found:
2016 site_values["accountingUser"] = value
2018 key = f".site.{compute_site}.profile.condor"
2019 if key in config:
2020 for key, val in config[key].items():
2021 if key.startswith("+"):
2022 site_values["attrs"][key[1:]] = val
2023 else:
2024 site_values["profile"][key] = val
2026 return site_values
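# A hypothetical site block illustrating how the ".site.<name>.profile.condor"
# section handled above is split (site name, keys, and values are made up):
#
#     site:
#       example_site:
#         profile:
#           condor:
#             +JobNote: "run by bps"   # -> site_values["attrs"]["JobNote"]
#             request_cpus: 4          # -> site_values["profile"]["request_cpus"]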