Coverage for python/lsst/ctrl/bps/htcondor/htcondor_service.py: 7%
693 statements
1# This file is part of ctrl_bps_htcondor.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
22"""Interface between generic workflow to HTCondor workflow system.
23"""
25__all__ = ["HTCondorService", "HTCondorWorkflow"]
28import logging
29import os
30import re
31from collections import defaultdict
32from enum import IntEnum, auto
33from pathlib import Path
35import htcondor
36from lsst.ctrl.bps import (
37 BaseWmsService,
38 BaseWmsWorkflow,
39 GenericWorkflow,
40 GenericWorkflowJob,
41 WmsJobReport,
42 WmsRunReport,
43 WmsStates,
44)
45from lsst.ctrl.bps.bps_utils import chdir, create_count_summary
46from lsst.utils.timer import time_this
47from packaging import version
49from .lssthtc import (
50 MISSING_ID,
51 HTCDag,
52 HTCJob,
53 JobStatus,
54 NodeStatus,
55 condor_q,
56 condor_search,
57 condor_status,
58 htc_backup_files,
59 htc_check_dagman_output,
60 htc_create_submit_from_cmd,
61 htc_create_submit_from_dag,
62 htc_create_submit_from_file,
63 htc_escape,
64 htc_submit_dag,
65 htc_version,
66 pegasus_name_to_label,
67 read_dag_info,
68 read_dag_log,
69 read_dag_status,
70 read_node_status,
71 summary_from_dag,
72 write_dag_info,
73)
76class WmsIdType(IntEnum):
77 """Type of valid WMS ids."""
79 UNKNOWN = auto()
80 """The type of id cannot be determined.
81 """
83 LOCAL = auto()
84 """The id is HTCondor job's ClusterId (with optional '.ProcId').
85 """
87 GLOBAL = auto()
88 """Id is a HTCondor's global job id.
89 """
91 PATH = auto()
92 """Id is a submission path.
93 """
96DEFAULT_HTC_EXEC_PATT = ".*worker.*"
97"""Default pattern for searching execute machines in an HTCondor pool.
98"""
100_LOG = logging.getLogger(__name__)
103class HTCondorService(BaseWmsService):
104 """HTCondor version of WMS service."""
106 def prepare(self, config, generic_workflow, out_prefix=None):
107 """Convert generic workflow to an HTCondor DAG ready for submission.
109 Parameters
110 ----------
111 config : `lsst.ctrl.bps.BpsConfig`
112 BPS configuration that includes necessary submit/runtime
113 information.
114 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
115 The generic workflow (e.g., has executable name and arguments).
116 out_prefix : `str`
117 The root directory into which all WMS-specific files are written.
119 Returns
120 -------
121 workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow`
122 HTCondor workflow ready to be run.
123 """
124 _LOG.debug("out_prefix = '%s'", out_prefix)
125 with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed HTCondor workflow creation"):
126 workflow = HTCondorWorkflow.from_generic_workflow(
127 config,
128 generic_workflow,
129 out_prefix,
130 f"{self.__class__.__module__}.{self.__class__.__name__}",
131 )
133 with time_this(
134 log=_LOG, level=logging.INFO, prefix=None, msg="Completed writing out HTCondor workflow"
135 ):
136 workflow.write(out_prefix)
137 return workflow
139 def submit(self, workflow):
140 """Submit a single HTCondor workflow.
142 Parameters
143 ----------
144 workflow : `lsst.ctrl.bps.BaseWorkflow`
145 A single HTCondor workflow to submit. run_id is updated after
146 successful submission to WMS.
147 """
148 dag = workflow.dag
150 ver = version.parse(htc_version())
151 if ver >= version.parse("8.9.3"):
152 sub = htc_create_submit_from_dag(dag.graph["dag_filename"], {})
153 else:
154 sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], {})
156 # For workflow portability, internal paths are all relative. Hence
157 # the DAG needs to be submitted to HTCondor from inside the submit
158 # directory.
159 with chdir(workflow.submit_path):
160 _LOG.info("Submitting from directory: %s", os.getcwd())
161 schedd_dag_info = htc_submit_dag(sub)
162 if schedd_dag_info:
163 write_dag_info(f"{dag.name}.info.json", schedd_dag_info)
165 _, dag_info = schedd_dag_info.popitem()
166 _, dag_ad = dag_info.popitem()
168 dag.run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}"
169 workflow.run_id = dag.run_id
170 else:
171 raise RuntimeError("Submission failed: unable to retrieve DAGMan job information")
173 def restart(self, wms_workflow_id):
174 """Restart a failed DAGMan workflow.
176 Parameters
177 ----------
178 wms_workflow_id : `str`
179 The directory with HTCondor files.
181 Returns
182 -------
183 run_id : `str`
184 HTCondor id of the restarted DAGMan job. If restart failed, it will
185 be set to None.
186 run_name : `str`
187 Name of the restarted workflow. If restart failed, it will be set
188 to None.
189 message : `str`
190 A message describing any issues encountered during the restart.
191 If there were no issues, an empty string is returned.
192 """
193 wms_path = Path(wms_workflow_id)
194 if not wms_path.is_dir():
195 return None, None, f"Directory '{wms_path}' not found"
197 _LOG.info("Restarting workflow from directory '%s'", wms_path)
198 rescue_dags = list(wms_path.glob("*.dag.rescue*"))
199 if not rescue_dags:
200 return None, None, f"HTCondor rescue DAG(s) not found in '{wms_path}'"
202 _LOG.info("Verifying that the workflow is not already in the job queue")
203 schedd_dag_info = condor_q(constraint=f'regexp("dagman$", Cmd) && Iwd == "{wms_workflow_id}"')
204 if schedd_dag_info:
205 _, dag_info = schedd_dag_info.popitem()
206 _, dag_ad = dag_info.popitem()
207 id_ = dag_ad["GlobalJobId"]
208 return None, None, f"Workflow already in the job queue (global job id: '{id_}')"
210 _LOG.info("Checking execution status of the workflow")
211 warn = False
212 dag_ad = read_dag_status(str(wms_path))
213 if dag_ad:
214 nodes_total = dag_ad.get("NodesTotal", 0)
215 if nodes_total != 0:
216 nodes_done = dag_ad.get("NodesDone", 0)
217 if nodes_total == nodes_done:
218 return None, None, "All jobs in the workflow finished successfully"
219 else:
220 warn = True
221 else:
222 warn = True
223 if warn:
224 _LOG.warning(
225 "Cannot determine the execution status of the workflow, continuing with restart regardless"
226 )
228 _LOG.info("Backing up select HTCondor files from previous run attempt")
229 htc_backup_files(wms_path, subdir="backups")
231 # For workflow portability, internal paths are all relative. Hence
232 # the DAG needs to be resubmitted to HTCondor from inside the submit
233 # directory.
234 _LOG.info("Adding workflow to the job queue")
235 run_id, run_name, message = None, None, ""
236 with chdir(wms_path):
237 try:
238 dag_path = next(wms_path.glob("*.dag.condor.sub"))
239 except StopIteration:
240 message = f"DAGMan submit description file not found in '{wms_path}'"
241 else:
242 sub = htc_create_submit_from_file(dag_path.name)
243 schedd_dag_info = htc_submit_dag(sub)
245 # Save select information about the DAGMan job to a file. Use
246 # the run name (available in the ClassAd) as the filename.
247 if schedd_dag_info:
248 dag_info = next(iter(schedd_dag_info.values()))
249 dag_ad = next(iter(dag_info.values()))
250 write_dag_info(f"{dag_ad['bps_run']}.info.json", schedd_dag_info)
251 run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}"
252 run_name = dag_ad["bps_run"]
253 else:
254 message = "DAGMan job information unavailable"
256 return run_id, run_name, message
258 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
259 """Query WMS for list of submitted WMS workflows/jobs.
261 This should be a quick lookup function to create a list of jobs for
262 other functions.
264 Parameters
265 ----------
266 wms_id : `int` or `str`, optional
267 Id or path that can be used by WMS service to look up job.
268 user : `str`, optional
269 User whose submitted jobs should be listed.
270 require_bps : `bool`, optional
271 Whether to require jobs returned in list to be bps-submitted jobs.
272 pass_thru : `str`, optional
273 Information to pass through to WMS.
274 is_global : `bool`, optional
275 If set, all job queues (and their histories) will be queried for
276 job information. Defaults to False which means that only the local
277 job queue will be queried.
279 Returns
280 -------
281 job_ids : `list` [`Any`]
282 Only job ids to be used by cancel and other functions. Typically
283 this means top-level jobs (i.e., not child jobs).
284 """
285 _LOG.debug(
286 "list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s",
287 wms_id,
288 user,
289 require_bps,
290 pass_thru,
291 is_global,
292 )
294 # Determine which Schedds will be queried for job information.
295 coll = htcondor.Collector()
297 schedd_ads = []
298 if is_global:
299 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
300 else:
301 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
303 # Construct appropriate constraint expression using provided arguments.
304 constraint = "False"
305 if wms_id is None:
306 if user is not None:
307 constraint = f'(Owner == "{user}")'
308 else:
309 schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id)
310 if cluster_id is not None:
311 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"
313 # If provided id is either a submission path or a global id,
314 # make sure the right Schedd will be queried regardless of
315 # 'is_global' value.
316 if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}:
317 schedd_ads = [schedd_ad]
318 if require_bps:
319 constraint += ' && (bps_isjob == "True")'
320 if pass_thru:
321 if "-forcex" in pass_thru:
322 pass_thru_2 = pass_thru.replace("-forcex", "")
323 if pass_thru_2 and not pass_thru_2.isspace():
324 constraint += f" && ({pass_thru_2})"
325 else:
326 constraint += f" && ({pass_thru})"
328 # Create a list of scheduler daemons which need to be queried.
329 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}
331 _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds))
332 results = condor_q(constraint=constraint, schedds=schedds)
334 # Prune child jobs whose DAG job is in the queue (i.e., jobs that aren't orphans).
335 job_ids = []
336 for schedd_name, job_info in results.items():
337 for job_id, job_ad in job_info.items():
338 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None"))
339 if "DAGManJobId" not in job_ad:
340 job_ids.append(job_ad.get("GlobalJobId", job_id))
341 else:
342 _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0")
343 _LOG.debug("\tin jobs.keys() = %s", job_info.keys())
344 if f"{job_ad['DAGManJobId']}.0" not in job_info: # orphaned job
345 job_ids.append(job_ad.get("GlobalJobId", job_id))
347 _LOG.debug("job_ids = %s", job_ids)
348 return job_ids
350 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False):
351 """Return run information based upon given constraints.
353 Parameters
354 ----------
355 wms_workflow_id : `str`, optional
356 Limit to specific run based on id.
357 user : `str`, optional
358 Limit results to runs for this user.
359 hist : `float`, optional
360 Limit history search to this many days. Defaults to 0.
361 pass_thru : `str`, optional
362 Constraints to pass through to HTCondor.
363 is_global : `bool`, optional
364 If set, all job queues (and their histories) will be queried for
365 job information. Defaults to False which means that only the local
366 job queue will be queried.
368 Returns
369 -------
370 runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
371 Information about runs from given job information.
372 message : `str`
373 Extra message for report command to print. This could be pointers
374 to documentation or to WMS specific commands.
375 """
376 if wms_workflow_id:
377 id_type = _wms_id_type(wms_workflow_id)
378 if id_type == WmsIdType.LOCAL:
379 schedulers = _locate_schedds(locate_all=is_global)
380 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers)
381 elif id_type == WmsIdType.GLOBAL:
382 schedulers = _locate_schedds(locate_all=True)
383 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers)
384 elif id_type == WmsIdType.PATH:
385 run_reports, message = _report_from_path(wms_workflow_id)
386 else:
387 run_reports, message = {}, "Invalid job id"
388 else:
389 schedulers = _locate_schedds(locate_all=is_global)
390 run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers)
391 _LOG.debug("report: %s, %s", run_reports, message)
393 return list(run_reports.values()), message
395 def cancel(self, wms_id, pass_thru=None):
396 """Cancel submitted workflows/jobs.
398 Parameters
399 ----------
400 wms_id : `str`
401 Id or path of job that should be canceled.
402 pass_thru : `str`, optional
403 Information to pass through to WMS.
405 Returns
406 -------
407 deleted : `bool`
408 Whether the deletion was successful. Currently, False is returned if
409 there is any doubt or if any individual job was not deleted.
410 message : `str`
411 Any message from WMS (e.g., error details).
412 """
413 _LOG.debug("Canceling wms_id = %s", wms_id)
415 schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id)
417 if cluster_id is None:
418 deleted = False
419 message = "invalid id"
420 else:
421 _LOG.debug(
422 "Canceling job managed by schedd_name = %s with cluster_id = %s",
423 schedd_ad["Name"],
424 cluster_id,
425 )
426 schedd = htcondor.Schedd(schedd_ad)
428 constraint = f"ClusterId == {cluster_id}"
429 if pass_thru is not None and "-forcex" in pass_thru:
430 pass_thru_2 = pass_thru.replace("-forcex", "")
431 if pass_thru_2 and not pass_thru_2.isspace():
432 constraint += f"&& ({pass_thru_2})"
433 _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
434 results = schedd.act(htcondor.JobAction.RemoveX, constraint)
435 else:
436 if pass_thru:
437 constraint += f"&& ({pass_thru})"
438 _LOG.debug("JobAction.Remove constraint = %s", constraint)
439 results = schedd.act(htcondor.JobAction.Remove, constraint)
440 _LOG.debug("Remove results: %s", results)
442 if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
443 deleted = True
444 message = ""
445 else:
446 deleted = False
447 if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
448 message = "no such bps job in batch queue"
449 else:
450 message = f"unknown problems deleting: {results}"
452 _LOG.debug("deleted: %s; message = %s", deleted, message)
453 return deleted, message
456class HTCondorWorkflow(BaseWmsWorkflow):
457 """Single HTCondor workflow.
459 Parameters
460 ----------
461 name : `str`
462 Unique name for Workflow used when naming files.
463 config : `lsst.ctrl.bps.BpsConfig`
464 BPS configuration that includes necessary submit/runtime information.
465 """
467 def __init__(self, name, config=None):
468 super().__init__(name, config)
469 self.dag = None
471 @classmethod
472 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
473 # Docstring inherited
474 htc_workflow = cls(generic_workflow.name, config)
475 htc_workflow.dag = HTCDag(name=generic_workflow.name)
477 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
478 htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
479 htc_workflow.dag.add_attribs(
480 {
481 "bps_wms_service": service_class,
482 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
483 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
484 "bps_job_summary": create_count_summary(generic_workflow.job_counts),
485 }
486 )
488 _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""})
489 if isinstance(tmp_template, str):
490 subdir_template = defaultdict(lambda: tmp_template)
491 else:
492 subdir_template = tmp_template
494 # Create all DAG jobs
495 site_values = {} # cache compute site specific values to reduce config lookups
496 for job_name in generic_workflow:
497 gwjob = generic_workflow.get_job(job_name)
498 if gwjob.compute_site not in site_values:
499 site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
500 htc_job = _create_job(
501 subdir_template[gwjob.label],
502 site_values[gwjob.compute_site],
503 generic_workflow,
504 gwjob,
505 out_prefix,
506 )
507 htc_workflow.dag.add_job(htc_job)
509 # Add job dependencies to the DAG
510 for job_name in generic_workflow:
511 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))
513 # If final job exists in generic workflow, create DAG final job
514 final = generic_workflow.get_final()
515 if final and isinstance(final, GenericWorkflowJob):
516 if final.compute_site and final.compute_site not in site_values:
517 site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
518 final_htjob = _create_job(
519 subdir_template[final.label],
520 site_values[final.compute_site],
521 generic_workflow,
522 final,
523 out_prefix,
524 )
525 if "post" not in final_htjob.dagcmds:
526 final_htjob.dagcmds[
527 "post"
528 ] = f"{os.path.dirname(__file__)}/final_post.sh {final.name} $DAG_STATUS $RETURN"
529 htc_workflow.dag.add_final_job(final_htjob)
530 elif final and isinstance(final, GenericWorkflow):
531 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
532 elif final:
533 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
535 return htc_workflow
537 def write(self, out_prefix):
538 """Output HTCondor DAGMan files needed for workflow submission.
540 Parameters
541 ----------
542 out_prefix : `str`
543 Directory prefix for HTCondor files.
544 """
545 self.submit_path = out_prefix
546 os.makedirs(out_prefix, exist_ok=True)
548 # Write down the workflow in HTCondor format.
549 self.dag.write(out_prefix, "jobs/{self.label}")
552def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix):
553 """Convert GenericWorkflow job nodes to DAG jobs.
555 Parameters
556 ----------
557 subdir_template : `str`
558 Template for making subdirs.
559 site_values : `dict`
560 Site-specific values.
561 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
562 Generic workflow that is being converted.
563 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
564 The generic job to convert to a HTCondor job.
565 out_prefix : `str`
566 Directory prefix for HTCondor files.
568 Returns
569 -------
570 htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
571 The HTCondor job equivalent to the given generic job.
572 """
573 htc_job = HTCJob(gwjob.name, label=gwjob.label)
575 curvals = defaultdict(str)
576 curvals["label"] = gwjob.label
577 if gwjob.tags:
578 curvals.update(gwjob.tags)
580 subdir = subdir_template.format_map(curvals)
581 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"
583 htc_job_cmds = {
584 "universe": "vanilla",
585 "should_transfer_files": "YES",
586 "when_to_transfer_output": "ON_EXIT_OR_EVICT",
587 "transfer_output_files": '""', # Set to empty string to disable
588 "transfer_executable": "False",
589 "getenv": "True",
590 # Exceeding memory sometimes triggers a SIGBUS or SIGSEGV error. Tell
591 # HTCondor to put on hold any job which exited via a signal.
592 "on_exit_hold": "ExitBySignal == true",
593 "on_exit_hold_reason": 'strcat("Job raised a signal ", string(ExitSignal), ". ", '
594 '"Handling signal as if job has gone over memory limit.")',
595 "on_exit_hold_subcode": "34",
596 }
598 htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob))
600 # job stdout, stderr, htcondor user log.
601 for key in ("output", "error", "log"):
602 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
603 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
605 htc_job_cmds.update(
606 _handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], out_prefix)
607 )
609 # Add the job cmds dict to the job object.
610 htc_job.add_job_cmds(htc_job_cmds)
612 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))
614 # Add job attributes to job.
615 _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
616 htc_job.add_job_attrs(gwjob.attrs)
617 htc_job.add_job_attrs(site_values["attrs"])
618 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
619 htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})
621 return htc_job
624def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
625 """Translate the job data that are one to one mapping
627 Parameters
628 ----------
629 cached_vals : `dict` [`str`, `Any`]
630 Config values common to jobs with same label.
631 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
632 Generic workflow that contains the job being converted.
633 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
634 Generic workflow job to be converted.
636 Returns
637 -------
638 htc_job_commands : `dict` [`str`, `Any`]
639 Contains commands which can appear in the HTCondor submit description
640 file.
641 """
642 # Values in the job script that are just name mappings.
643 job_translation = {
644 "mail_to": "notify_user",
645 "when_to_mail": "notification",
646 "request_cpus": "request_cpus",
647 "priority": "priority",
648 "category": "category",
649 "accounting_group": "accounting_group",
650 "accounting_user": "accounting_group_user",
651 }
653 jobcmds = {}
654 for gwkey, htckey in job_translation.items():
655 jobcmds[htckey] = getattr(gwjob, gwkey, None)
657 # If accounting info was not set explicitly, use site settings if any.
658 if not gwjob.accounting_group:
659 jobcmds["accounting_group"] = cached_vals.get("accountingGroup")
660 if not gwjob.accounting_user:
661 jobcmds["accounting_group_user"] = cached_vals.get("accountingUser")
663 # job commands that need modification
664 if gwjob.number_of_retries:
665 jobcmds["max_retries"] = f"{gwjob.number_of_retries}"
667 if gwjob.retry_unless_exit:
668 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"
670 if gwjob.request_disk:
671 jobcmds["request_disk"] = f"{gwjob.request_disk}MB"
673 if gwjob.request_memory:
674 jobcmds["request_memory"] = f"{gwjob.request_memory}"
676 if gwjob.memory_multiplier:
677 # Do not use try-except! At the moment, BpsConfig returns an empty
678 # string if it does not contain the key.
679 memory_limit = cached_vals["memoryLimit"]
680 if not memory_limit:
681 raise RuntimeError(
682 "Memory autoscaling enabled, but automatic detection of the memory limit "
683 "failed; setting it explicitly with 'memoryLimit' or changing worker node "
684 "search pattern 'executeMachinesPattern' might help."
685 )
687 # Set maximal amount of memory job can ask for.
688 #
689 # The check below assumes that 'memory_limit' was set to a value which
690 # realistically reflects actual physical limitations of a given compute
691 # resource.
692 memory_max = memory_limit
693 if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit:
694 memory_max = gwjob.request_memory_max
696 # Make the job ask for more memory each time it fails due to insufficient
697 # memory requirements.
698 jobcmds["request_memory"] = _create_request_memory_expr(
699 gwjob.request_memory, gwjob.memory_multiplier, memory_max
700 )
702 # Periodically release jobs which are being held due to exceeding
703 # memory. Stop doing that (by removing the job from the HTCondor queue)
704 # after the maximal number of retries has been reached or the job was
705 # already run at maximal allowed memory.
706 jobcmds["periodic_release"] = _create_periodic_release_expr(
707 gwjob.request_memory, gwjob.memory_multiplier, memory_max
708 )
709 jobcmds["periodic_remove"] = _create_periodic_remove_expr(
710 gwjob.request_memory, gwjob.memory_multiplier, memory_max
711 )
713 # Assume concurrency_limit implemented using HTCondor concurrency limits.
714 # May need to move to special site-specific implementation if sites use
715 # other mechanisms.
716 if gwjob.concurrency_limit:
717 jobcmds["concurrency_limit"] = gwjob.concurrency_limit
719 # Handle command line
720 if gwjob.executable.transfer_executable:
721 jobcmds["transfer_executable"] = "True"
722 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
723 else:
724 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)
726 if gwjob.arguments:
727 arguments = gwjob.arguments
728 arguments = _replace_cmd_vars(arguments, gwjob)
729 arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob)
730 arguments = _fix_env_var_syntax(arguments)
731 jobcmds["arguments"] = arguments
733 # Add extra "pass-thru" job commands
734 if gwjob.profile:
735 for key, val in gwjob.profile.items():
736 jobcmds[key] = htc_escape(val)
737 for key, val in cached_vals["profile"].items():
738 jobcmds[key] = htc_escape(val)
740 return jobcmds
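# Illustrative sketch (not part of the original module; values hypothetical):
# for a generic job with request_cpus=1, request_memory=2048,
# number_of_retries=3 and no memory multiplier, the dictionary returned above
# would contain entries along the lines of
#     {"request_cpus": 1, "notify_user": None, "max_retries": "3",
#      "request_memory": "2048", ...}
# i.e., straight name mappings plus the handful of values that need
# reformatting before they can appear in a submit description file.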
743def _translate_dag_cmds(gwjob):
744 """Translate job values into DAGMan commands.
746 Parameters
747 ----------
748 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
749 Job containing values to be translated.
751 Returns
752 -------
753 dagcmds : `dict` [`str`, `Any`]
754 DAGMan commands for the job.
755 """
756 # Values in the dag script that are just name mappings.
757 dag_translation = {"abort_on_value": "abort_dag_on", "abort_return_value": "abort_exit"}
759 dagcmds = {}
760 for gwkey, htckey in dag_translation.items():
761 dagcmds[htckey] = getattr(gwjob, gwkey, None)
763 # Still to be coded: vars "pre_cmdline", "post_cmdline"
764 return dagcmds
767def _fix_env_var_syntax(oldstr):
768 """Change ENV place holders to HTCondor Env var syntax.
770 Parameters
771 ----------
772 oldstr : `str`
773 String in which environment variable syntax is to be fixed.
775 Returns
776 -------
777 newstr : `str`
778 Given string with environment variable syntax fixed.
779 """
780 newstr = oldstr
781 for key in re.findall(r"<ENV:([^>]+)>", oldstr):
782 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
783 return newstr
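# Illustrative sketch (hypothetical value): the regex above rewrites BPS-style
# environment placeholders into HTCondor submit-file syntax, e.g.
#     _fix_env_var_syntax("<ENV:HOME>/submit/run1")
#     # -> "$ENV(HOME)/submit/run1"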
786def _replace_file_vars(use_shared, arguments, workflow, gwjob):
787 """Replace file placeholders in command line arguments with correct
788 physical file names.
790 Parameters
791 ----------
792 use_shared : `bool`
793 Whether HTCondor can assume shared filesystem.
794 arguments : `str`
795 Arguments string in which to replace file placeholders.
796 workflow : `lsst.ctrl.bps.GenericWorkflow`
797 Generic workflow that contains file information.
798 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
799 The job corresponding to the arguments.
801 Returns
802 -------
803 arguments : `str`
804 Given arguments string with file placeholders replaced.
805 """
806 # Replace input file placeholders with paths.
807 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
808 if not gwfile.wms_transfer:
809 # Must assume full URI if in command line and told WMS is not
810 # responsible for transferring file.
811 uri = gwfile.src_uri
812 elif use_shared:
813 if gwfile.job_shared:
814 # Have shared filesystems and jobs can share file.
815 uri = gwfile.src_uri
816 else:
817 # Taking advantage of inside knowledge. Not future-proof.
818 # Temporary fix until have job wrapper that pulls files
819 # within job.
820 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml":
821 uri = "butler.yaml"
822 else:
823 uri = os.path.basename(gwfile.src_uri)
824 else: # Using push transfer
825 uri = os.path.basename(gwfile.src_uri)
826 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
828 # Replace output file placeholders with paths.
829 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
830 if not gwfile.wms_transfer:
831 # Must assume full URI if in command line and told WMS is not
832 # responsible for transferring file.
833 uri = gwfile.src_uri
834 elif use_shared:
835 if gwfile.job_shared:
836 # Have shared filesystems and jobs can share file.
837 uri = gwfile.src_uri
838 else:
839 uri = os.path.basename(gwfile.src_uri)
840 else: # Using push transfer
841 uri = os.path.basename(gwfile.src_uri)
842 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
843 return arguments
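# Illustrative sketch (hypothetical file): for an input file named
# "qgraphFile" with src_uri "/path/to/run1/a.qgraph" that the WMS must
# transfer, the placeholder "<FILE:qgraphFile>" in the arguments becomes
#     "/path/to/run1/a.qgraph"   if use_shared and the file is job-shared,
#     "a.qgraph"                 if the file is pushed to the job sandbox.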
846def _replace_cmd_vars(arguments, gwjob):
847 """Replace format-style placeholders in arguments.
849 Parameters
850 ----------
851 arguments : `str`
852 Arguments string in which to replace placeholders.
853 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
854 Job containing values to be used to replace placeholders
855 (in particular gwjob.cmdvals).
857 Returns
858 -------
859 arguments : `str`
860 Given arguments string with placeholders replaced.
861 """
862 try:
863 arguments = arguments.format(**gwjob.cmdvals)
864 except (KeyError, TypeError): # TypeError in case None instead of {}
865 _LOG.error(
866 "Could not replace command variables:\narguments: %s\ncmdvals: %s", arguments, gwjob.cmdvals
867 )
868 raise
869 return arguments
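# Illustrative sketch (hypothetical field names): the placeholders are plain
# str.format() fields filled from gwjob.cmdvals, e.g.
#     arguments = "run --qgraph {qgraphFile} --id {qgraphNodeId}"
#     gwjob.cmdvals = {"qgraphFile": "a.qgraph", "qgraphNodeId": "4"}
#     # -> "run --qgraph a.qgraph --id 4"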
872def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str):
873 """Add job input files from generic workflow to job.
875 Parameters
876 ----------
877 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
878 The generic workflow (e.g., has executable name and arguments).
879 job_name : `str`
880 Unique name for the job.
881 use_shared : `bool`
882 Whether job has access to files via shared filesystem.
883 out_prefix : `str`
884 The root directory into which all WMS-specific files are written.
886 Returns
887 -------
888 htc_commands : `dict` [`str`, `str`]
889 HTCondor commands for the job submission script.
890 """
891 htc_commands = {}
892 inputs = []
893 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
894 _LOG.debug("src_uri=%s", gwf_file.src_uri)
896 uri = Path(gwf_file.src_uri)
898 # Note if use_shared and job_shared, don't need to transfer file.
900 if not use_shared: # Copy file using push to job
901 inputs.append(str(uri.relative_to(out_prefix)))
902 elif not gwf_file.job_shared: # Jobs require own copy
903 # if using shared filesystem, but still need copy in job. Use
904 # HTCondor's curl plugin for a local copy.
906 # Execution butler is represented as a directory which the
907 # curl plugin does not handle. Taking advantage of inside
908 # knowledge for temporary fix until have job wrapper that pulls
909 # files within job.
910 if gwf_file.name == "butlerConfig":
911 # The execution butler directory doesn't normally exist until
912 # the submit phase so checking for suffix instead of using
913 # is_dir(). If another non-yaml file exists, it would have a
914 # different gwf_file.name.
915 if uri.suffix == ".yaml": # Single file, so just copy.
916 inputs.append(f"file://{uri}")
917 else:
918 inputs.append(f"file://{uri / 'butler.yaml'}")
919 inputs.append(f"file://{uri / 'gen3.sqlite3'}")
920 elif uri.is_dir():
921 raise RuntimeError(
922 f"HTCondor plugin cannot transfer directories locally within job {gwf_file.src_uri}"
923 )
924 else:
925 inputs.append(f"file://{uri}")
927 if inputs:
928 htc_commands["transfer_input_files"] = ",".join(inputs)
929 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
930 return htc_commands
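# Illustrative sketch (hypothetical paths under out_prefix): with
# use_shared=False and two transfer-only inputs, the returned dictionary
# would resemble
#     {"transfer_input_files": "inputs/a.qgraph,inputs/butler.yaml"}
# i.e., paths relative to the submit directory, ready for HTCondor's file
# transfer mechanism.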
933def _report_from_path(wms_path):
934 """Gather run information from a given run directory.
936 Parameters
937 ----------
938 wms_path : `str`
939 The directory containing the submit side files (e.g., HTCondor files).
941 Returns
942 -------
943 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
944 Run information for the detailed report. The key is the HTCondor id
945 and the value is a collection of report information for that run.
946 message : `str`
947 Message to be printed with the summary report.
948 """
949 wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
950 if wms_workflow_id == MISSING_ID:
951 run_reports = {}
952 else:
953 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
954 return run_reports, message
957def _report_from_id(wms_workflow_id, hist, schedds=None):
958 """Gather run information using workflow id.
960 Parameters
961 ----------
962 wms_workflow_id : `str`
963 Limit to specific run based on id.
964 hist : `float`
965 Limit history search to this many days.
966 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
967 HTCondor schedulers to query for job information. If None
968 (default), all queries will be run against the local scheduler only.
970 Returns
971 -------
972 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
973 Run information for the detailed report. The key is the HTCondor id
974 and the value is a collection of report information for that run.
975 message : `str`
976 Message to be printed with the summary report.
977 """
978 messages = []
980 # Collect information about the job by querying HTCondor schedd and
981 # HTCondor history.
982 schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds)
983 if len(schedd_dag_info) == 1:
984 # Extract the DAG info without altering the results of the query.
985 schedd_name = next(iter(schedd_dag_info))
986 dag_id = next(iter(schedd_dag_info[schedd_name]))
987 dag_ad = schedd_dag_info[schedd_name][dag_id]
989 # If the provided workflow id does not correspond to the one extracted
990 # from the DAGMan log file in the submit directory, rerun the query
991 # with the id found in the file.
992 #
993 # This is to cover the situation in which the user provided the old job
994 # id of a restarted run.
995 try:
996 path_dag_id, path_dag_ad = read_dag_log(dag_ad["Iwd"])
997 except FileNotFoundError as exc:
998 # At the moment missing DAGMan log is pretty much a fatal error.
999 # So empty the DAG info to finish early (see the if statement
1000 # below).
1001 schedd_dag_info.clear()
1002 messages.append(f"Cannot create the report for '{dag_id}': {exc}")
1003 else:
1004 if path_dag_id != dag_id:
1005 schedd_dag_info = _get_info_from_schedd(path_dag_id, hist, schedds)
1006 messages.append(
1007 f"WARNING: Found newer workflow executions in same submit directory as id '{dag_id}'. "
1008 "This normally occurs when a run is restarted. The report shown is for the most "
1009 f"recent status with run id '{path_dag_id}'"
1010 )
1012 if len(schedd_dag_info) == 0:
1013 run_reports = {}
1014 elif len(schedd_dag_info) == 1:
1015 _, dag_info = schedd_dag_info.popitem()
1016 dag_id, dag_ad = dag_info.popitem()
1018 # Create a mapping between jobs and their classads. The keys will
1019 # be of format 'ClusterId.ProcId'.
1020 job_info = {dag_id: dag_ad}
1022 # Find jobs (nodes) belonging to that DAGMan job.
1023 job_constraint = f"DAGManJobId == {int(float(dag_id))}"
1024 schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds)
1025 if schedd_job_info:
1026 _, node_info = schedd_job_info.popitem()
1027 job_info.update(node_info)
1029 # Collect additional pieces of information about jobs using HTCondor
1030 # files in the submission directory.
1031 _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"])
1032 _update_jobs(job_info, path_jobs)
1033 if message:
1034 messages.append(message)
1035 run_reports = _create_detailed_report_from_jobs(dag_id, job_info)
1036 else:
1037 ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()]
1038 message = (
1039 f"More than one job matches id '{wms_workflow_id}', "
1040 f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids"
1041 )
1042 messages.append(message)
1043 run_reports = {}
1045 message = "\n".join(messages)
1046 return run_reports, message
1049def _get_info_from_schedd(wms_workflow_id, hist, schedds):
1050 """Gather run information from HTCondor.
1052 Parameters
1053 ----------
1054 wms_workflow_id : `str`
1055 Limit to specific run based on id.
1056 hist : `float`
1057 Limit history search to this many days.
1058 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
1059 HTCondor schedulers to query for job information. If None
1060 (default), all queries will be run against the local scheduler only.
1062 Returns
1063 -------
1064 schedd_dag_info : `dict` [`str`, `dict` [`str`, `dict` [`str` Any]]]
1065 Information about jobs satisfying the search criteria where for each
1066 Scheduler, local HTCondor job ids are mapped to their respective
1067 classads.
1068 """
1069 dag_constraint = 'regexp("dagman$", Cmd)'
1070 try:
1071 cluster_id = int(float(wms_workflow_id))
1072 except ValueError:
1073 dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"'
1074 else:
1075 dag_constraint += f" && ClusterId == {cluster_id}"
1077 # With the current implementation of the condor_* functions the query
1078 # will always return only one match per Scheduler.
1079 #
1080 # Even in the highly unlikely situation where HTCondor history (which
1081 # condor_search queries too) is long enough to have jobs from before
1082 # the cluster ids were rolled over (and as a result there is more than
1083 # one job with the same cluster id) they will not show up in
1084 # the results.
1085 schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds)
1086 return schedd_dag_info
1089def _get_info_from_path(wms_path):
1090 """Gather run information from a given run directory.
1092 Parameters
1093 ----------
1094 wms_path : `str`
1095 Directory containing HTCondor files.
1097 Returns
1098 -------
1099 wms_workflow_id : `str`
1100 The run id which is a DAGman job id.
1101 jobs : `dict` [`str`, `dict` [`str`, `Any`]]
1102 Information about jobs read from files in the given directory.
1103 The key is the HTCondor id and the value is a dictionary of HTCondor
1104 keys and values.
1105 message : `str`
1106 Message to be printed with the summary report.
1107 """
1108 messages = []
1109 try:
1110 wms_workflow_id, jobs = read_dag_log(wms_path)
1111 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
1112 _update_jobs(jobs, read_node_status(wms_path))
1113 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)
1115 # Add more info for DAGman job
1116 job = jobs[wms_workflow_id]
1117 job.update(read_dag_status(wms_path))
1119 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
1120 if "bps_run" not in job:
1121 _add_run_info(wms_path, job)
1123 message = htc_check_dagman_output(wms_path)
1124 if message:
1125 messages.append(message)
1126 _LOG.debug(
1127 "_get_info: id = %s, total_jobs = %s", wms_workflow_id, jobs[wms_workflow_id]["total_jobs"]
1128 )
1130 # Add extra pieces of information which cannot be found in HTCondor
1131 # generated files like 'GlobalJobId'.
1132 #
1133 # Do not treat absence of this file as a serious error. Neither runs
1134 # submitted with earlier versions of the plugin nor the runs submitted
1135 # with Pegasus plugin will have it at the moment. However, once enough
1136 # time passes and the Pegasus plugin has its own report() method
1137 # (instead of sneakily using HTCondor's one), the lack of that file
1138 # should be treated as seriously as lack of any other file.
1139 try:
1140 job_info = read_dag_info(wms_path)
1141 except FileNotFoundError as exc:
1142 message = f"Warn: Some information may not be available: {exc}"
1143 messages.append(message)
1144 else:
1145 schedd_name = next(iter(job_info))
1146 job_ad = next(iter(job_info[schedd_name].values()))
1147 job.update(job_ad)
1148 except FileNotFoundError:
1149 message = f"Could not find HTCondor files in '{wms_path}'"
1150 _LOG.warning(message)
1151 messages.append(message)
1152 wms_workflow_id = MISSING_ID
1153 jobs = {}
1155 message = "\n".join([msg for msg in messages if msg])
1156 return wms_workflow_id, jobs, message
1159def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
1160 """Gather run information to be used in generating summary reports.
1162 Parameters
1163 ----------
1164 wms_workflow_id : `str`
1165 The run id to create the report for.
1166 jobs : `dict` [`str`, `dict` [`str`, Any]]
1167 Mapping HTCondor job id to job information.
1169 Returns
1170 -------
1171 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
1172 Run information for the detailed report. The key is the given HTCondor
1173 id and the value is a collection of report information for that run.
1174 """
1175 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
1176 dag_job = jobs[wms_workflow_id]
1177 report = WmsRunReport(
1178 wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}",
1179 global_wms_id=dag_job.get("GlobalJobId", "MISS"),
1180 path=dag_job["Iwd"],
1181 label=dag_job.get("bps_job_label", "MISS"),
1182 run=dag_job.get("bps_run", "MISS"),
1183 project=dag_job.get("bps_project", "MISS"),
1184 campaign=dag_job.get("bps_campaign", "MISS"),
1185 payload=dag_job.get("bps_payload", "MISS"),
1186 operator=_get_owner(dag_job),
1187 run_summary=_get_run_summary(dag_job),
1188 state=_htc_status_to_wms_state(dag_job),
1189 jobs=[],
1190 total_number_jobs=dag_job["total_jobs"],
1191 job_state_counts=dag_job["state_counts"],
1192 )
1194 for job_id, job_info in jobs.items():
1195 try:
1196 if job_info["ClusterId"] != int(float(wms_workflow_id)):
1197 job_report = WmsJobReport(
1198 wms_id=job_id,
1199 name=job_info.get("DAGNodeName", job_id),
1200 label=job_info.get("bps_job_label", pegasus_name_to_label(job_info["DAGNodeName"])),
1201 state=_htc_status_to_wms_state(job_info),
1202 )
1203 if job_report.label == "init":
1204 job_report.label = "pipetaskInit"
1205 report.jobs.append(job_report)
1206 except KeyError as ex:
1207 _LOG.error("Job missing key '%s': %s", str(ex), job_info)
1208 raise
1210 run_reports = {report.wms_id: report}
1211 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
1212 return run_reports
1215def _summary_report(user, hist, pass_thru, schedds=None):
1216 """Gather run information to be used in generating summary reports.
1218 Parameters
1219 ----------
1220 user : `str`
1221 Run lookup restricted to given user.
1222 hist : `float`
1223 How many previous days to search for run information.
1224 pass_thru : `str`
1225 Advanced users can define the HTCondor constraint to be used
1226 when searching queue and history.
1228 Returns
1229 -------
1230 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
1231 Run information for the summary report. The keys are HTCondor ids and
1232 the values are collections of report information for each run.
1233 message : `str`
1234 Message to be printed with the summary report.
1235 """
1236 # Only doing a summary report, so only look for DAGMan jobs.
1237 if pass_thru:
1238 constraint = pass_thru
1239 else:
1240 # Notes:
1241 # * bps_isjob == 'True' isn't getting set for DAG jobs that are
1242 # manually restarted.
1243 # * Any job with DAGManJobID isn't a DAG job
1244 constraint = 'bps_isjob == "True" && JobUniverse == 7'
1245 if user:
1246 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'
1248 job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds)
1250 # Have list of DAGMan jobs, need to get run_report info.
1251 run_reports = {}
1252 for jobs in job_info.values():
1253 for job_id, job in jobs.items():
1254 total_jobs, state_counts = _get_state_counts_from_dag_job(job)
1255 # If the counts didn't come from the queue information (e.g., Kerberos
1256 # bug), try reading them from a file.
1257 if total_jobs == 0:
1258 try:
1259 job.update(read_dag_status(job["Iwd"]))
1260 total_jobs, state_counts = _get_state_counts_from_dag_job(job)
1261 except StopIteration:
1262 pass  # Don't kill the report if the HTCondor files can't be found.
1264 if "bps_run" not in job:
1265 _add_run_info(job["Iwd"], job)
1266 report = WmsRunReport(
1267 wms_id=job_id,
1268 global_wms_id=job["GlobalJobId"],
1269 path=job["Iwd"],
1270 label=job.get("bps_job_label", "MISS"),
1271 run=job.get("bps_run", "MISS"),
1272 project=job.get("bps_project", "MISS"),
1273 campaign=job.get("bps_campaign", "MISS"),
1274 payload=job.get("bps_payload", "MISS"),
1275 operator=_get_owner(job),
1276 run_summary=_get_run_summary(job),
1277 state=_htc_status_to_wms_state(job),
1278 jobs=[],
1279 total_number_jobs=total_jobs,
1280 job_state_counts=state_counts,
1281 )
1282 run_reports[report.global_wms_id] = report
1284 return run_reports, ""
1287def _add_run_info(wms_path, job):
1288 """Find BPS run information elsewhere for runs without bps attributes.
1290 Parameters
1291 ----------
1292 wms_path : `str`
1293 Path to submit files for the run.
1294 job : `dict` [`str`, `Any`]
1295 HTCondor dag job information.
1297 Raises
1298 ------
1299 StopIteration
1300 If the file being looked for cannot be found. Permission errors are
1301 caught and the job's run is marked with an error.
1302 """
1303 path = Path(wms_path) / "jobs"
1304 try:
1305 subfile = next(path.glob("**/*.sub"))
1306 except (StopIteration, PermissionError):
1307 job["bps_run"] = "Unavailable"
1308 else:
1309 _LOG.debug("_add_run_info: subfile = %s", subfile)
1310 try:
1311 with open(subfile, "r", encoding="utf-8") as fh:
1312 for line in fh:
1313 if line.startswith("+bps_"):
1314 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
1315 if m:
1316 _LOG.debug("Matching line: %s", line)
1317 job[m.group(1)] = m.group(2).replace('"', "")
1318 else:
1319 _LOG.debug("Could not parse attribute: %s", line)
1320 except PermissionError:
1321 job["bps_run"] = "PermissionError"
1322 _LOG.debug("After adding job = %s", job)
1325def _get_owner(job):
1326 """Get the owner of a dag job.
1328 Parameters
1329 ----------
1330 job : `dict` [`str`, `Any`]
1331 HTCondor dag job information.
1333 Returns
1334 -------
1335 owner : `str`
1336 Owner of the dag job.
1337 """
1338 owner = job.get("bps_operator", None)
1339 if not owner:
1340 owner = job.get("Owner", None)
1341 if not owner:
1342 _LOG.warning("Could not get Owner from htcondor job: %s", job)
1343 owner = "MISS"
1344 return owner
1347def _get_run_summary(job):
1348 """Get the run summary for a job.
1350 Parameters
1351 ----------
1352 job : `dict` [`str`, `Any`]
1353 HTCondor dag job information.
1355 Returns
1356 -------
1357 summary : `str`
1358 Number of jobs per PipelineTask label in approximate pipeline order.
1359 Format: <label>:<count>[;<label>:<count>]+
1360 """
1361 summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
1362 if not summary:
1363 summary, _ = summary_from_dag(job["Iwd"])
1364 if not summary:
1365 _LOG.warning("Could not get run summary for htcondor job: %s", job)
1366 _LOG.debug("_get_run_summary: summary=%s", summary)
1368 # Workaround for summaries sometimes using init vs pipetaskInit.
1369 summary = summary.replace("init:", "pipetaskInit:")
1371 if "pegasus_version" in job and "pegasus" not in summary:
1372 summary += ";pegasus:0"
1374 return summary
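# Illustrative sketch (hypothetical labels): a returned summary string looks
# like "pipetaskInit:1;isr:100;characterizeImage:100;finalJob:1", i.e., one
# <label>:<count> pair per PipelineTask label in approximate pipeline order.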
1377def _get_state_counts_from_jobs(wms_workflow_id, jobs):
1378 """Count number of jobs per WMS state.
1380 Parameters
1381 ----------
1382 wms_workflow_id : `str`
1383 HTCondor job id.
1384 jobs : `dict` [`str`, `Any`]
1385 HTCondor dag job information.
1387 Returns
1388 -------
1389 total_count : `int`
1390 Total number of dag nodes.
1391 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
1392 Keys are the different WMS states and values are counts of jobs
1393 that are in that WMS state.
1394 """
1395 state_counts = dict.fromkeys(WmsStates, 0)
1397 for jid, jinfo in jobs.items():
1398 if jid != wms_workflow_id:
1399 state_counts[_htc_status_to_wms_state(jinfo)] += 1
1401 total_counted = sum(state_counts.values())
1402 if "NodesTotal" in jobs[wms_workflow_id]:
1403 total_count = jobs[wms_workflow_id]["NodesTotal"]
1404 else:
1405 total_count = total_counted
1407 state_counts[WmsStates.UNREADY] += total_count - total_counted
1409 return total_count, state_counts
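# Illustrative sketch (hypothetical ids and statuses): for a DAG job "10.0"
# with NodesTotal = 3 and two node jobs, one reported as running and one as
# done, the call
#     _get_state_counts_from_jobs("10.0", jobs)
# would return total_count = 3 with state_counts showing one RUNNING job, one
# SUCCEEDED job, and the remaining, uncounted node tallied as UNREADY.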
1412def _get_state_counts_from_dag_job(job):
1413 """Count number of jobs per WMS state.
1415 Parameters
1416 ----------
1417 job : `dict` [`str`, `Any`]
1418 HTCondor dag job information.
1420 Returns
1421 -------
1422 total_count : `int`
1423 Total number of dag nodes.
1424 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
1425 Keys are the different WMS states and values are counts of jobs
1426 that are in that WMS state.
1427 """
1428 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
1429 state_counts = dict.fromkeys(WmsStates, 0)
1430 if "DAG_NodesReady" in job:
1431 state_counts = {
1432 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
1433 WmsStates.READY: job.get("DAG_NodesReady", 0),
1434 WmsStates.HELD: job.get("JobProcsHeld", 0),
1435 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
1436 WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
1437 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0),
1438 }
1439 total_jobs = job.get("DAG_NodesTotal")
1440 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
1441 elif "NodesFailed" in job:
1442 state_counts = {
1443 WmsStates.UNREADY: job.get("NodesUnready", 0),
1444 WmsStates.READY: job.get("NodesReady", 0),
1445 WmsStates.HELD: job.get("JobProcsHeld", 0),
1446 WmsStates.SUCCEEDED: job.get("NodesDone", 0),
1447 WmsStates.FAILED: job.get("NodesFailed", 0),
1448 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0),
1449 }
1450 try:
1451 total_jobs = job.get("NodesTotal")
1452 except KeyError as ex:
1453 _LOG.error("Job missing %s. job = %s", str(ex), job)
1454 raise
1455 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
1456 else:
1457 # With Kerberos job auth and the Kerberos bug, a warning here would be
1458 # printed for every DAG, so log at debug level instead.
1459 _LOG.debug("Can't get job state counts %s", job["Iwd"])
1460 total_jobs = 0
1462 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
1463 return total_jobs, state_counts
1466def _htc_status_to_wms_state(job):
1467 """Convert HTCondor job status to generic wms state.
1469 Parameters
1470 ----------
1471 job : `dict` [`str`, `Any`]
1472 HTCondor job information.
1474 Returns
1475 -------
1476 wms_state : `WmsStates`
1477 The equivalent WmsState to given job's status.
1478 """
1479 wms_state = WmsStates.MISFIT
1480 if "JobStatus" in job:
1481 wms_state = _htc_job_status_to_wms_state(job)
1482 elif "NodeStatus" in job:
1483 wms_state = _htc_node_status_to_wms_state(job)
1484 return wms_state
1487def _htc_job_status_to_wms_state(job):
1488 """Convert HTCondor job status to generic wms state.
1490 Parameters
1491 ----------
1492 job : `dict` [`str`, `Any`]
1493 HTCondor job information.
1495 Returns
1496 -------
1497 wms_state : `lsst.ctrl.bps.WmsStates`
1498 The equivalent WmsState to given job's status.
1499 """
1500 _LOG.debug(
1501 "htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], type(job["JobStatus"])
1502 )
1503 job_status = int(job["JobStatus"])
1504 wms_state = WmsStates.MISFIT
1506 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
1507 if job_status == JobStatus.IDLE:
1508 wms_state = WmsStates.PENDING
1509 elif job_status == JobStatus.RUNNING:
1510 wms_state = WmsStates.RUNNING
1511 elif job_status == JobStatus.REMOVED:
1512 wms_state = WmsStates.DELETED
1513 elif job_status == JobStatus.COMPLETED:
1514 if (
1515 job.get("ExitBySignal", False)
1516 or job.get("ExitCode", 0)
1517 or job.get("ExitSignal", 0)
1518 or job.get("DAG_Status", 0)
1519 or job.get("ReturnValue", 0)
1520 ):
1521 wms_state = WmsStates.FAILED
1522 else:
1523 wms_state = WmsStates.SUCCEEDED
1524 elif job_status == JobStatus.HELD:
1525 wms_state = WmsStates.HELD
1527 return wms_state
1530def _htc_node_status_to_wms_state(job):
1531 """Convert HTCondor status to generic wms state.
1533 Parameters
1534 ----------
1535 job : `dict` [`str`, `Any`]
1536 HTCondor job information.
1538 Returns
1539 -------
1540 wms_state : `lsst.ctrl.bps.WmsStates`
1541 The equivalent WmsState to given node's status.
1542 """
1543 wms_state = WmsStates.MISFIT
1545 status = job["NodeStatus"]
1546 if status == NodeStatus.NOT_READY:
1547 wms_state = WmsStates.UNREADY
1548 elif status == NodeStatus.READY:
1549 wms_state = WmsStates.READY
1550 elif status == NodeStatus.PRERUN:
1551 wms_state = WmsStates.MISFIT
1552 elif status == NodeStatus.SUBMITTED:
1553 if job["JobProcsHeld"]:
1554 wms_state = WmsStates.HELD
1555 elif job["StatusDetails"] == "not_idle":
1556 wms_state = WmsStates.RUNNING
1557 elif job["JobProcsQueued"]:
1558 wms_state = WmsStates.PENDING
1559 elif status == NodeStatus.POSTRUN:
1560 wms_state = WmsStates.MISFIT
1561 elif status == NodeStatus.DONE:
1562 wms_state = WmsStates.SUCCEEDED
1563 elif status == NodeStatus.ERROR:
1564 # Use the job exit status instead of the post script exit status.
1565 if "DAGMAN error 0" in job["StatusDetails"]:
1566 wms_state = WmsStates.SUCCEEDED
1567 else:
1568 wms_state = WmsStates.FAILED
1570 return wms_state
1573def _update_jobs(jobs1, jobs2):
1574 """Update jobs1 with info in jobs2.
1576 (Basically an update for nested dictionaries.)
1578 Parameters
1579 ----------
1580 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
1581 HTCondor job information to be updated.
1582 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
1583 Additional HTCondor job information.
1584 """
1585 for jid, jinfo in jobs2.items():
1586 if jid in jobs1:
1587 jobs1[jid].update(jinfo)
1588 else:
1589 jobs1[jid] = jinfo
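# Illustrative sketch (hypothetical ids): the nested update merges per-job
# classad fragments, e.g.
#     jobs1 = {"1.0": {"JobStatus": 2}}
#     jobs2 = {"1.0": {"NodeStatus": 5}, "2.0": {"JobStatus": 1}}
#     _update_jobs(jobs1, jobs2)
#     # jobs1 == {"1.0": {"JobStatus": 2, "NodeStatus": 5}, "2.0": {"JobStatus": 1}}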
1592def _wms_id_type(wms_id):
1593 """Determine the type of the WMS id.
1595 Parameters
1596 ----------
1597 wms_id : `str`
1598 WMS id identifying a job.
1600 Returns
1601 -------
1602 id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
1603 Type of WMS id.
1604 """
1605 try:
1606 int(float(wms_id))
1607 except ValueError:
1608 wms_path = Path(wms_id)
1609 if wms_path.exists():
1610 id_type = WmsIdType.PATH
1611 else:
1612 id_type = WmsIdType.GLOBAL
1613 except TypeError:
1614 id_type = WmsIdType.UNKNOWN
1615 else:
1616 id_type = WmsIdType.LOCAL
1617 return id_type
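# Illustrative sketch (hypothetical ids): anything parsable as a number is
# treated as a local ClusterId, an existing filesystem path as a submit
# directory, and any other string as a global job id, e.g.
#     _wms_id_type("1234.0")                      # -> WmsIdType.LOCAL
#     _wms_id_type("/existing/submit/dir")        # -> WmsIdType.PATH
#     _wms_id_type("sched1#1234.0#1612345678")    # -> WmsIdType.GLOBAL
#     _wms_id_type(None)                          # -> WmsIdType.UNKNOWN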
1620def _wms_id_to_cluster(wms_id):
1621 """Convert WMS id to cluster id.
1623 Parameters
1624 ----------
1625 wms_id : `int` or `float` or `str`
1626 HTCondor job id or path.
1628 Returns
1629 -------
1630 schedd_ad : `classad.ClassAd`
1631 ClassAd describing the scheduler managing the job with the given id.
1632 cluster_id : `int`
1633 HTCondor cluster id.
1634 id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
1635 The type of the provided id.
1636 """
1637 coll = htcondor.Collector()
1639 schedd_ad = None
1640 cluster_id = None
1641 id_type = _wms_id_type(wms_id)
1642 if id_type == WmsIdType.LOCAL:
1643 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
1644 cluster_id = int(float(wms_id))
1645 elif id_type == WmsIdType.GLOBAL:
1646 constraint = f'GlobalJobId == "{wms_id}"'
1647 schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)}
1648 schedds = [htcondor.Schedd(ad) for ad in schedd_ads.values()]
1649 queries = [schedd.xquery(requirements=constraint, projection=["ClusterId"]) for schedd in schedds]
1650 results = {
1651 query.tag(): dict(ads[0])
1652 for query in htcondor.poll(queries)
1653 if (ads := query.nextAdsNonBlocking())
1654 }
1655 if results:
1656 schedd_name = next(iter(results))
1657 schedd_ad = schedd_ads[schedd_name]
1658 cluster_id = results[schedd_name]["ClusterId"]
1659 elif id_type == WmsIdType.PATH:
1660 try:
1661 job_info = read_dag_info(wms_id)
1662 except (FileNotFoundError, PermissionError, IOError):
1663 pass
1664 else:
1665 schedd_name = next(iter(job_info))
1666 job_id = next(iter(job_info[schedd_name]))
1667 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name)
1668 cluster_id = int(float(job_id))
1669 else:
1670 pass
1671 return schedd_ad, cluster_id, id_type
1674def _create_periodic_release_expr(memory, multiplier, limit):
1675 """Construct an HTCondorAd expression for releasing held jobs.
1677 The expression instructs HTCondor to release back to the job queue any job
1678 which was put on hold due to exceeding memory requirements, provided it
1679 satisfies all of the conditions below:
1681 * number of run attempts did not reach allowable number of retries,
1682 * the memory requirements in the last failed run attempt did not reach
1683 the specified memory limit.
1685 Parameters
1686 ----------
1687 memory : `int`
1688 Requested memory in MB.
1689 multiplier : `float`
1690 Memory growth rate between retries.
1691 limit : `int`
1692 Memory limit in MB.
1694 Returns
1695 -------
1696 expr : `str`
1697 A string representing an HTCondor ClassAd expression for releasing jobs
1698 which have been held due to exceeding the memory requirements.
1699 """
1700 is_retry_allowed = "NumJobStarts <= JobMaxRetries"
1701 was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
1703 # Job ClassAd attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
1704 # UNDEFINED if the job is not HELD (i.e. when 'JobStatus' is not 5).
1705 # The special comparison operators ensure that all comparisons below will
1706 # evaluate to FALSE in this case.
1707 #
1708 # Note:
1709 # May not be strictly necessary. Operators '&&' and '||' are not strict so
1710 # the entire expression should evaluate to FALSE when the job is not HELD.
1711 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
1712 # but better safe than sorry.
1713 was_mem_exceeded = (
1714 "JobStatus == 5 "
1715 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
1716 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
1717 )
1719 expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}"
1720 return expr
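# For illustration (not part of the original module): with hypothetical values
# memory=2048, multiplier=2.0, and limit=8192, the function returns the
# following periodic_release expression (wrapped here for readability):
#
#     JobStatus == 5
#     && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#         || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#     && NumJobStarts <= JobMaxRetries
#     && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) < 8192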
1723def _create_periodic_remove_expr(memory, multiplier, limit):
1724 """Construct an HTCondorAd expression for removing jobs from the queue.
1726 The expression instructs HTCondor to remove from the job queue any job which
1727 was put on hold due to exceeding its memory requirements, provided it
1728 satisfies any of the conditions below:
1730 * the allowable number of retries was exhausted,
1731 * the memory request during the last failed run attempt already reached
1732 the specified memory limit.
1734 Parameters
1735 ----------
1736 memory : `int`
1737 Requested memory in MB.
1738 multiplier : `float`
1739 Memory growth rate between retires.
1740 limit : `int`
1741 Memory limit in MB.
1743 Returns
1744 -------
1745 expr : `str`
1746 A string representing an HTCondor ClassAd expression for removing held
1747 jobs which exceeded their memory requirements and can no longer be
1748 retried with more memory.
1749 """
1750 is_retry_disallowed = "NumJobStarts > JobMaxRetries"
1751 was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
1753 # Job ClassAd attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
1754 # UNDEFINED if the job is not HELD (i.e. when 'JobStatus' is not 5).
1755 # The special comparison operators ensure that all comparisons below will
1756 # evaluate to FALSE in this case.
1757 #
1758 # Note:
1759 # May not be strictly necessary. Operators '&&' and '||' are not strict so
1760 # the entire expression should evaluate to FALSE when the job is not HELD.
1761 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
1762 # but better safe than sorry.
1763 was_mem_exceeded = (
1764 "JobStatus == 5 "
1765 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
1766 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
1767 )
1769 expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})"
1770 return expr
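# For illustration (not part of the original module): with the same
# hypothetical values (memory=2048, multiplier=2.0, limit=8192) the
# periodic_remove expression shares the hold-reason test shown above for
# the release expression, but negates the retry conditions, ending with
# (wrapped for readability):
#
#     && (NumJobStarts > JobMaxRetries
#         || min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) == 8192)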
1773def _create_request_memory_expr(memory, multiplier, limit):
1774 """Construct an HTCondor ClassAd expression for safe memory scaling.
1776 Parameters
1777 ----------
1778 memory : `int`
1779 Requested memory in MB.
1780 multiplier : `float`
1781 Memory growth rate between retries.
1782 limit : `int`
1783 Memory limit in MB.
1785 Returns
1786 -------
1787 expr : `str`
1788 A string representing an HTCondor ClassAd expression enabling safe
1789 memory scaling between job retries.
1790 """
1791 # The check whether the job was held due to exceeding memory requirements
1792 # is made *after* the job was released back to the job queue (i.e. it is in
1793 # the IDLE state), hence the need to use `Last*` job ClassAd attributes
1794 # instead of the ones describing the job's current state.
1795 #
1796 # Also, 'Last*' job ClassAd attributes are UNDEFINED when a job is
1797 # initially put in the job queue. The special comparison operators ensure
1798 # that all comparisons below will evaluate to FALSE in this case.
1799 was_mem_exceeded = (
1800 "LastJobStatus =?= 5 "
1801 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 "
1802 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"
1803 )
1805 # If the job is running for the first time or was held for reasons other
1806 # than exceeding memory, set the required memory to the greater of
1807 # the requested value and the memory usage measured by HTCondor
1808 # (MemoryUsage).
1809 expr = (
1810 f"({was_mem_exceeded}) "
1811 f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) "
1812 f": max({{{memory}, MemoryUsage ?: 0}})"
1813 )
1814 return expr
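# For illustration (not part of the original module): with hypothetical values
# memory=2048, multiplier=2.0, and limit=8192, the resulting request_memory
# expression is (wrapped for readability):
#
#     (LastJobStatus =?= 5
#      && (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0
#          || LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34))
#     ? min({int(2048 * pow(2.0, NumJobStarts)), 8192})
#     : max({2048, MemoryUsage ?: 0})
#
# i.e. scale the request only when the previous attempt was held for exceeding
# memory; otherwise request the larger of the configured value and the last
# measured MemoryUsage.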
1817def _locate_schedds(locate_all=False):
1818 """Find out Scheduler daemons in an HTCondor pool.
1820 Parameters
1821 ----------
1822 locate_all : `bool`, optional
1823 If True, all available Schedulers in the HTCondor pool will be located.
1824 False by default, which means that the search will be limited to
1825 the Scheduler running on the local host.
1827 Returns
1828 -------
1829 schedds : `dict` [`str`, `htcondor.Schedd`]
1830 A mapping between Scheduler names and the Python objects used to
1831 interact with them.
1832 """
1833 coll = htcondor.Collector()
1835 schedd_ads = []
1836 if locate_all:
1837 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
1838 else:
1839 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
1840 return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}
1843def _gather_site_values(config, compute_site):
1844 """Gather values specific to given site.
1846 Parameters
1847 ----------
1848 config : `lsst.ctrl.bps.BpsConfig`
1849 BPS configuration that includes necessary submit/runtime
1850 information.
1851 compute_site : `str`
1852 Compute site name.
1854 Returns
1855 -------
1856 site_values : `dict` [`str`, `Any`]
1857 Values specific to the given site.
1858 """
1859 site_values = {"attrs": {}, "profile": {}}
1860 search_opts = {}
1861 if compute_site:
1862 search_opts["curvals"] = {"curr_site": compute_site}
1864 # Determine the hard limit for the memory requirement.
1865 found, limit = config.search("memoryLimit", opt=search_opts)
1866 if not found:
1867 search_opts["default"] = DEFAULT_HTC_EXEC_PATT
1868 _, patt = config.search("executeMachinesPattern", opt=search_opts)
1869 del search_opts["default"]
1871 # To reduce the amount of data, ignore dynamic slots (if any) as,
1872 # by definition, they cannot have more memory than
1873 # the partitionable slot they are part of.
1874 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)'
1875 pool_info = condor_status(constraint=constraint)
1876 try:
1877 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values())
1878 except ValueError:
1879 _LOG.debug("No execute machine in the pool matches %s", patt)
1880 if limit:
1881 config[".bps_defined.memory_limit"] = limit
1883 _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False})
1884 site_values["memoryLimit"] = limit
1886 found, value = config.search("accountingGroup", opt=search_opts)
1887 if found:
1888 site_values["accountingGroup"] = value
1889 found, value = config.search("accountingUser", opt=search_opts)
1890 if found:
1891 site_values["accountingUser"] = value
1893 key = f".site.{compute_site}.profile.condor"
1894 if key in config:
1895 for key, val in config[key].items():
1896 if key.startswith("+"):
1897 site_values["attrs"][key[1:]] = val
1898 else:
1899 site_values["profile"][key] = val
1901 return site_values
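# For illustration (not part of the original module): with a hypothetical
# compute site "example_site" whose '.site.example_site.profile.condor'
# section defines '+JobNote: bps run' and
# 'requirements: (OpSysAndVer == "CentOS7")', the returned mapping would
# look roughly like:
#
#     {
#         "attrs": {"JobNote": "bps run"},
#         "profile": {"requirements": '(OpSysAndVer == "CentOS7")'},
#         "bpsUseShared": False,
#         "memoryLimit": 491520,   # hypothetical; from memoryLimit or the pool
#     }
#
# The 'accountingGroup' and 'accountingUser' keys are added only when they
# are set in the configuration.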