Coverage for python/lsst/ctrl/bps/htcondor/htcondor_service.py: 7%
693 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-28 09:48 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-28 09:48 +0000
1# This file is part of ctrl_bps_htcondor.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <https://www.gnu.org/licenses/>.
28"""Interface between generic workflow to HTCondor workflow system.
29"""
31__all__ = ["HTCondorService", "HTCondorWorkflow"]
34import logging
35import os
36import re
37from collections import defaultdict
38from enum import IntEnum, auto
39from pathlib import Path
41import htcondor
42from lsst.ctrl.bps import (
43 BaseWmsService,
44 BaseWmsWorkflow,
45 GenericWorkflow,
46 GenericWorkflowJob,
47 WmsJobReport,
48 WmsRunReport,
49 WmsStates,
50)
51from lsst.ctrl.bps.bps_utils import chdir, create_count_summary
52from lsst.utils.timer import time_this
53from packaging import version
55from .lssthtc import (
56 MISSING_ID,
57 HTCDag,
58 HTCJob,
59 JobStatus,
60 NodeStatus,
61 condor_q,
62 condor_search,
63 condor_status,
64 htc_backup_files,
65 htc_check_dagman_output,
66 htc_create_submit_from_cmd,
67 htc_create_submit_from_dag,
68 htc_create_submit_from_file,
69 htc_escape,
70 htc_submit_dag,
71 htc_version,
72 pegasus_name_to_label,
73 read_dag_info,
74 read_dag_log,
75 read_dag_status,
76 read_node_status,
77 summary_from_dag,
78 write_dag_info,
79)
82class WmsIdType(IntEnum):
83 """Type of valid WMS ids."""
85 UNKNOWN = auto()
86 """The type of id cannot be determined.
87 """
89 LOCAL = auto()
90 """The id is HTCondor job's ClusterId (with optional '.ProcId').
91 """
93 GLOBAL = auto()
94 """Id is a HTCondor's global job id.
95 """
97 PATH = auto()
98 """Id is a submission path.
99 """
102DEFAULT_HTC_EXEC_PATT = ".*worker.*"
103"""Default pattern for searching execute machines in an HTCondor pool.
104"""
106_LOG = logging.getLogger(__name__)
109class HTCondorService(BaseWmsService):
110 """HTCondor version of WMS service."""
112 def prepare(self, config, generic_workflow, out_prefix=None):
113 """Convert generic workflow to an HTCondor DAG ready for submission.
115 Parameters
116 ----------
117 config : `lsst.ctrl.bps.BpsConfig`
118 BPS configuration that includes necessary submit/runtime
119 information.
120 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
121 The generic workflow (e.g., has executable name and arguments).
122 out_prefix : `str`
123 The root directory into which all WMS-specific files are written.
125 Returns
126 -------
127 workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow`
128 HTCondor workflow ready to be run.
129 """
130 _LOG.debug("out_prefix = '%s'", out_prefix)
131 with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed HTCondor workflow creation"):
132 workflow = HTCondorWorkflow.from_generic_workflow(
133 config,
134 generic_workflow,
135 out_prefix,
136 f"{self.__class__.__module__}.{self.__class__.__name__}",
137 )
139 with time_this(
140 log=_LOG, level=logging.INFO, prefix=None, msg="Completed writing out HTCondor workflow"
141 ):
142 workflow.write(out_prefix)
143 return workflow
145 def submit(self, workflow):
146 """Submit a single HTCondor workflow.
148 Parameters
149 ----------
150 workflow : `lsst.ctrl.bps.BaseWorkflow`
151 A single HTCondor workflow to submit. run_id is updated after
152 successful submission to WMS.
153 """
154 dag = workflow.dag
156 ver = version.parse(htc_version())
157 if ver >= version.parse("8.9.3"):
158 sub = htc_create_submit_from_dag(dag.graph["dag_filename"], {})
159 else:
160 sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], {})
162 # For workflow portability, internal paths are all relative. Hence
163 # the DAG needs to be submitted to HTCondor from inside the submit
164 # directory.
165 with chdir(workflow.submit_path):
166 _LOG.info("Submitting from directory: %s", os.getcwd())
167 schedd_dag_info = htc_submit_dag(sub)
168 if schedd_dag_info:
169 write_dag_info(f"{dag.name}.info.json", schedd_dag_info)
171 _, dag_info = schedd_dag_info.popitem()
172 _, dag_ad = dag_info.popitem()
174 dag.run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}"
175 workflow.run_id = dag.run_id
176 else:
177 raise RuntimeError("Submission failed: unable to retrieve DAGMan job information")
179 def restart(self, wms_workflow_id):
180 """Restart a failed DAGMan workflow.
182 Parameters
183 ----------
184 wms_workflow_id : `str`
185 The directory with HTCondor files.
187 Returns
188 -------
189 run_id : `str`
190 HTCondor id of the restarted DAGMan job. If restart failed, it will
191 be set to None.
192 run_name : `str`
193 Name of the restarted workflow. If restart failed, it will be set
194 to None.
195 message : `str`
196 A message describing any issues encountered during the restart.
197 If there were no issues, an empty string is returned.
198 """
199 wms_path = Path(wms_workflow_id)
200 if not wms_path.is_dir():
201 return None, None, f"Directory '{wms_path}' not found"
203 _LOG.info("Restarting workflow from directory '%s'", wms_path)
204 rescue_dags = list(wms_path.glob("*.dag.rescue*"))
205 if not rescue_dags:
206 return None, None, f"HTCondor rescue DAG(s) not found in '{wms_path}'"
208 _LOG.info("Verifying that the workflow is not already in the job queue")
209 schedd_dag_info = condor_q(constraint=f'regexp("dagman$", Cmd) && Iwd == "{wms_workflow_id}"')
210 if schedd_dag_info:
211 _, dag_info = schedd_dag_info.popitem()
212 _, dag_ad = dag_info.popitem()
213 id_ = dag_ad["GlobalJobId"]
214 return None, None, f"Workflow already in the job queue (global job id: '{id_}')"
216 _LOG.info("Checking execution status of the workflow")
217 warn = False
218 dag_ad = read_dag_status(str(wms_path))
219 if dag_ad:
220 nodes_total = dag_ad.get("NodesTotal", 0)
221 if nodes_total != 0:
222 nodes_done = dag_ad.get("NodesDone", 0)
223 if nodes_total == nodes_done:
224 return None, None, "All jobs in the workflow finished successfully"
225 else:
226 warn = True
227 else:
228 warn = True
229 if warn:
230 _LOG.warning(
231 "Cannot determine the execution status of the workflow, continuing with restart regardless"
232 )
234 _LOG.info("Backing up select HTCondor files from previous run attempt")
235 htc_backup_files(wms_path, subdir="backups")
237 # For workflow portability, internal paths are all relative. Hence
238 # the DAG needs to be resubmitted to HTCondor from inside the submit
239 # directory.
240 _LOG.info("Adding workflow to the job queue")
241 run_id, run_name, message = None, None, ""
242 with chdir(wms_path):
243 try:
244 dag_path = next(wms_path.glob("*.dag.condor.sub"))
245 except StopIteration:
246 message = f"DAGMan submit description file not found in '{wms_path}'"
247 else:
248 sub = htc_create_submit_from_file(dag_path.name)
249 schedd_dag_info = htc_submit_dag(sub)
251 # Save select information about the DAGMan job to a file. Use
252 # the run name (available in the ClassAd) as the filename.
253 if schedd_dag_info:
254 dag_info = next(iter(schedd_dag_info.values()))
255 dag_ad = next(iter(dag_info.values()))
256 write_dag_info(f"{dag_ad['bps_run']}.info.json", schedd_dag_info)
257 run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}"
258 run_name = dag_ad["bps_run"]
259 else:
260 message = "DAGMan job information unavailable"
262 return run_id, run_name, message
264 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
265 """Query WMS for list of submitted WMS workflows/jobs.
267 This should be a quick lookup function to create list of jobs for
268 other functions.
270 Parameters
271 ----------
272 wms_id : `int` or `str`, optional
273 Id or path that can be used by WMS service to look up job.
274 user : `str`, optional
275 User whose submitted jobs should be listed.
276 require_bps : `bool`, optional
277 Whether to require jobs returned in list to be bps-submitted jobs.
278 pass_thru : `str`, optional
279 Information to pass through to WMS.
280 is_global : `bool`, optional
281 If set, all job queues (and their histories) will be queried for
282 job information. Defaults to False which means that only the local
283 job queue will be queried.
285 Returns
286 -------
287 job_ids : `list` [`Any`]
288 Only job ids to be used by cancel and other functions. Typically
289 this means top-level jobs (i.e., not children jobs).
290 """
291 _LOG.debug(
292 "list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s",
293 wms_id,
294 user,
295 require_bps,
296 pass_thru,
297 is_global,
298 )
300 # Determine which Schedds will be queried for job information.
301 coll = htcondor.Collector()
303 schedd_ads = []
304 if is_global:
305 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
306 else:
307 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
309 # Construct appropriate constraint expression using provided arguments.
310 constraint = "False"
311 if wms_id is None:
312 if user is not None:
313 constraint = f'(Owner == "{user}")'
314 else:
315 schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id)
316 if cluster_id is not None:
317 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"
319 # If provided id is either a submission path or a global id,
320 # make sure the right Schedd will be queried regardless of
321 # 'is_global' value.
322 if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}:
323 schedd_ads = [schedd_ad]
324 if require_bps:
325 constraint += ' && (bps_isjob == "True")'
326 if pass_thru:
327 if "-forcex" in pass_thru:
328 pass_thru_2 = pass_thru.replace("-forcex", "")
329 if pass_thru_2 and not pass_thru_2.isspace():
330 constraint += f" && ({pass_thru_2})"
331 else:
332 constraint += f" && ({pass_thru})"
334 # Create a list of scheduler daemons which need to be queried.
335 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}
337 _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds))
338 results = condor_q(constraint=constraint, schedds=schedds)
340 # Prune child jobs where DAG job is in queue (i.e., aren't orphans).
341 job_ids = []
342 for schedd_name, job_info in results.items():
343 for job_id, job_ad in job_info.items():
344 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None"))
345 if "DAGManJobId" not in job_ad:
346 job_ids.append(job_ad.get("GlobalJobId", job_id))
347 else:
348 _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0")
349 _LOG.debug("\tin jobs.keys() = %s", job_info.keys())
350 if f"{job_ad['DAGManJobId']}.0" not in job_info: # orphaned job
351 job_ids.append(job_ad.get("GlobalJobId", job_id))
353 _LOG.debug("job_ids = %s", job_ids)
354 return job_ids
356 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False):
357 """Return run information based upon given constraints.
359 Parameters
360 ----------
361 wms_workflow_id : `str`, optional
362 Limit to specific run based on id.
363 user : `str`, optional
364 Limit results to runs for this user.
365 hist : `float`, optional
366 Limit history search to this many days. Defaults to 0.
367 pass_thru : `str`, optional
368 Constraints to pass through to HTCondor.
369 is_global : `bool`, optional
370 If set, all job queues (and their histories) will be queried for
371 job information. Defaults to False which means that only the local
372 job queue will be queried.
374 Returns
375 -------
376 runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
377 Information about runs from given job information.
378 message : `str`
379 Extra message for report command to print. This could be pointers
380 to documentation or to WMS specific commands.
381 """
382 if wms_workflow_id:
383 id_type = _wms_id_type(wms_workflow_id)
384 if id_type == WmsIdType.LOCAL:
385 schedulers = _locate_schedds(locate_all=is_global)
386 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers)
387 elif id_type == WmsIdType.GLOBAL:
388 schedulers = _locate_schedds(locate_all=True)
389 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers)
390 elif id_type == WmsIdType.PATH:
391 run_reports, message = _report_from_path(wms_workflow_id)
392 else:
393 run_reports, message = {}, "Invalid job id"
394 else:
395 schedulers = _locate_schedds(locate_all=is_global)
396 run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers)
397 _LOG.debug("report: %s, %s", run_reports, message)
399 return list(run_reports.values()), message
401 def cancel(self, wms_id, pass_thru=None):
402 """Cancel submitted workflows/jobs.
404 Parameters
405 ----------
406 wms_id : `str`
407 Id or path of job that should be canceled.
408 pass_thru : `str`, optional
409 Information to pass through to WMS.
411 Returns
412 -------
413 deleted : `bool`
414 Whether successful deletion or not. Currently, if any doubt or any
415 individual jobs not deleted, return False.
416 message : `str`
417 Any message from WMS (e.g., error details).
418 """
419 _LOG.debug("Canceling wms_id = %s", wms_id)
421 schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id)
423 if cluster_id is None:
424 deleted = False
425 message = "invalid id"
426 else:
427 _LOG.debug(
428 "Canceling job managed by schedd_name = %s with cluster_id = %s",
429 cluster_id,
430 schedd_ad["Name"],
431 )
432 schedd = htcondor.Schedd(schedd_ad)
434 constraint = f"ClusterId == {cluster_id}"
435 if pass_thru is not None and "-forcex" in pass_thru:
436 pass_thru_2 = pass_thru.replace("-forcex", "")
437 if pass_thru_2 and not pass_thru_2.isspace():
438 constraint += f"&& ({pass_thru_2})"
439 _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
440 results = schedd.act(htcondor.JobAction.RemoveX, constraint)
441 else:
442 if pass_thru:
443 constraint += f"&& ({pass_thru})"
444 _LOG.debug("JobAction.Remove constraint = %s", constraint)
445 results = schedd.act(htcondor.JobAction.Remove, constraint)
446 _LOG.debug("Remove results: %s", results)
448 if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
449 deleted = True
450 message = ""
451 else:
452 deleted = False
453 if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
454 message = "no such bps job in batch queue"
455 else:
456 message = f"unknown problems deleting: {results}"
458 _LOG.debug("deleted: %s; message = %s", deleted, message)
459 return deleted, message
462class HTCondorWorkflow(BaseWmsWorkflow):
463 """Single HTCondor workflow.
465 Parameters
466 ----------
467 name : `str`
468 Unique name for Workflow used when naming files.
469 config : `lsst.ctrl.bps.BpsConfig`
470 BPS configuration that includes necessary submit/runtime information.
471 """
473 def __init__(self, name, config=None):
474 super().__init__(name, config)
475 self.dag = None
477 @classmethod
478 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
479 # Docstring inherited
480 htc_workflow = cls(generic_workflow.name, config)
481 htc_workflow.dag = HTCDag(name=generic_workflow.name)
483 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
484 htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
485 htc_workflow.dag.add_attribs(
486 {
487 "bps_wms_service": service_class,
488 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
489 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
490 "bps_job_summary": create_count_summary(generic_workflow.job_counts),
491 }
492 )
494 _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""})
495 if isinstance(tmp_template, str):
496 subdir_template = defaultdict(lambda: tmp_template)
497 else:
498 subdir_template = tmp_template
500 # Create all DAG jobs
501 site_values = {} # cache compute site specific values to reduce config lookups
502 for job_name in generic_workflow:
503 gwjob = generic_workflow.get_job(job_name)
504 if gwjob.compute_site not in site_values:
505 site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
506 htc_job = _create_job(
507 subdir_template[gwjob.label],
508 site_values[gwjob.compute_site],
509 generic_workflow,
510 gwjob,
511 out_prefix,
512 )
513 htc_workflow.dag.add_job(htc_job)
515 # Add job dependencies to the DAG
516 for job_name in generic_workflow:
517 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))
519 # If final job exists in generic workflow, create DAG final job
520 final = generic_workflow.get_final()
521 if final and isinstance(final, GenericWorkflowJob):
522 if final.compute_site and final.compute_site not in site_values:
523 site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
524 final_htjob = _create_job(
525 subdir_template[final.label],
526 site_values[final.compute_site],
527 generic_workflow,
528 final,
529 out_prefix,
530 )
531 if "post" not in final_htjob.dagcmds:
532 final_htjob.dagcmds[
533 "post"
534 ] = f"{os.path.dirname(__file__)}/final_post.sh {final.name} $DAG_STATUS $RETURN"
535 htc_workflow.dag.add_final_job(final_htjob)
536 elif final and isinstance(final, GenericWorkflow):
537 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
538 elif final:
539 return TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
541 return htc_workflow
543 def write(self, out_prefix):
544 """Output HTCondor DAGMan files needed for workflow submission.
546 Parameters
547 ----------
548 out_prefix : `str`
549 Directory prefix for HTCondor files.
550 """
551 self.submit_path = out_prefix
552 os.makedirs(out_prefix, exist_ok=True)
554 # Write down the workflow in HTCondor format.
555 self.dag.write(out_prefix, "jobs/{self.label}")
558def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix):
559 """Convert GenericWorkflow job nodes to DAG jobs.
561 Parameters
562 ----------
563 subdir_template : `str`
564 Template for making subdirs.
565 site_values : `dict`
566 Site specific values
567 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
568 Generic workflow that is being converted.
569 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
570 The generic job to convert to a HTCondor job.
571 out_prefix : `str`
572 Directory prefix for HTCondor files.
574 Returns
575 -------
576 htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
577 The HTCondor job equivalent to the given generic job.
578 """
579 htc_job = HTCJob(gwjob.name, label=gwjob.label)
581 curvals = defaultdict(str)
582 curvals["label"] = gwjob.label
583 if gwjob.tags:
584 curvals.update(gwjob.tags)
586 subdir = subdir_template.format_map(curvals)
587 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"
589 htc_job_cmds = {
590 "universe": "vanilla",
591 "should_transfer_files": "YES",
592 "when_to_transfer_output": "ON_EXIT_OR_EVICT",
593 "transfer_output_files": '""', # Set to empty string to disable
594 "transfer_executable": "False",
595 "getenv": "True",
596 # Exceeding memory sometimes triggering SIGBUS or SIGSEGV error. Tell
597 # htcondor to put on hold any jobs which exited by a signal.
598 "on_exit_hold": "ExitBySignal == true",
599 "on_exit_hold_reason": 'strcat("Job raised a signal ", string(ExitSignal), ". ", '
600 '"Handling signal as if job has gone over memory limit.")',
601 "on_exit_hold_subcode": "34",
602 }
604 htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob))
606 # job stdout, stderr, htcondor user log.
607 for key in ("output", "error", "log"):
608 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
609 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
611 htc_job_cmds.update(
612 _handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], out_prefix)
613 )
615 # Add the job cmds dict to the job object.
616 htc_job.add_job_cmds(htc_job_cmds)
618 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))
620 # Add job attributes to job.
621 _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
622 htc_job.add_job_attrs(gwjob.attrs)
623 htc_job.add_job_attrs(site_values["attrs"])
624 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
625 htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})
627 return htc_job
630def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
631 """Translate the job data that are one to one mapping
633 Parameters
634 ----------
635 cached_vals : `dict` [`str`, `Any`]
636 Config values common to jobs with same label.
637 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
638 Generic workflow that contains job to being converted.
639 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
640 Generic workflow job to be converted.
642 Returns
643 -------
644 htc_job_commands : `dict` [`str`, `Any`]
645 Contains commands which can appear in the HTCondor submit description
646 file.
647 """
648 # Values in the job script that just are name mappings.
649 job_translation = {
650 "mail_to": "notify_user",
651 "when_to_mail": "notification",
652 "request_cpus": "request_cpus",
653 "priority": "priority",
654 "category": "category",
655 "accounting_group": "accounting_group",
656 "accounting_user": "accounting_group_user",
657 }
659 jobcmds = {}
660 for gwkey, htckey in job_translation.items():
661 jobcmds[htckey] = getattr(gwjob, gwkey, None)
663 # If accounting info was not set explicitly, use site settings if any.
664 if not gwjob.accounting_group:
665 jobcmds["accounting_group"] = cached_vals.get("accountingGroup")
666 if not gwjob.accounting_user:
667 jobcmds["accounting_group_user"] = cached_vals.get("accountingUser")
669 # job commands that need modification
670 if gwjob.number_of_retries:
671 jobcmds["max_retries"] = f"{gwjob.number_of_retries}"
673 if gwjob.retry_unless_exit:
674 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"
676 if gwjob.request_disk:
677 jobcmds["request_disk"] = f"{gwjob.request_disk}MB"
679 if gwjob.request_memory:
680 jobcmds["request_memory"] = f"{gwjob.request_memory}"
682 if gwjob.memory_multiplier:
683 # Do not use try-except! At the moment, BpsConfig returns an empty
684 # string if it does not contain the key.
685 memory_limit = cached_vals["memoryLimit"]
686 if not memory_limit:
687 raise RuntimeError(
688 "Memory autoscaling enabled, but automatic detection of the memory limit "
689 "failed; setting it explicitly with 'memoryLimit' or changing worker node "
690 "search pattern 'executeMachinesPattern' might help."
691 )
693 # Set maximal amount of memory job can ask for.
694 #
695 # The check below assumes that 'memory_limit' was set to a value which
696 # realistically reflects actual physical limitations of a given compute
697 # resource.
698 memory_max = memory_limit
699 if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit:
700 memory_max = gwjob.request_memory_max
702 # Make job ask for more memory each time it failed due to insufficient
703 # memory requirements.
704 jobcmds["request_memory"] = _create_request_memory_expr(
705 gwjob.request_memory, gwjob.memory_multiplier, memory_max
706 )
708 # Periodically release jobs which are being held due to exceeding
709 # memory. Stop doing that (by removing the job from the HTCondor queue)
710 # after the maximal number of retries has been reached or the job was
711 # already run at maximal allowed memory.
712 jobcmds["periodic_release"] = _create_periodic_release_expr(
713 gwjob.request_memory, gwjob.memory_multiplier, memory_max
714 )
715 jobcmds["periodic_remove"] = _create_periodic_remove_expr(
716 gwjob.request_memory, gwjob.memory_multiplier, memory_max
717 )
719 # Assume concurrency_limit implemented using HTCondor concurrency limits.
720 # May need to move to special site-specific implementation if sites use
721 # other mechanisms.
722 if gwjob.concurrency_limit:
723 jobcmds["concurrency_limit"] = gwjob.concurrency_limit
725 # Handle command line
726 if gwjob.executable.transfer_executable:
727 jobcmds["transfer_executable"] = "True"
728 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
729 else:
730 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)
732 if gwjob.arguments:
733 arguments = gwjob.arguments
734 arguments = _replace_cmd_vars(arguments, gwjob)
735 arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob)
736 arguments = _fix_env_var_syntax(arguments)
737 jobcmds["arguments"] = arguments
739 # Add extra "pass-thru" job commands
740 if gwjob.profile:
741 for key, val in gwjob.profile.items():
742 jobcmds[key] = htc_escape(val)
743 for key, val in cached_vals["profile"].items():
744 jobcmds[key] = htc_escape(val)
746 return jobcmds
749def _translate_dag_cmds(gwjob):
750 """Translate job values into DAGMan commands.
752 Parameters
753 ----------
754 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
755 Job containing values to be translated.
757 Returns
758 -------
759 dagcmds : `dict` [`str`, `Any`]
760 DAGMan commands for the job.
761 """
762 # Values in the dag script that just are name mappings.
763 dag_translation = {"abort_on_value": "abort_dag_on", "abort_return_value": "abort_exit"}
765 dagcmds = {}
766 for gwkey, htckey in dag_translation.items():
767 dagcmds[htckey] = getattr(gwjob, gwkey, None)
769 # Still to be coded: vars "pre_cmdline", "post_cmdline"
770 return dagcmds
773def _fix_env_var_syntax(oldstr):
774 """Change ENV place holders to HTCondor Env var syntax.
776 Parameters
777 ----------
778 oldstr : `str`
779 String in which environment variable syntax is to be fixed.
781 Returns
782 -------
783 newstr : `str`
784 Given string with environment variable syntax fixed.
785 """
786 newstr = oldstr
787 for key in re.findall(r"<ENV:([^>]+)>", oldstr):
788 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
789 return newstr
792def _replace_file_vars(use_shared, arguments, workflow, gwjob):
793 """Replace file placeholders in command line arguments with correct
794 physical file names.
796 Parameters
797 ----------
798 use_shared : `bool`
799 Whether HTCondor can assume shared filesystem.
800 arguments : `str`
801 Arguments string in which to replace file placeholders.
802 workflow : `lsst.ctrl.bps.GenericWorkflow`
803 Generic workflow that contains file information.
804 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
805 The job corresponding to the arguments.
807 Returns
808 -------
809 arguments : `str`
810 Given arguments string with file placeholders replaced.
811 """
812 # Replace input file placeholders with paths.
813 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
814 if not gwfile.wms_transfer:
815 # Must assume full URI if in command line and told WMS is not
816 # responsible for transferring file.
817 uri = gwfile.src_uri
818 elif use_shared:
819 if gwfile.job_shared:
820 # Have shared filesystems and jobs can share file.
821 uri = gwfile.src_uri
822 else:
823 # Taking advantage of inside knowledge. Not future-proof.
824 # Temporary fix until have job wrapper that pulls files
825 # within job.
826 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml":
827 uri = "butler.yaml"
828 else:
829 uri = os.path.basename(gwfile.src_uri)
830 else: # Using push transfer
831 uri = os.path.basename(gwfile.src_uri)
832 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
834 # Replace output file placeholders with paths.
835 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
836 if not gwfile.wms_transfer:
837 # Must assume full URI if in command line and told WMS is not
838 # responsible for transferring file.
839 uri = gwfile.src_uri
840 elif use_shared:
841 if gwfile.job_shared:
842 # Have shared filesystems and jobs can share file.
843 uri = gwfile.src_uri
844 else:
845 uri = os.path.basename(gwfile.src_uri)
846 else: # Using push transfer
847 uri = os.path.basename(gwfile.src_uri)
848 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
849 return arguments
852def _replace_cmd_vars(arguments, gwjob):
853 """Replace format-style placeholders in arguments.
855 Parameters
856 ----------
857 arguments : `str`
858 Arguments string in which to replace placeholders.
859 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
860 Job containing values to be used to replace placeholders
861 (in particular gwjob.cmdvals).
863 Returns
864 -------
865 arguments : `str`
866 Given arguments string with placeholders replaced.
867 """
868 try:
869 arguments = arguments.format(**gwjob.cmdvals)
870 except (KeyError, TypeError): # TypeError in case None instead of {}
871 _LOG.error(
872 "Could not replace command variables:\narguments: %s\ncmdvals: %s", arguments, gwjob.cmdvals
873 )
874 raise
875 return arguments
878def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str):
879 """Add job input files from generic workflow to job.
881 Parameters
882 ----------
883 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
884 The generic workflow (e.g., has executable name and arguments).
885 job_name : `str`
886 Unique name for the job.
887 use_shared : `bool`
888 Whether job has access to files via shared filesystem.
889 out_prefix : `str`
890 The root directory into which all WMS-specific files are written.
892 Returns
893 -------
894 htc_commands : `dict` [`str`, `str`]
895 HTCondor commands for the job submission script.
896 """
897 htc_commands = {}
898 inputs = []
899 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
900 _LOG.debug("src_uri=%s", gwf_file.src_uri)
902 uri = Path(gwf_file.src_uri)
904 # Note if use_shared and job_shared, don't need to transfer file.
906 if not use_shared: # Copy file using push to job
907 inputs.append(str(uri.relative_to(out_prefix)))
908 elif not gwf_file.job_shared: # Jobs require own copy
909 # if using shared filesystem, but still need copy in job. Use
910 # HTCondor's curl plugin for a local copy.
912 # Execution butler is represented as a directory which the
913 # curl plugin does not handle. Taking advantage of inside
914 # knowledge for temporary fix until have job wrapper that pulls
915 # files within job.
916 if gwf_file.name == "butlerConfig":
917 # The execution butler directory doesn't normally exist until
918 # the submit phase so checking for suffix instead of using
919 # is_dir(). If other non-yaml file exists they would have a
920 # different gwf_file.name.
921 if uri.suffix == ".yaml": # Single file, so just copy.
922 inputs.append(f"file://{uri}")
923 else:
924 inputs.append(f"file://{uri / 'butler.yaml'}")
925 inputs.append(f"file://{uri / 'gen3.sqlite3'}")
926 elif uri.is_dir():
927 raise RuntimeError(
928 f"HTCondor plugin cannot transfer directories locally within job {gwf_file.src_uri}"
929 )
930 else:
931 inputs.append(f"file://{uri}")
933 if inputs:
934 htc_commands["transfer_input_files"] = ",".join(inputs)
935 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
936 return htc_commands
939def _report_from_path(wms_path):
940 """Gather run information from a given run directory.
942 Parameters
943 ----------
944 wms_path : `str`
945 The directory containing the submit side files (e.g., HTCondor files).
947 Returns
948 -------
949 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
950 Run information for the detailed report. The key is the HTCondor id
951 and the value is a collection of report information for that run.
952 message : `str`
953 Message to be printed with the summary report.
954 """
955 wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
956 if wms_workflow_id == MISSING_ID:
957 run_reports = {}
958 else:
959 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
960 return run_reports, message
963def _report_from_id(wms_workflow_id, hist, schedds=None):
964 """Gather run information using workflow id.
966 Parameters
967 ----------
968 wms_workflow_id : `str`
969 Limit to specific run based on id.
970 hist : `float`
971 Limit history search to this many days.
972 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
973 HTCondor schedulers which to query for job information. If None
974 (default), all queries will be run against the local scheduler only.
976 Returns
977 -------
978 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
979 Run information for the detailed report. The key is the HTCondor id
980 and the value is a collection of report information for that run.
981 message : `str`
982 Message to be printed with the summary report.
983 """
984 messages = []
986 # Collect information about the job by querying HTCondor schedd and
987 # HTCondor history.
988 schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds)
989 if len(schedd_dag_info) == 1:
990 # Extract the DAG info without altering the results of the query.
991 schedd_name = next(iter(schedd_dag_info))
992 dag_id = next(iter(schedd_dag_info[schedd_name]))
993 dag_ad = schedd_dag_info[schedd_name][dag_id]
995 # If the provided workflow id does not correspond to the one extracted
996 # from the DAGMan log file in the submit directory, rerun the query
997 # with the id found in the file.
998 #
999 # This is to cover the situation in which the user provided the old job
1000 # id of a restarted run.
1001 try:
1002 path_dag_id, path_dag_ad = read_dag_log(dag_ad["Iwd"])
1003 except FileNotFoundError as exc:
1004 # At the moment missing DAGMan log is pretty much a fatal error.
1005 # So empty the DAG info to finish early (see the if statement
1006 # below).
1007 schedd_dag_info.clean()
1008 messages.append(f"Cannot create the report for '{dag_id}': {exc}")
1009 else:
1010 if path_dag_id != dag_id:
1011 schedd_dag_info = _get_info_from_schedd(path_dag_id, hist, schedds)
1012 messages.append(
1013 f"WARNING: Found newer workflow executions in same submit directory as id '{dag_id}'. "
1014 "This normally occurs when a run is restarted. The report shown is for the most "
1015 f"recent status with run id '{path_dag_id}'"
1016 )
1018 if len(schedd_dag_info) == 0:
1019 run_reports = {}
1020 elif len(schedd_dag_info) == 1:
1021 _, dag_info = schedd_dag_info.popitem()
1022 dag_id, dag_ad = dag_info.popitem()
1024 # Create a mapping between jobs and their classads. The keys will
1025 # be of format 'ClusterId.ProcId'.
1026 job_info = {dag_id: dag_ad}
1028 # Find jobs (nodes) belonging to that DAGMan job.
1029 job_constraint = f"DAGManJobId == {int(float(dag_id))}"
1030 schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds)
1031 if schedd_job_info:
1032 _, node_info = schedd_job_info.popitem()
1033 job_info.update(node_info)
1035 # Collect additional pieces of information about jobs using HTCondor
1036 # files in the submission directory.
1037 _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"])
1038 _update_jobs(job_info, path_jobs)
1039 if message:
1040 messages.append(message)
1041 run_reports = _create_detailed_report_from_jobs(dag_id, job_info)
1042 else:
1043 ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()]
1044 message = (
1045 f"More than one job matches id '{wms_workflow_id}', "
1046 f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids"
1047 )
1048 messages.append(message)
1049 run_reports = {}
1051 message = "\n".join(messages)
1052 return run_reports, message
1055def _get_info_from_schedd(wms_workflow_id, hist, schedds):
1056 """Gather run information from HTCondor.
1058 Parameters
1059 ----------
1060 wms_workflow_id : `str`
1061 Limit to specific run based on id.
1062 hist :
1063 Limit history search to this many days.
1064 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
1065 HTCondor schedulers which to query for job information. If None
1066 (default), all queries will be run against the local scheduler only.
1068 Returns
1069 -------
1070 schedd_dag_info : `dict` [`str`, `dict` [`str`, `dict` [`str` Any]]]
1071 Information about jobs satisfying the search criteria where for each
1072 Scheduler, local HTCondor job ids are mapped to their respective
1073 classads.
1074 """
1075 dag_constraint = 'regexp("dagman$", Cmd)'
1076 try:
1077 cluster_id = int(float(wms_workflow_id))
1078 except ValueError:
1079 dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"'
1080 else:
1081 dag_constraint += f" && ClusterId == {cluster_id}"
1083 # With the current implementation of the condor_* functions the query
1084 # will always return only one match per Scheduler.
1085 #
1086 # Even in the highly unlikely situation where HTCondor history (which
1087 # condor_search queries too) is long enough to have jobs from before
1088 # the cluster ids were rolled over (and as a result there is more then
1089 # one job with the same cluster id) they will not show up in
1090 # the results.
1091 schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds)
1092 return schedd_dag_info
1095def _get_info_from_path(wms_path):
1096 """Gather run information from a given run directory.
1098 Parameters
1099 ----------
1100 wms_path : `str`
1101 Directory containing HTCondor files.
1103 Returns
1104 -------
1105 wms_workflow_id : `str`
1106 The run id which is a DAGman job id.
1107 jobs : `dict` [`str`, `dict` [`str`, `Any`]]
1108 Information about jobs read from files in the given directory.
1109 The key is the HTCondor id and the value is a dictionary of HTCondor
1110 keys and values.
1111 message : `str`
1112 Message to be printed with the summary report.
1113 """
1114 messages = []
1115 try:
1116 wms_workflow_id, jobs = read_dag_log(wms_path)
1117 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
1118 _update_jobs(jobs, read_node_status(wms_path))
1119 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)
1121 # Add more info for DAGman job
1122 job = jobs[wms_workflow_id]
1123 job.update(read_dag_status(wms_path))
1125 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
1126 if "bps_run" not in job:
1127 _add_run_info(wms_path, job)
1129 message = htc_check_dagman_output(wms_path)
1130 if message:
1131 messages.append(message)
1132 _LOG.debug(
1133 "_get_info: id = %s, total_jobs = %s", wms_workflow_id, jobs[wms_workflow_id]["total_jobs"]
1134 )
1136 # Add extra pieces of information which cannot be found in HTCondor
1137 # generated files like 'GlobalJobId'.
1138 #
1139 # Do not treat absence of this file as a serious error. Neither runs
1140 # submitted with earlier versions of the plugin nor the runs submitted
1141 # with Pegasus plugin will have it at the moment. However, once enough
1142 # time passes and Pegasus plugin will have its own report() method
1143 # (instead of sneakily using HTCondor's one), the lack of that file
1144 # should be treated as seriously as lack of any other file.
1145 try:
1146 job_info = read_dag_info(wms_path)
1147 except FileNotFoundError as exc:
1148 message = f"Warn: Some information may not be available: {exc}"
1149 messages.append(message)
1150 else:
1151 schedd_name = next(iter(job_info))
1152 job_ad = next(iter(job_info[schedd_name].values()))
1153 job.update(job_ad)
1154 except FileNotFoundError:
1155 message = f"Could not find HTCondor files in '{wms_path}'"
1156 _LOG.warning(message)
1157 messages.append(message)
1158 wms_workflow_id = MISSING_ID
1159 jobs = {}
1161 message = "\n".join([msg for msg in messages if msg])
1162 return wms_workflow_id, jobs, message
1165def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
1166 """Gather run information to be used in generating summary reports.
1168 Parameters
1169 ----------
1170 wms_workflow_id : `str`
1171 The run id to create the report for.
1172 jobs : `dict` [`str`, `dict` [`str`, Any]]
1173 Mapping HTCondor job id to job information.
1175 Returns
1176 -------
1177 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
1178 Run information for the detailed report. The key is the given HTCondor
1179 id and the value is a collection of report information for that run.
1180 """
1181 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
1182 dag_job = jobs[wms_workflow_id]
1183 report = WmsRunReport(
1184 wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}",
1185 global_wms_id=dag_job.get("GlobalJobId", "MISS"),
1186 path=dag_job["Iwd"],
1187 label=dag_job.get("bps_job_label", "MISS"),
1188 run=dag_job.get("bps_run", "MISS"),
1189 project=dag_job.get("bps_project", "MISS"),
1190 campaign=dag_job.get("bps_campaign", "MISS"),
1191 payload=dag_job.get("bps_payload", "MISS"),
1192 operator=_get_owner(dag_job),
1193 run_summary=_get_run_summary(dag_job),
1194 state=_htc_status_to_wms_state(dag_job),
1195 jobs=[],
1196 total_number_jobs=dag_job["total_jobs"],
1197 job_state_counts=dag_job["state_counts"],
1198 )
1200 for job_id, job_info in jobs.items():
1201 try:
1202 if job_info["ClusterId"] != int(float(wms_workflow_id)):
1203 job_report = WmsJobReport(
1204 wms_id=job_id,
1205 name=job_info.get("DAGNodeName", job_id),
1206 label=job_info.get("bps_job_label", pegasus_name_to_label(job_info["DAGNodeName"])),
1207 state=_htc_status_to_wms_state(job_info),
1208 )
1209 if job_report.label == "init":
1210 job_report.label = "pipetaskInit"
1211 report.jobs.append(job_report)
1212 except KeyError as ex:
1213 _LOG.error("Job missing key '%s': %s", str(ex), job_info)
1214 raise
1216 run_reports = {report.wms_id: report}
1217 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
1218 return run_reports
1221def _summary_report(user, hist, pass_thru, schedds=None):
1222 """Gather run information to be used in generating summary reports.
1224 Parameters
1225 ----------
1226 user : `str`
1227 Run lookup restricted to given user.
1228 hist : `float`
1229 How many previous days to search for run information.
1230 pass_thru : `str`
1231 Advanced users can define the HTCondor constraint to be used
1232 when searching queue and history.
1234 Returns
1235 -------
1236 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
1237 Run information for the summary report. The keys are HTCondor ids and
1238 the values are collections of report information for each run.
1239 message : `str`
1240 Message to be printed with the summary report.
1241 """
1242 # only doing summary report so only look for dagman jobs
1243 if pass_thru:
1244 constraint = pass_thru
1245 else:
1246 # Notes:
1247 # * bps_isjob == 'True' isn't getting set for DAG jobs that are
1248 # manually restarted.
1249 # * Any job with DAGManJobID isn't a DAG job
1250 constraint = 'bps_isjob == "True" && JobUniverse == 7'
1251 if user:
1252 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'
1254 job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds)
1256 # Have list of DAGMan jobs, need to get run_report info.
1257 run_reports = {}
1258 for jobs in job_info.values():
1259 for job_id, job in jobs.items():
1260 total_jobs, state_counts = _get_state_counts_from_dag_job(job)
1261 # If didn't get from queue information (e.g., Kerberos bug),
1262 # try reading from file.
1263 if total_jobs == 0:
1264 try:
1265 job.update(read_dag_status(job["Iwd"]))
1266 total_jobs, state_counts = _get_state_counts_from_dag_job(job)
1267 except StopIteration:
1268 pass # don't kill report can't find htcondor files
1270 if "bps_run" not in job:
1271 _add_run_info(job["Iwd"], job)
1272 report = WmsRunReport(
1273 wms_id=job_id,
1274 global_wms_id=job["GlobalJobId"],
1275 path=job["Iwd"],
1276 label=job.get("bps_job_label", "MISS"),
1277 run=job.get("bps_run", "MISS"),
1278 project=job.get("bps_project", "MISS"),
1279 campaign=job.get("bps_campaign", "MISS"),
1280 payload=job.get("bps_payload", "MISS"),
1281 operator=_get_owner(job),
1282 run_summary=_get_run_summary(job),
1283 state=_htc_status_to_wms_state(job),
1284 jobs=[],
1285 total_number_jobs=total_jobs,
1286 job_state_counts=state_counts,
1287 )
1288 run_reports[report.global_wms_id] = report
1290 return run_reports, ""
1293def _add_run_info(wms_path, job):
1294 """Find BPS run information elsewhere for runs without bps attributes.
1296 Parameters
1297 ----------
1298 wms_path : `str`
1299 Path to submit files for the run.
1300 job : `dict` [`str`, `Any`]
1301 HTCondor dag job information.
1303 Raises
1304 ------
1305 StopIteration
1306 If cannot find file it is looking for. Permission errors are
1307 caught and job's run is marked with error.
1308 """
1309 path = Path(wms_path) / "jobs"
1310 try:
1311 subfile = next(path.glob("**/*.sub"))
1312 except (StopIteration, PermissionError):
1313 job["bps_run"] = "Unavailable"
1314 else:
1315 _LOG.debug("_add_run_info: subfile = %s", subfile)
1316 try:
1317 with open(subfile, encoding="utf-8") as fh:
1318 for line in fh:
1319 if line.startswith("+bps_"):
1320 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
1321 if m:
1322 _LOG.debug("Matching line: %s", line)
1323 job[m.group(1)] = m.group(2).replace('"', "")
1324 else:
1325 _LOG.debug("Could not parse attribute: %s", line)
1326 except PermissionError:
1327 job["bps_run"] = "PermissionError"
1328 _LOG.debug("After adding job = %s", job)
1331def _get_owner(job):
1332 """Get the owner of a dag job.
1334 Parameters
1335 ----------
1336 job : `dict` [`str`, `Any`]
1337 HTCondor dag job information.
1339 Returns
1340 -------
1341 owner : `str`
1342 Owner of the dag job.
1343 """
1344 owner = job.get("bps_operator", None)
1345 if not owner:
1346 owner = job.get("Owner", None)
1347 if not owner:
1348 _LOG.warning("Could not get Owner from htcondor job: %s", job)
1349 owner = "MISS"
1350 return owner
1353def _get_run_summary(job):
1354 """Get the run summary for a job.
1356 Parameters
1357 ----------
1358 job : `dict` [`str`, `Any`]
1359 HTCondor dag job information.
1361 Returns
1362 -------
1363 summary : `str`
1364 Number of jobs per PipelineTask label in approximate pipeline order.
1365 Format: <label>:<count>[;<label>:<count>]+
1366 """
1367 summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
1368 if not summary:
1369 summary, _ = summary_from_dag(job["Iwd"])
1370 if not summary:
1371 _LOG.warning("Could not get run summary for htcondor job: %s", job)
1372 _LOG.debug("_get_run_summary: summary=%s", summary)
1374 # Workaround sometimes using init vs pipetaskInit
1375 summary = summary.replace("init:", "pipetaskInit:")
1377 if "pegasus_version" in job and "pegasus" not in summary:
1378 summary += ";pegasus:0"
1380 return summary
1383def _get_state_counts_from_jobs(wms_workflow_id, jobs):
1384 """Count number of jobs per WMS state.
1386 Parameters
1387 ----------
1388 wms_workflow_id : `str`
1389 HTCondor job id.
1390 jobs : `dict` [`str`, `Any`]
1391 HTCondor dag job information.
1393 Returns
1394 -------
1395 total_count : `int`
1396 Total number of dag nodes.
1397 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
1398 Keys are the different WMS states and values are counts of jobs
1399 that are in that WMS state.
1400 """
1401 state_counts = dict.fromkeys(WmsStates, 0)
1403 for jid, jinfo in jobs.items():
1404 if jid != wms_workflow_id:
1405 state_counts[_htc_status_to_wms_state(jinfo)] += 1
1407 total_counted = sum(state_counts.values())
1408 if "NodesTotal" in jobs[wms_workflow_id]:
1409 total_count = jobs[wms_workflow_id]["NodesTotal"]
1410 else:
1411 total_count = total_counted
1413 state_counts[WmsStates.UNREADY] += total_count - total_counted
1415 return total_count, state_counts
1418def _get_state_counts_from_dag_job(job):
1419 """Count number of jobs per WMS state.
1421 Parameters
1422 ----------
1423 job : `dict` [`str`, `Any`]
1424 HTCondor dag job information.
1426 Returns
1427 -------
1428 total_count : `int`
1429 Total number of dag nodes.
1430 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
1431 Keys are the different WMS states and values are counts of jobs
1432 that are in that WMS state.
1433 """
1434 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
1435 state_counts = dict.fromkeys(WmsStates, 0)
1436 if "DAG_NodesReady" in job:
1437 state_counts = {
1438 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
1439 WmsStates.READY: job.get("DAG_NodesReady", 0),
1440 WmsStates.HELD: job.get("JobProcsHeld", 0),
1441 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
1442 WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
1443 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0),
1444 }
1445 total_jobs = job.get("DAG_NodesTotal")
1446 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
1447 elif "NodesFailed" in job:
1448 state_counts = {
1449 WmsStates.UNREADY: job.get("NodesUnready", 0),
1450 WmsStates.READY: job.get("NodesReady", 0),
1451 WmsStates.HELD: job.get("JobProcsHeld", 0),
1452 WmsStates.SUCCEEDED: job.get("NodesDone", 0),
1453 WmsStates.FAILED: job.get("NodesFailed", 0),
1454 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0),
1455 }
1456 try:
1457 total_jobs = job.get("NodesTotal")
1458 except KeyError as ex:
1459 _LOG.error("Job missing %s. job = %s", str(ex), job)
1460 raise
1461 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
1462 else:
1463 # With Kerberos job auth and Kerberos bug, if warning would be printed
1464 # for every DAG.
1465 _LOG.debug("Can't get job state counts %s", job["Iwd"])
1466 total_jobs = 0
1468 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
1469 return total_jobs, state_counts
1472def _htc_status_to_wms_state(job):
1473 """Convert HTCondor job status to generic wms state.
1475 Parameters
1476 ----------
1477 job : `dict` [`str`, `Any`]
1478 HTCondor job information.
1480 Returns
1481 -------
1482 wms_state : `WmsStates`
1483 The equivalent WmsState to given job's status.
1484 """
1485 wms_state = WmsStates.MISFIT
1486 if "JobStatus" in job:
1487 wms_state = _htc_job_status_to_wms_state(job)
1488 elif "NodeStatus" in job:
1489 wms_state = _htc_node_status_to_wms_state(job)
1490 return wms_state
1493def _htc_job_status_to_wms_state(job):
1494 """Convert HTCondor job status to generic wms state.
1496 Parameters
1497 ----------
1498 job : `dict` [`str`, `Any`]
1499 HTCondor job information.
1501 Returns
1502 -------
1503 wms_state : `lsst.ctrl.bps.WmsStates`
1504 The equivalent WmsState to given job's status.
1505 """
1506 _LOG.debug(
1507 "htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], type(job["JobStatus"])
1508 )
1509 job_status = int(job["JobStatus"])
1510 wms_state = WmsStates.MISFIT
1512 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
1513 if job_status == JobStatus.IDLE:
1514 wms_state = WmsStates.PENDING
1515 elif job_status == JobStatus.RUNNING:
1516 wms_state = WmsStates.RUNNING
1517 elif job_status == JobStatus.REMOVED:
1518 wms_state = WmsStates.DELETED
1519 elif job_status == JobStatus.COMPLETED:
1520 if (
1521 job.get("ExitBySignal", False)
1522 or job.get("ExitCode", 0)
1523 or job.get("ExitSignal", 0)
1524 or job.get("DAG_Status", 0)
1525 or job.get("ReturnValue", 0)
1526 ):
1527 wms_state = WmsStates.FAILED
1528 else:
1529 wms_state = WmsStates.SUCCEEDED
1530 elif job_status == JobStatus.HELD:
1531 wms_state = WmsStates.HELD
1533 return wms_state
1536def _htc_node_status_to_wms_state(job):
1537 """Convert HTCondor status to generic wms state.
1539 Parameters
1540 ----------
1541 job : `dict` [`str`, `Any`]
1542 HTCondor job information.
1544 Returns
1545 -------
1546 wms_state : `lsst.ctrl.bps.WmsStates`
1547 The equivalent WmsState to given node's status.
1548 """
1549 wms_state = WmsStates.MISFIT
1551 status = job["NodeStatus"]
1552 if status == NodeStatus.NOT_READY:
1553 wms_state = WmsStates.UNREADY
1554 elif status == NodeStatus.READY:
1555 wms_state = WmsStates.READY
1556 elif status == NodeStatus.PRERUN:
1557 wms_state = WmsStates.MISFIT
1558 elif status == NodeStatus.SUBMITTED:
1559 if job["JobProcsHeld"]:
1560 wms_state = WmsStates.HELD
1561 elif job["StatusDetails"] == "not_idle":
1562 wms_state = WmsStates.RUNNING
1563 elif job["JobProcsQueued"]:
1564 wms_state = WmsStates.PENDING
1565 elif status == NodeStatus.POSTRUN:
1566 wms_state = WmsStates.MISFIT
1567 elif status == NodeStatus.DONE:
1568 wms_state = WmsStates.SUCCEEDED
1569 elif status == NodeStatus.ERROR:
1570 # Use job exist instead of post script exit
1571 if "DAGMAN error 0" in job["StatusDetails"]:
1572 wms_state = WmsStates.SUCCEEDED
1573 else:
1574 wms_state = WmsStates.FAILED
1576 return wms_state
1579def _update_jobs(jobs1, jobs2):
1580 """Update jobs1 with info in jobs2.
1582 (Basically an update for nested dictionaries.)
1584 Parameters
1585 ----------
1586 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
1587 HTCondor job information to be updated.
1588 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
1589 Additional HTCondor job information.
1590 """
1591 for jid, jinfo in jobs2.items():
1592 if jid in jobs1:
1593 jobs1[jid].update(jinfo)
1594 else:
1595 jobs1[jid] = jinfo
1598def _wms_id_type(wms_id):
1599 """Determine the type of the WMS id.
1601 Parameters
1602 ----------
1603 wms_id : `str`
1604 WMS id identifying a job.
1606 Returns
1607 -------
1608 id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
1609 Type of WMS id.
1610 """
1611 try:
1612 int(float(wms_id))
1613 except ValueError:
1614 wms_path = Path(wms_id)
1615 if wms_path.exists():
1616 id_type = WmsIdType.PATH
1617 else:
1618 id_type = WmsIdType.GLOBAL
1619 except TypeError:
1620 id_type = WmsIdType.UNKNOWN
1621 else:
1622 id_type = WmsIdType.LOCAL
1623 return id_type
1626def _wms_id_to_cluster(wms_id):
1627 """Convert WMS id to cluster id.
1629 Parameters
1630 ----------
1631 wms_id : `int` or `float` or `str`
1632 HTCondor job id or path.
1634 Returns
1635 -------
1636 schedd_ad : `classad.ClassAd`
1637 ClassAd describing the scheduler managing the job with the given id.
1638 cluster_id : `int`
1639 HTCondor cluster id.
1640 id_type : `lsst.ctrl.bps.wms.htcondor.IdType`
1641 The type of the provided id.
1642 """
1643 coll = htcondor.Collector()
1645 schedd_ad = None
1646 cluster_id = None
1647 id_type = _wms_id_type(wms_id)
1648 if id_type == WmsIdType.LOCAL:
1649 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
1650 cluster_id = int(float(wms_id))
1651 elif id_type == WmsIdType.GLOBAL:
1652 constraint = f'GlobalJobId == "{wms_id}"'
1653 schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)}
1654 schedds = {name: htcondor.Schedd(ad) for name, ad in schedd_ads.items()}
1655 job_info = condor_q(constraint=constraint, schedds=schedds)
1656 if job_info:
1657 schedd_name, job_rec = job_info.popitem()
1658 job_id, _ = job_rec.popitem()
1659 schedd_ad = schedd_ads[schedd_name]
1660 cluster_id = int(float(job_id))
1661 elif id_type == WmsIdType.PATH:
1662 try:
1663 job_info = read_dag_info(wms_id)
1664 except (FileNotFoundError, PermissionError, OSError):
1665 pass
1666 else:
1667 schedd_name, job_rec = job_info.popitem()
1668 job_id, _ = job_rec.popitem()
1669 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name)
1670 cluster_id = int(float(job_id))
1671 else:
1672 pass
1673 return schedd_ad, cluster_id, id_type
1676def _create_periodic_release_expr(memory, multiplier, limit):
1677 """Construct an HTCondorAd expression for releasing held jobs.
1679 The expression instruct HTCondor to release any job which was put on hold
1680 due to exceeding memory requirements back to the job queue providing it
1681 satisfies all of the conditions below:
1683 * number of run attempts did not reach allowable number of retries,
1684 * the memory requirements in the last failed run attempt did not reach
1685 the specified memory limit.
1687 Parameters
1688 ----------
1689 memory : `int`
1690 Requested memory in MB.
1691 multiplier : `float`
1692 Memory growth rate between retires.
1693 limit : `int`
1694 Memory limit.
1696 Returns
1697 -------
1698 expr : `str`
1699 A string representing an HTCondor ClassAd expression for releasing jobs
1700 which have been held due to exceeding the memory requirements.
1701 """
1702 is_retry_allowed = "NumJobStarts <= JobMaxRetries"
1703 was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
1705 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
1706 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
1707 # The special comparison operators ensure that all comparisons below will
1708 # evaluate to FALSE in this case.
1709 #
1710 # Note:
1711 # May not be strictly necessary. Operators '&&' and '||' are not strict so
1712 # the entire expression should evaluate to FALSE when the job is not HELD.
1713 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
1714 # but better safe than sorry.
1715 was_mem_exceeded = (
1716 "JobStatus == 5 "
1717 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
1718 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
1719 )
1721 expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}"
1722 return expr
1725def _create_periodic_remove_expr(memory, multiplier, limit):
1726 """Construct an HTCondorAd expression for removing jobs from the queue.
1728 The expression instruct HTCondor to remove any job which was put on hold
1729 due to exceeding memory requirements from the job queue providing it
1730 satisfies any of the conditions below:
1732 * allowable number of retries was reached,
1733 * the memory requirements during the last failed run attempt reached
1734 the specified memory limit.
1736 Parameters
1737 ----------
1738 memory : `int`
1739 Requested memory in MB.
1740 multiplier : `float`
1741 Memory growth rate between retires.
1742 limit : `int`
1743 Memory limit.
1745 Returns
1746 -------
1747 expr : `str`
1748 A string representing an HTCondor ClassAd expression for removing jobs
1749 which were run at the maximal allowable memory and still exceeded
1750 the memory requirements.
1751 """
1752 is_retry_disallowed = "NumJobStarts > JobMaxRetries"
1753 was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
1755 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
1756 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
1757 # The special comparison operators ensure that all comparisons below will
1758 # evaluate to FALSE in this case.
1759 #
1760 # Note:
1761 # May not be strictly necessary. Operators '&&' and '||' are not strict so
1762 # the entire expression should evaluate to FALSE when the job is not HELD.
1763 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
1764 # but better safe than sorry.
1765 was_mem_exceeded = (
1766 "JobStatus == 5 "
1767 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
1768 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
1769 )
1771 expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})"
1772 return expr
1775def _create_request_memory_expr(memory, multiplier, limit):
1776 """Construct an HTCondor ClassAd expression for safe memory scaling.
1778 Parameters
1779 ----------
1780 memory : `int`
1781 Requested memory in MB.
1782 multiplier : `float`
1783 Memory growth rate between retires.
1784 limit : `int`
1785 Memory limit.
1787 Returns
1788 -------
1789 expr : `str`
1790 A string representing an HTCondor ClassAd expression enabling safe
1791 memory scaling between job retries.
1792 """
1793 # The check if the job was held due to exceeding memory requirements
1794 # will be made *after* job was released back to the job queue (is in
1795 # the IDLE state), hence the need to use `Last*` job ClassAds instead of
1796 # the ones describing job's current state.
1797 #
1798 # Also, 'Last*' job ClassAds attributes are UNDEFINED when a job is
1799 # initially put in the job queue. The special comparison operators ensure
1800 # that all comparisons below will evaluate to FALSE in this case.
1801 was_mem_exceeded = (
1802 "LastJobStatus =?= 5 "
1803 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 "
1804 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"
1805 )
1807 # If job runs the first time or was held for reasons other than exceeding
1808 # the memory, set the required memory to the requested value or use
1809 # the memory value measured by HTCondor (MemoryUsage) depending on
1810 # whichever is greater.
1811 expr = (
1812 f"({was_mem_exceeded}) "
1813 f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) "
1814 f": max({{{memory}, MemoryUsage ?: 0}})"
1815 )
1816 return expr
1819def _locate_schedds(locate_all=False):
1820 """Find out Scheduler daemons in an HTCondor pool.
1822 Parameters
1823 ----------
1824 locate_all : `bool`, optional
1825 If True, all available schedulers in the HTCondor pool will be located.
1826 False by default which means that the search will be limited to looking
1827 for the Scheduler running on a local host.
1829 Returns
1830 -------
1831 schedds : `dict` [`str`, `htcondor.Schedd`]
1832 A mapping between Scheduler names and Python objects allowing for
1833 interacting with them.
1834 """
1835 coll = htcondor.Collector()
1837 schedd_ads = []
1838 if locate_all:
1839 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
1840 else:
1841 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
1842 return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}
1845def _gather_site_values(config, compute_site):
1846 """Gather values specific to given site.
1848 Parameters
1849 ----------
1850 config : `lsst.ctrl.bps.BpsConfig`
1851 BPS configuration that includes necessary submit/runtime
1852 information.
1853 compute_site : `str`
1854 Compute site name.
1856 Returns
1857 -------
1858 site_values : `dict` [`str`, `Any`]
1859 Values specific to the given site.
1860 """
1861 site_values = {"attrs": {}, "profile": {}}
1862 search_opts = {}
1863 if compute_site:
1864 search_opts["curvals"] = {"curr_site": compute_site}
1866 # Determine the hard limit for the memory requirement.
1867 found, limit = config.search("memoryLimit", opt=search_opts)
1868 if not found:
1869 search_opts["default"] = DEFAULT_HTC_EXEC_PATT
1870 _, patt = config.search("executeMachinesPattern", opt=search_opts)
1871 del search_opts["default"]
1873 # To reduce the amount of data, ignore dynamic slots (if any) as,
1874 # by definition, they cannot have more memory than
1875 # the partitionable slot they are the part of.
1876 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)'
1877 pool_info = condor_status(constraint=constraint)
1878 try:
1879 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values())
1880 except ValueError:
1881 _LOG.debug("No execute machine in the pool matches %s", patt)
1882 if limit:
1883 config[".bps_defined.memory_limit"] = limit
1885 _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False})
1886 site_values["memoryLimit"] = limit
1888 found, value = config.search("accountingGroup", opt=search_opts)
1889 if found:
1890 site_values["accountingGroup"] = value
1891 found, value = config.search("accountingUser", opt=search_opts)
1892 if found:
1893 site_values["accountingUser"] = value
1895 key = f".site.{compute_site}.profile.condor"
1896 if key in config:
1897 for key, val in config[key].items():
1898 if key.startswith("+"):
1899 site_values["attrs"][key[1:]] = val
1900 else:
1901 site_values["profile"][key] = val
1903 return site_values