Coverage for python/lsst/ctrl/bps/htcondor/htcondor_service.py: 7%
721 statements
coverage.py v7.3.2, created at 2023-11-16 11:15 +0000
1# This file is part of ctrl_bps_htcondor.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <https://www.gnu.org/licenses/>.
28"""Interface between generic workflow to HTCondor workflow system.
29"""
31__all__ = ["HTCondorService", "HTCondorWorkflow"]
34import logging
35import os
36import re
37from collections import defaultdict
38from enum import IntEnum, auto
39from pathlib import Path
41import htcondor
42from lsst.ctrl.bps import (
43 BaseWmsService,
44 BaseWmsWorkflow,
45 GenericWorkflow,
46 GenericWorkflowJob,
47 WmsJobReport,
48 WmsRunReport,
49 WmsStates,
50)
51from lsst.ctrl.bps.bps_utils import chdir, create_count_summary
52from lsst.utils.timer import time_this
53from packaging import version
55from .lssthtc import (
56 MISSING_ID,
57 HTCDag,
58 HTCJob,
59 JobStatus,
60 NodeStatus,
61 condor_history,
62 condor_q,
63 condor_search,
64 condor_status,
65 htc_backup_files,
66 htc_check_dagman_output,
67 htc_create_submit_from_cmd,
68 htc_create_submit_from_dag,
69 htc_create_submit_from_file,
70 htc_escape,
71 htc_submit_dag,
72 htc_version,
73 pegasus_name_to_label,
74 read_dag_info,
75 read_dag_log,
76 read_dag_status,
77 read_node_status,
78 summary_from_dag,
79 write_dag_info,
80)
83class WmsIdType(IntEnum):
84 """Type of valid WMS ids."""
86 UNKNOWN = auto()
87 """The type of id cannot be determined.
88 """
90 LOCAL = auto()
91 """The id is HTCondor job's ClusterId (with optional '.ProcId').
92 """
94 GLOBAL = auto()
95 """Id is a HTCondor's global job id.
96 """
98 PATH = auto()
99 """Id is a submission path.
100 """
103DEFAULT_HTC_EXEC_PATT = ".*worker.*"
104"""Default pattern for searching execute machines in an HTCondor pool.
105"""
107_LOG = logging.getLogger(__name__)
110class HTCondorService(BaseWmsService):
111 """HTCondor version of WMS service."""
113 def prepare(self, config, generic_workflow, out_prefix=None):
114 """Convert generic workflow to an HTCondor DAG ready for submission.
116 Parameters
117 ----------
118 config : `lsst.ctrl.bps.BpsConfig`
119 BPS configuration that includes necessary submit/runtime
120 information.
121 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
122 The generic workflow (e.g., has executable name and arguments).
123 out_prefix : `str`
124 The root directory into which all WMS-specific files are written.
126 Returns
127 -------
128 workflow : `lsst.ctrl.bps.htcondor.HTCondorWorkflow`
129 HTCondor workflow ready to be run.
130 """
131 _LOG.debug("out_prefix = '%s'", out_prefix)
132 with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed HTCondor workflow creation"):
133 workflow = HTCondorWorkflow.from_generic_workflow(
134 config,
135 generic_workflow,
136 out_prefix,
137 f"{self.__class__.__module__}.{self.__class__.__name__}",
138 )
140 with time_this(
141 log=_LOG, level=logging.INFO, prefix=None, msg="Completed writing out HTCondor workflow"
142 ):
143 workflow.write(out_prefix)
144 return workflow
146 def submit(self, workflow):
147 """Submit a single HTCondor workflow.
149 Parameters
150 ----------
151 workflow : `lsst.ctrl.bps.BaseWorkflow`
152 A single HTCondor workflow to submit. run_id is updated after
153 successful submission to WMS.
154 """
155 dag = workflow.dag
157 ver = version.parse(htc_version())
158 if ver >= version.parse("8.9.3"):
159 sub = htc_create_submit_from_dag(dag.graph["dag_filename"], {})
160 else:
161 sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], {})
163 # For workflow portability, internal paths are all relative. Hence
164 # the DAG needs to be submitted to HTCondor from inside the submit
165 # directory.
166 with chdir(workflow.submit_path):
167 _LOG.info("Submitting from directory: %s", os.getcwd())
168 schedd_dag_info = htc_submit_dag(sub)
169 if schedd_dag_info:
170 write_dag_info(f"{dag.name}.info.json", schedd_dag_info)
172 _, dag_info = schedd_dag_info.popitem()
173 _, dag_ad = dag_info.popitem()
175 dag.run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}"
176 workflow.run_id = dag.run_id
177 else:
178 raise RuntimeError("Submission failed: unable to retrieve DAGMan job information")
180 def restart(self, wms_workflow_id):
181 """Restart a failed DAGMan workflow.
183 Parameters
184 ----------
185 wms_workflow_id : `str`
186 The directory with HTCondor files.
188 Returns
189 -------
190 run_id : `str`
191 HTCondor id of the restarted DAGMan job. If restart failed, it will
192 be set to None.
193 run_name : `str`
194 Name of the restarted workflow. If restart failed, it will be set
195 to None.
196 message : `str`
197 A message describing any issues encountered during the restart.
198 If there were no issues, an empty string is returned.
199 """
200 wms_path, id_type = _wms_id_to_dir(wms_workflow_id)
201 if wms_path is None:
202 return (
203 None,
204 None,
205 (
206 f"workflow with run id '{wms_workflow_id}' not found. "
207 f"Hint: use run's submit directory as the id instead"
208 ),
209 )
211 if id_type in {WmsIdType.GLOBAL, WmsIdType.LOCAL}:
212 if not wms_path.is_dir():
213 return None, None, f"submit directory '{wms_path}' for run id '{wms_workflow_id}' not found."
215 _LOG.info("Restarting workflow from directory '%s'", wms_path)
216 rescue_dags = list(wms_path.glob("*.dag.rescue*"))
217 if not rescue_dags:
218 return None, None, f"HTCondor rescue DAG(s) not found in '{wms_path}'"
220 _LOG.info("Verifying that the workflow is not already in the job queue")
221 schedd_dag_info = condor_q(constraint=f'regexp("dagman$", Cmd) && Iwd == "{wms_path}"')
222 if schedd_dag_info:
223 _, dag_info = schedd_dag_info.popitem()
224 _, dag_ad = dag_info.popitem()
225 id_ = dag_ad["GlobalJobId"]
226 return None, None, f"Workflow already in the job queue (global job id: '{id_}')"
228 _LOG.info("Checking execution status of the workflow")
229 warn = False
230 dag_ad = read_dag_status(str(wms_path))
231 if dag_ad:
232 nodes_total = dag_ad.get("NodesTotal", 0)
233 if nodes_total != 0:
234 nodes_done = dag_ad.get("NodesDone", 0)
235 if nodes_total == nodes_done:
236 return None, None, "All jobs in the workflow finished successfully"
237 else:
238 warn = True
239 else:
240 warn = True
241 if warn:
242 _LOG.warning(
243 "Cannot determine the execution status of the workflow, continuing with restart regardless"
244 )
246 _LOG.info("Backing up select HTCondor files from previous run attempt")
247 htc_backup_files(wms_path, subdir="backups")
249 # For workflow portability, internal paths are all relative. Hence
250 # the DAG needs to be resubmitted to HTCondor from inside the submit
251 # directory.
252 _LOG.info("Adding workflow to the job queue")
253 run_id, run_name, message = None, None, ""
254 with chdir(wms_path):
255 try:
256 dag_path = next(wms_path.glob("*.dag.condor.sub"))
257 except StopIteration:
258 message = f"DAGMan submit description file not found in '{wms_path}'"
259 else:
260 sub = htc_create_submit_from_file(dag_path.name)
261 schedd_dag_info = htc_submit_dag(sub)
263 # Save select information about the DAGMan job to a file. Use
264 # the run name (available in the ClassAd) as the filename.
265 if schedd_dag_info:
266 dag_info = next(iter(schedd_dag_info.values()))
267 dag_ad = next(iter(dag_info.values()))
268 write_dag_info(f"{dag_ad['bps_run']}.info.json", schedd_dag_info)
269 run_id = f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}"
270 run_name = dag_ad["bps_run"]
271 else:
272 message = "DAGMan job information unavailable"
274 return run_id, run_name, message
276 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
277 """Query WMS for list of submitted WMS workflows/jobs.
279 This should be a quick lookup function to create a list of jobs for
280 other functions.
282 Parameters
283 ----------
284 wms_id : `int` or `str`, optional
285 Id or path that can be used by WMS service to look up job.
286 user : `str`, optional
287 User whose submitted jobs should be listed.
288 require_bps : `bool`, optional
289 Whether to require jobs returned in list to be bps-submitted jobs.
290 pass_thru : `str`, optional
291 Information to pass through to WMS.
292 is_global : `bool`, optional
293 If set, all job queues (and their histories) will be queried for
294 job information. Defaults to False which means that only the local
295 job queue will be queried.
297 Returns
298 -------
299 job_ids : `list` [`Any`]
300 Only job ids to be used by cancel and other functions. Typically
301 this means top-level jobs (i.e., not child jobs).
302 """
303 _LOG.debug(
304 "list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s",
305 wms_id,
306 user,
307 require_bps,
308 pass_thru,
309 is_global,
310 )
312 # Determine which Schedds will be queried for job information.
313 coll = htcondor.Collector()
315 schedd_ads = []
316 if is_global:
317 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
318 else:
319 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
321 # Construct appropriate constraint expression using provided arguments.
322 constraint = "False"
323 if wms_id is None:
324 if user is not None:
325 constraint = f'(Owner == "{user}")'
326 else:
327 schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id)
328 if cluster_id is not None:
329 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"
331 # If provided id is either a submission path or a global id,
332 # make sure the right Schedd will be queried regardless of
333 # 'is_global' value.
334 if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}:
335 schedd_ads = [schedd_ad]
336 if require_bps:
337 constraint += ' && (bps_isjob == "True")'
338 if pass_thru:
339 if "-forcex" in pass_thru:
340 pass_thru_2 = pass_thru.replace("-forcex", "")
341 if pass_thru_2 and not pass_thru_2.isspace():
342 constraint += f" && ({pass_thru_2})"
343 else:
344 constraint += f" && ({pass_thru})"
346 # Create a list of scheduler daemons which need to be queried.
347 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}
349 _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds))
350 results = condor_q(constraint=constraint, schedds=schedds)
352 # Prune child jobs where DAG job is in queue (i.e., aren't orphans).
353 job_ids = []
354 for schedd_name, job_info in results.items():
355 for job_id, job_ad in job_info.items():
356 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None"))
357 if "DAGManJobId" not in job_ad:
358 job_ids.append(job_ad.get("GlobalJobId", job_id))
359 else:
360 _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0")
361 _LOG.debug("\tin jobs.keys() = %s", job_info.keys())
362 if f"{job_ad['DAGManJobId']}.0" not in job_info: # orphaned job
363 job_ids.append(job_ad.get("GlobalJobId", job_id))
365 _LOG.debug("job_ids = %s", job_ids)
366 return job_ids
368 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False):
369 """Return run information based upon given constraints.
371 Parameters
372 ----------
373 wms_workflow_id : `str`, optional
374 Limit to specific run based on id.
375 user : `str`, optional
376 Limit results to runs for this user.
377 hist : `float`, optional
378 Limit history search to this many days. Defaults to 0.
379 pass_thru : `str`, optional
380 Constraints to pass through to HTCondor.
381 is_global : `bool`, optional
382 If set, all job queues (and their histories) will be queried for
383 job information. Defaults to False which means that only the local
384 job queue will be queried.
386 Returns
387 -------
388 runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
389 Information about runs from given job information.
390 message : `str`
391 Extra message for report command to print. This could be pointers
392 to documentation or to WMS specific commands.
393 """
394 if wms_workflow_id:
395 id_type = _wms_id_type(wms_workflow_id)
396 if id_type == WmsIdType.LOCAL:
397 schedulers = _locate_schedds(locate_all=is_global)
398 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers)
399 elif id_type == WmsIdType.GLOBAL:
400 schedulers = _locate_schedds(locate_all=True)
401 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers)
402 elif id_type == WmsIdType.PATH:
403 run_reports, message = _report_from_path(wms_workflow_id)
404 else:
405 run_reports, message = {}, "Invalid job id"
406 else:
407 schedulers = _locate_schedds(locate_all=is_global)
408 run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers)
409 _LOG.debug("report: %s, %s", run_reports, message)
411 return list(run_reports.values()), message
413 def cancel(self, wms_id, pass_thru=None):
414 """Cancel submitted workflows/jobs.
416 Parameters
417 ----------
418 wms_id : `str`
419 Id or path of job that should be canceled.
420 pass_thru : `str`, optional
421 Information to pass through to WMS.
423 Returns
424 -------
425 deleted : `bool`
426 Whether the deletion was successful. Currently, if there is any doubt
427 or any individual job was not deleted, False is returned.
428 message : `str`
429 Any message from WMS (e.g., error details).
430 """
431 _LOG.debug("Canceling wms_id = %s", wms_id)
433 schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id)
435 if cluster_id is None:
436 deleted = False
437 message = "invalid id"
438 else:
439 _LOG.debug(
440 "Canceling job managed by schedd_name = %s with cluster_id = %s",
441 cluster_id,
442 schedd_ad["Name"],
443 )
444 schedd = htcondor.Schedd(schedd_ad)
446 constraint = f"ClusterId == {cluster_id}"
447 if pass_thru is not None and "-forcex" in pass_thru:
448 pass_thru_2 = pass_thru.replace("-forcex", "")
449 if pass_thru_2 and not pass_thru_2.isspace():
450 constraint += f"&& ({pass_thru_2})"
451 _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
452 results = schedd.act(htcondor.JobAction.RemoveX, constraint)
453 else:
454 if pass_thru:
455 constraint += f"&& ({pass_thru})"
456 _LOG.debug("JobAction.Remove constraint = %s", constraint)
457 results = schedd.act(htcondor.JobAction.Remove, constraint)
458 _LOG.debug("Remove results: %s", results)
460 if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
461 deleted = True
462 message = ""
463 else:
464 deleted = False
465 if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
466 message = "no such bps job in batch queue"
467 else:
468 message = f"unknown problems deleting: {results}"
470 _LOG.debug("deleted: %s; message = %s", deleted, message)
471 return deleted, message
474class HTCondorWorkflow(BaseWmsWorkflow):
475 """Single HTCondor workflow.
477 Parameters
478 ----------
479 name : `str`
480 Unique name for Workflow used when naming files.
481 config : `lsst.ctrl.bps.BpsConfig`
482 BPS configuration that includes necessary submit/runtime information.
483 """
485 def __init__(self, name, config=None):
486 super().__init__(name, config)
487 self.dag = None
489 @classmethod
490 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
491 # Docstring inherited
492 htc_workflow = cls(generic_workflow.name, config)
493 htc_workflow.dag = HTCDag(name=generic_workflow.name)
495 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
496 htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
497 htc_workflow.dag.add_attribs(
498 {
499 "bps_wms_service": service_class,
500 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
501 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
502 "bps_job_summary": create_count_summary(generic_workflow.job_counts),
503 }
504 )
506 _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""})
507 if isinstance(tmp_template, str):
508 subdir_template = defaultdict(lambda: tmp_template)
509 else:
510 subdir_template = tmp_template
512 # Create all DAG jobs
513 site_values = {} # cache compute site specific values to reduce config lookups
514 for job_name in generic_workflow:
515 gwjob = generic_workflow.get_job(job_name)
516 if gwjob.compute_site not in site_values:
517 site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
518 htc_job = _create_job(
519 subdir_template[gwjob.label],
520 site_values[gwjob.compute_site],
521 generic_workflow,
522 gwjob,
523 out_prefix,
524 )
525 htc_workflow.dag.add_job(htc_job)
527 # Add job dependencies to the DAG
528 for job_name in generic_workflow:
529 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))
531 # If final job exists in generic workflow, create DAG final job
532 final = generic_workflow.get_final()
533 if final and isinstance(final, GenericWorkflowJob):
534 if final.compute_site and final.compute_site not in site_values:
535 site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
536 final_htjob = _create_job(
537 subdir_template[final.label],
538 site_values[final.compute_site],
539 generic_workflow,
540 final,
541 out_prefix,
542 )
543 if "post" not in final_htjob.dagcmds:
544 final_htjob.dagcmds[
545 "post"
546 ] = f"{os.path.dirname(__file__)}/final_post.sh {final.name} $DAG_STATUS $RETURN"
547 htc_workflow.dag.add_final_job(final_htjob)
548 elif final and isinstance(final, GenericWorkflow):
549 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
550 elif final:
551 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
553 return htc_workflow
555 def write(self, out_prefix):
556 """Output HTCondor DAGMan files needed for workflow submission.
558 Parameters
559 ----------
560 out_prefix : `str`
561 Directory prefix for HTCondor files.
562 """
563 self.submit_path = out_prefix
564 os.makedirs(out_prefix, exist_ok=True)
566 # Write down the workflow in HTCondor format.
567 self.dag.write(out_prefix, "jobs/{self.label}")
570def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix):
571 """Convert GenericWorkflow job nodes to DAG jobs.
573 Parameters
574 ----------
575 subdir_template : `str`
576 Template for making subdirs.
577 site_values : `dict`
578 Site-specific values.
579 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
580 Generic workflow that is being converted.
581 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
582 The generic job to convert to a HTCondor job.
583 out_prefix : `str`
584 Directory prefix for HTCondor files.
586 Returns
587 -------
588 htc_job : `lsst.ctrl.bps.htcondor.HTCJob`
589 The HTCondor job equivalent to the given generic job.
590 """
591 htc_job = HTCJob(gwjob.name, label=gwjob.label)
593 curvals = defaultdict(str)
594 curvals["label"] = gwjob.label
595 if gwjob.tags:
596 curvals.update(gwjob.tags)
598 subdir = subdir_template.format_map(curvals)
599 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"
601 htc_job_cmds = {
602 "universe": "vanilla",
603 "should_transfer_files": "YES",
604 "when_to_transfer_output": "ON_EXIT_OR_EVICT",
605 "transfer_output_files": '""', # Set to empty string to disable
606 "transfer_executable": "False",
607 "getenv": "True",
608 # Exceeding memory sometimes triggers SIGBUS or SIGSEGV errors. Tell
609 # HTCondor to put on hold any job which exited via a signal.
610 "on_exit_hold": "ExitBySignal == true",
611 "on_exit_hold_reason": 'strcat("Job raised a signal ", string(ExitSignal), ". ", '
612 '"Handling signal as if job has gone over memory limit.")',
613 "on_exit_hold_subcode": "34",
614 }
616 htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob))
618 # job stdout, stderr, htcondor user log.
619 for key in ("output", "error", "log"):
620 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
621 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
623 htc_job_cmds.update(
624 _handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], out_prefix)
625 )
627 # Add the job cmds dict to the job object.
628 htc_job.add_job_cmds(htc_job_cmds)
630 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))
632 # Add job attributes to job.
633 _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
634 htc_job.add_job_attrs(gwjob.attrs)
635 htc_job.add_job_attrs(site_values["attrs"])
636 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
637 htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})
639 return htc_job
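# Illustrative sketch (not part of the original module) of how the submit file
# subdirectory above is derived: curvals is a defaultdict(str), so any template
# key missing from the job's label/tags silently resolves to an empty string.
# The template, label, and tag values below are hypothetical.
def _example_job_subdir():
    curvals = defaultdict(str)
    curvals["label"] = "calibrate"
    curvals.update({"band": "r"})  # stands in for gwjob.tags
    subdir = "{label}/{band}{tract}".format_map(curvals)  # 'tract' is missing -> ""
    return Path("jobs") / subdir / "calibrate_903342_10.sub"
    # -> PosixPath('jobs/calibrate/r/calibrate_903342_10.sub')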
642def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
643 """Translate the job data that are one to one mapping
645 Parameters
646 ----------
647 cached_vals : `dict` [`str`, `Any`]
648 Config values common to jobs with same label.
649 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
650 Generic workflow that contains the job being converted.
651 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
652 Generic workflow job to be converted.
654 Returns
655 -------
656 htc_job_commands : `dict` [`str`, `Any`]
657 Contains commands which can appear in the HTCondor submit description
658 file.
659 """
660 # Values in the job script that are just name mappings.
661 job_translation = {
662 "mail_to": "notify_user",
663 "when_to_mail": "notification",
664 "request_cpus": "request_cpus",
665 "priority": "priority",
666 "category": "category",
667 "accounting_group": "accounting_group",
668 "accounting_user": "accounting_group_user",
669 }
671 jobcmds = {}
672 for gwkey, htckey in job_translation.items():
673 jobcmds[htckey] = getattr(gwjob, gwkey, None)
675 # If accounting info was not set explicitly, use site settings if any.
676 if not gwjob.accounting_group:
677 jobcmds["accounting_group"] = cached_vals.get("accountingGroup")
678 if not gwjob.accounting_user:
679 jobcmds["accounting_group_user"] = cached_vals.get("accountingUser")
681 # job commands that need modification
682 if gwjob.number_of_retries:
683 jobcmds["max_retries"] = f"{gwjob.number_of_retries}"
685 if gwjob.retry_unless_exit:
686 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"
688 if gwjob.request_disk:
689 jobcmds["request_disk"] = f"{gwjob.request_disk}MB"
691 if gwjob.request_memory:
692 jobcmds["request_memory"] = f"{gwjob.request_memory}"
694 if gwjob.memory_multiplier:
695 # Do not use try-except! At the moment, BpsConfig returns an empty
696 # string if it does not contain the key.
697 memory_limit = cached_vals["memoryLimit"]
698 if not memory_limit:
699 raise RuntimeError(
700 "Memory autoscaling enabled, but automatic detection of the memory limit "
701 "failed; setting it explicitly with 'memoryLimit' or changing worker node "
702 "search pattern 'executeMachinesPattern' might help."
703 )
705 # Set maximal amount of memory job can ask for.
706 #
707 # The check below assumes that 'memory_limit' was set to a value which
708 # realistically reflects actual physical limitations of a given compute
709 # resource.
710 memory_max = memory_limit
711 if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit:
712 memory_max = gwjob.request_memory_max
714 # Make job ask for more memory each time it failed due to insufficient
715 # memory requirements.
716 jobcmds["request_memory"] = _create_request_memory_expr(
717 gwjob.request_memory, gwjob.memory_multiplier, memory_max
718 )
720 # Periodically release jobs which are being held due to exceeding
721 # memory. Stop doing that (by removing the job from the HTCondor queue)
722 # after the maximal number of retries has been reached or the job was
723 # already run at maximal allowed memory.
724 jobcmds["periodic_release"] = _create_periodic_release_expr(
725 gwjob.request_memory, gwjob.memory_multiplier, memory_max
726 )
727 jobcmds["periodic_remove"] = _create_periodic_remove_expr(
728 gwjob.request_memory, gwjob.memory_multiplier, memory_max
729 )
731 # Assume concurrency_limit implemented using HTCondor concurrency limits.
732 # May need to move to special site-specific implementation if sites use
733 # other mechanisms.
734 if gwjob.concurrency_limit:
735 jobcmds["concurrency_limit"] = gwjob.concurrency_limit
737 # Handle command line
738 if gwjob.executable.transfer_executable:
739 jobcmds["transfer_executable"] = "True"
740 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
741 else:
742 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)
744 if gwjob.arguments:
745 arguments = gwjob.arguments
746 arguments = _replace_cmd_vars(arguments, gwjob)
747 arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob)
748 arguments = _fix_env_var_syntax(arguments)
749 jobcmds["arguments"] = arguments
751 # Add extra "pass-thru" job commands
752 if gwjob.profile:
753 for key, val in gwjob.profile.items():
754 jobcmds[key] = htc_escape(val)
755 for key, val in cached_vals["profile"].items():
756 jobcmds[key] = htc_escape(val)
758 return jobcmds
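# A minimal sketch (not part of the original module) of the memory scaling the
# request_memory/periodic_* expressions above are meant to implement: each run
# attempt multiplies the request until it is capped by the memory limit. The
# helper name and values are hypothetical.
def _example_memory_progression(memory=2048, multiplier=2.0, limit=16384, attempts=5):
    """Return the memory (MB) a job would request on successive run attempts."""
    return [min(int(memory * multiplier**n), limit) for n in range(attempts)]
    # _example_memory_progression() -> [2048, 4096, 8192, 16384, 16384]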
761def _translate_dag_cmds(gwjob):
762 """Translate job values into DAGMan commands.
764 Parameters
765 ----------
766 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
767 Job containing values to be translated.
769 Returns
770 -------
771 dagcmds : `dict` [`str`, `Any`]
772 DAGMan commands for the job.
773 """
775 # Values in the dag script that are just name mappings.
775 dag_translation = {"abort_on_value": "abort_dag_on", "abort_return_value": "abort_exit"}
777 dagcmds = {}
778 for gwkey, htckey in dag_translation.items():
779 dagcmds[htckey] = getattr(gwjob, gwkey, None)
781 # Still to be coded: vars "pre_cmdline", "post_cmdline"
782 return dagcmds
785def _fix_env_var_syntax(oldstr):
786 """Change ENV place holders to HTCondor Env var syntax.
788 Parameters
789 ----------
790 oldstr : `str`
791 String in which environment variable syntax is to be fixed.
793 Returns
794 -------
795 newstr : `str`
796 Given string with environment variable syntax fixed.
797 """
798 newstr = oldstr
799 for key in re.findall(r"<ENV:([^>]+)>", oldstr):
800 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
801 return newstr
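# Illustrative sketch (not part of the original module) of the rewrite performed
# by _fix_env_var_syntax; the command string is hypothetical.
def _example_fix_env_var_syntax():
    cmd = "run.sh --home <ENV:HOME> --extra <ENV:CTRL_BPS_DIR>/etc/wms.yaml"
    return _fix_env_var_syntax(cmd)
    # -> 'run.sh --home $ENV(HOME) --extra $ENV(CTRL_BPS_DIR)/etc/wms.yaml'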
804def _replace_file_vars(use_shared, arguments, workflow, gwjob):
805 """Replace file placeholders in command line arguments with correct
806 physical file names.
808 Parameters
809 ----------
810 use_shared : `bool`
811 Whether HTCondor can assume shared filesystem.
812 arguments : `str`
813 Arguments string in which to replace file placeholders.
814 workflow : `lsst.ctrl.bps.GenericWorkflow`
815 Generic workflow that contains file information.
816 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
817 The job corresponding to the arguments.
819 Returns
820 -------
821 arguments : `str`
822 Given arguments string with file placeholders replaced.
823 """
824 # Replace input file placeholders with paths.
825 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
826 if not gwfile.wms_transfer:
827 # Must assume full URI if in command line and told WMS is not
828 # responsible for transferring file.
829 uri = gwfile.src_uri
830 elif use_shared:
831 if gwfile.job_shared:
832 # Have shared filesystems and jobs can share file.
833 uri = gwfile.src_uri
834 else:
835 # Taking advantage of inside knowledge. Not future-proof.
836 # Temporary fix until there is a job wrapper that pulls
837 # files within the job.
838 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml":
839 uri = "butler.yaml"
840 else:
841 uri = os.path.basename(gwfile.src_uri)
842 else: # Using push transfer
843 uri = os.path.basename(gwfile.src_uri)
844 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
846 # Replace output file placeholders with paths.
847 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
848 if not gwfile.wms_transfer:
849 # Must assume full URI if in command line and told WMS is not
850 # responsible for transferring file.
851 uri = gwfile.src_uri
852 elif use_shared:
853 if gwfile.job_shared:
854 # Have shared filesystems and jobs can share file.
855 uri = gwfile.src_uri
856 else:
857 uri = os.path.basename(gwfile.src_uri)
858 else: # Using push transfer
859 uri = os.path.basename(gwfile.src_uri)
860 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
861 return arguments
864def _replace_cmd_vars(arguments, gwjob):
865 """Replace format-style placeholders in arguments.
867 Parameters
868 ----------
869 arguments : `str`
870 Arguments string in which to replace placeholders.
871 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
872 Job containing values to be used to replace placeholders
873 (in particular gwjob.cmdvals).
875 Returns
876 -------
877 arguments : `str`
878 Given arguments string with placeholders replaced.
879 """
880 try:
881 arguments = arguments.format(**gwjob.cmdvals)
882 except (KeyError, TypeError): # TypeError in case None instead of {}
883 _LOG.error(
884 "Could not replace command variables:\narguments: %s\ncmdvals: %s", arguments, gwjob.cmdvals
885 )
886 raise
887 return arguments
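# Illustrative sketch (not part of the original module): _replace_cmd_vars is a
# thin wrapper around str.format using the job's cmdvals. SimpleNamespace stands
# in for a GenericWorkflowJob and the placeholder names are hypothetical.
def _example_replace_cmd_vars():
    from types import SimpleNamespace

    gwjob = SimpleNamespace(cmdvals={"qgraphFile": "run.qgraph", "qgraphNodeId": "42"})
    args = "pipetask run --qgraph {qgraphFile} --qgraph-node-id {qgraphNodeId}"
    return _replace_cmd_vars(args, gwjob)
    # -> 'pipetask run --qgraph run.qgraph --qgraph-node-id 42'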
890def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str):
891 """Add job input files from generic workflow to job.
893 Parameters
894 ----------
895 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
896 The generic workflow (e.g., has executable name and arguments).
897 job_name : `str`
898 Unique name for the job.
899 use_shared : `bool`
900 Whether job has access to files via shared filesystem.
901 out_prefix : `str`
902 The root directory into which all WMS-specific files are written.
904 Returns
905 -------
906 htc_commands : `dict` [`str`, `str`]
907 HTCondor commands for the job submission script.
908 """
909 htc_commands = {}
910 inputs = []
911 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
912 _LOG.debug("src_uri=%s", gwf_file.src_uri)
914 uri = Path(gwf_file.src_uri)
916 # Note if use_shared and job_shared, don't need to transfer file.
918 if not use_shared: # Copy file using push to job
919 inputs.append(str(uri.relative_to(out_prefix)))
920 elif not gwf_file.job_shared: # Jobs require own copy
921 # If using a shared filesystem but the job still needs its own copy,
922 # use HTCondor's curl plugin to make a local copy.
924 # Execution butler is represented as a directory which the
925 # curl plugin does not handle. Taking advantage of inside
926 # knowledge as a temporary fix until there is a job wrapper
927 # that pulls files within the job.
928 if gwf_file.name == "butlerConfig":
929 # The execution butler directory doesn't normally exist until
930 # the submit phase, so check for the suffix instead of using
931 # is_dir(). If other non-yaml files exist, they would have a
932 # different gwf_file.name.
933 if uri.suffix == ".yaml": # Single file, so just copy.
934 inputs.append(f"file://{uri}")
935 else:
936 inputs.append(f"file://{uri / 'butler.yaml'}")
937 inputs.append(f"file://{uri / 'gen3.sqlite3'}")
938 elif uri.is_dir():
939 raise RuntimeError(
940 f"HTCondor plugin cannot transfer directories locally within job {gwf_file.src_uri}"
941 )
942 else:
943 inputs.append(f"file://{uri}")
945 if inputs:
946 htc_commands["transfer_input_files"] = ",".join(inputs)
947 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
948 return htc_commands
951def _report_from_path(wms_path):
952 """Gather run information from a given run directory.
954 Parameters
955 ----------
956 wms_path : `str`
957 The directory containing the submit side files (e.g., HTCondor files).
959 Returns
960 -------
961 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
962 Run information for the detailed report. The key is the HTCondor id
963 and the value is a collection of report information for that run.
964 message : `str`
965 Message to be printed with the summary report.
966 """
967 wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
968 if wms_workflow_id == MISSING_ID:
969 run_reports = {}
970 else:
971 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
972 return run_reports, message
975def _report_from_id(wms_workflow_id, hist, schedds=None):
976 """Gather run information using workflow id.
978 Parameters
979 ----------
980 wms_workflow_id : `str`
981 Limit to specific run based on id.
982 hist : `float`
983 Limit history search to this many days.
984 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
985 HTCondor schedulers which to query for job information. If None
986 (default), all queries will be run against the local scheduler only.
988 Returns
989 -------
990 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
991 Run information for the detailed report. The key is the HTCondor id
992 and the value is a collection of report information for that run.
993 message : `str`
994 Message to be printed with the summary report.
995 """
996 messages = []
998 # Collect information about the job by querying HTCondor schedd and
999 # HTCondor history.
1000 schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds)
1001 if len(schedd_dag_info) == 1:
1002 # Extract the DAG info without altering the results of the query.
1003 schedd_name = next(iter(schedd_dag_info))
1004 dag_id = next(iter(schedd_dag_info[schedd_name]))
1005 dag_ad = schedd_dag_info[schedd_name][dag_id]
1007 # If the provided workflow id does not correspond to the one extracted
1008 # from the DAGMan log file in the submit directory, rerun the query
1009 # with the id found in the file.
1010 #
1011 # This is to cover the situation in which the user provided the old job
1012 # id of a restarted run.
1013 try:
1014 path_dag_id, path_dag_ad = read_dag_log(dag_ad["Iwd"])
1015 except FileNotFoundError as exc:
1016 # At the moment a missing DAGMan log is pretty much a fatal error,
1017 # so empty the DAG info to finish early (see the if statement
1018 # below).
1019 schedd_dag_info.clear()
1020 messages.append(f"Cannot create the report for '{dag_id}': {exc}")
1021 else:
1022 if path_dag_id != dag_id:
1023 schedd_dag_info = _get_info_from_schedd(path_dag_id, hist, schedds)
1024 messages.append(
1025 f"WARNING: Found newer workflow executions in same submit directory as id '{dag_id}'. "
1026 "This normally occurs when a run is restarted. The report shown is for the most "
1027 f"recent status with run id '{path_dag_id}'"
1028 )
1030 if len(schedd_dag_info) == 0:
1031 run_reports = {}
1032 elif len(schedd_dag_info) == 1:
1033 _, dag_info = schedd_dag_info.popitem()
1034 dag_id, dag_ad = dag_info.popitem()
1036 # Create a mapping between jobs and their classads. The keys will
1037 # be of format 'ClusterId.ProcId'.
1038 job_info = {dag_id: dag_ad}
1040 # Find jobs (nodes) belonging to that DAGMan job.
1041 job_constraint = f"DAGManJobId == {int(float(dag_id))}"
1042 schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds)
1043 if schedd_job_info:
1044 _, node_info = schedd_job_info.popitem()
1045 job_info.update(node_info)
1047 # Collect additional pieces of information about jobs using HTCondor
1048 # files in the submission directory.
1049 _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"])
1050 _update_jobs(job_info, path_jobs)
1051 if message:
1052 messages.append(message)
1053 run_reports = _create_detailed_report_from_jobs(dag_id, job_info)
1054 else:
1055 ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()]
1056 message = (
1057 f"More than one job matches id '{wms_workflow_id}', "
1058 f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids"
1059 )
1060 messages.append(message)
1061 run_reports = {}
1063 message = "\n".join(messages)
1064 return run_reports, message
1067def _get_info_from_schedd(wms_workflow_id, hist, schedds):
1068 """Gather run information from HTCondor.
1070 Parameters
1071 ----------
1072 wms_workflow_id : `str`
1073 Limit to specific run based on id.
1074 hist : `float`
1075 Limit history search to this many days.
1076 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
1077 HTCondor schedulers which to query for job information. If None
1078 (default), all queries will be run against the local scheduler only.
1080 Returns
1081 -------
1082 schedd_dag_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, `Any`]]]
1083 Information about jobs satisfying the search criteria where for each
1084 Scheduler, local HTCondor job ids are mapped to their respective
1085 classads.
1086 """
1087 dag_constraint = 'regexp("dagman$", Cmd)'
1088 try:
1089 cluster_id = int(float(wms_workflow_id))
1090 except ValueError:
1091 dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"'
1092 else:
1093 dag_constraint += f" && ClusterId == {cluster_id}"
1095 # With the current implementation of the condor_* functions the query
1096 # will always return only one match per Scheduler.
1097 #
1098 # Even in the highly unlikely situation where HTCondor history (which
1099 # condor_search queries too) is long enough to have jobs from before
1100 # the cluster ids were rolled over (and as a result there is more than
1101 # one job with the same cluster id) they will not show up in
1102 # the results.
1103 schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds)
1104 return schedd_dag_info
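# Illustrative sketch (not part of the original module) of the shape of the
# mapping returned above: scheduler name -> {local job id -> classad dict}.
# All names and values below are hypothetical.
def _example_schedd_dag_info():
    return {
        "sched1.example.com": {
            "1158.0": {
                "ClusterId": 1158,
                "ProcId": 0,
                "GlobalJobId": "sched1.example.com#1158.0#1699999999",
                "Iwd": "/home/user/submit/pipelines_check/20231116T101500Z",
                "JobStatus": 2,
            }
        }
    }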
1107def _get_info_from_path(wms_path):
1108 """Gather run information from a given run directory.
1110 Parameters
1111 ----------
1112 wms_path : `str`
1113 Directory containing HTCondor files.
1115 Returns
1116 -------
1117 wms_workflow_id : `str`
1118 The run id, which is a DAGMan job id.
1119 jobs : `dict` [`str`, `dict` [`str`, `Any`]]
1120 Information about jobs read from files in the given directory.
1121 The key is the HTCondor id and the value is a dictionary of HTCondor
1122 keys and values.
1123 message : `str`
1124 Message to be printed with the summary report.
1125 """
1126 messages = []
1127 try:
1128 wms_workflow_id, jobs = read_dag_log(wms_path)
1129 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
1130 _update_jobs(jobs, read_node_status(wms_path))
1131 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)
1133 # Add more info for DAGman job
1134 job = jobs[wms_workflow_id]
1135 job.update(read_dag_status(wms_path))
1137 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
1138 if "bps_run" not in job:
1139 _add_run_info(wms_path, job)
1141 message = htc_check_dagman_output(wms_path)
1142 if message:
1143 messages.append(message)
1144 _LOG.debug(
1145 "_get_info: id = %s, total_jobs = %s", wms_workflow_id, jobs[wms_workflow_id]["total_jobs"]
1146 )
1148 # Add extra pieces of information which cannot be found in HTCondor
1149 # generated files like 'GlobalJobId'.
1150 #
1151 # Do not treat absence of this file as a serious error. Neither runs
1152 # submitted with earlier versions of the plugin nor the runs submitted
1153 # with the Pegasus plugin will have it at the moment. However, once
1154 # enough time passes and the Pegasus plugin has its own report() method
1155 # (instead of sneakily using HTCondor's), the lack of that file should
1156 # be treated as seriously as the lack of any other file.
1157 try:
1158 job_info = read_dag_info(wms_path)
1159 except FileNotFoundError as exc:
1160 message = f"Warn: Some information may not be available: {exc}"
1161 messages.append(message)
1162 else:
1163 schedd_name = next(iter(job_info))
1164 job_ad = next(iter(job_info[schedd_name].values()))
1165 job.update(job_ad)
1166 except FileNotFoundError:
1167 message = f"Could not find HTCondor files in '{wms_path}'"
1168 _LOG.warning(message)
1169 messages.append(message)
1170 wms_workflow_id = MISSING_ID
1171 jobs = {}
1173 message = "\n".join([msg for msg in messages if msg])
1174 return wms_workflow_id, jobs, message
1177def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
1178 """Gather run information to be used in generating summary reports.
1180 Parameters
1181 ----------
1182 wms_workflow_id : `str`
1183 The run id to create the report for.
1184 jobs : `dict` [`str`, `dict` [`str`, Any]]
1185 Mapping HTCondor job id to job information.
1187 Returns
1188 -------
1189 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
1190 Run information for the detailed report. The key is the given HTCondor
1191 id and the value is a collection of report information for that run.
1192 """
1193 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
1194 dag_job = jobs[wms_workflow_id]
1195 report = WmsRunReport(
1196 wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}",
1197 global_wms_id=dag_job.get("GlobalJobId", "MISS"),
1198 path=dag_job["Iwd"],
1199 label=dag_job.get("bps_job_label", "MISS"),
1200 run=dag_job.get("bps_run", "MISS"),
1201 project=dag_job.get("bps_project", "MISS"),
1202 campaign=dag_job.get("bps_campaign", "MISS"),
1203 payload=dag_job.get("bps_payload", "MISS"),
1204 operator=_get_owner(dag_job),
1205 run_summary=_get_run_summary(dag_job),
1206 state=_htc_status_to_wms_state(dag_job),
1207 jobs=[],
1208 total_number_jobs=dag_job["total_jobs"],
1209 job_state_counts=dag_job["state_counts"],
1210 )
1212 for job_id, job_info in jobs.items():
1213 try:
1214 if job_info["ClusterId"] != int(float(wms_workflow_id)):
1215 job_report = WmsJobReport(
1216 wms_id=job_id,
1217 name=job_info.get("DAGNodeName", job_id),
1218 label=job_info.get("bps_job_label", pegasus_name_to_label(job_info["DAGNodeName"])),
1219 state=_htc_status_to_wms_state(job_info),
1220 )
1221 if job_report.label == "init":
1222 job_report.label = "pipetaskInit"
1223 report.jobs.append(job_report)
1224 except KeyError as ex:
1225 _LOG.error("Job missing key '%s': %s", str(ex), job_info)
1226 raise
1228 run_reports = {report.wms_id: report}
1229 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
1230 return run_reports
1233def _summary_report(user, hist, pass_thru, schedds=None):
1234 """Gather run information to be used in generating summary reports.
1236 Parameters
1237 ----------
1238 user : `str`
1239 Run lookup restricted to given user.
1240 hist : `float`
1241 How many previous days to search for run information.
1242 pass_thru : `str`
1243 Advanced users can define the HTCondor constraint to be used
1244 when searching queue and history.
schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
HTCondor schedulers which to query for job information. If None
(default), all queries will be run against the local scheduler only.
1246 Returns
1247 -------
1248 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
1249 Run information for the summary report. The keys are HTCondor ids and
1250 the values are collections of report information for each run.
1251 message : `str`
1252 Message to be printed with the summary report.
1253 """
1254 # Only doing a summary report, so only look for DAGMan jobs.
1255 if pass_thru:
1256 constraint = pass_thru
1257 else:
1258 # Notes:
1259 # * bps_isjob == 'True' isn't getting set for DAG jobs that are
1260 # manually restarted.
1261 # * Any job with DAGManJobID isn't a DAG job
1262 constraint = 'bps_isjob == "True" && JobUniverse == 7'
1263 if user:
1264 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'
1266 job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds)
1268 # Have list of DAGMan jobs, need to get run_report info.
1269 run_reports = {}
1270 for jobs in job_info.values():
1271 for job_id, job in jobs.items():
1272 total_jobs, state_counts = _get_state_counts_from_dag_job(job)
1273 # If didn't get from queue information (e.g., Kerberos bug),
1274 # try reading from file.
1275 if total_jobs == 0:
1276 try:
1277 job.update(read_dag_status(job["Iwd"]))
1278 total_jobs, state_counts = _get_state_counts_from_dag_job(job)
1279 except StopIteration:
1280 pass  # Don't kill the report if the HTCondor files can't be found.
1282 if "bps_run" not in job:
1283 _add_run_info(job["Iwd"], job)
1284 report = WmsRunReport(
1285 wms_id=job_id,
1286 global_wms_id=job["GlobalJobId"],
1287 path=job["Iwd"],
1288 label=job.get("bps_job_label", "MISS"),
1289 run=job.get("bps_run", "MISS"),
1290 project=job.get("bps_project", "MISS"),
1291 campaign=job.get("bps_campaign", "MISS"),
1292 payload=job.get("bps_payload", "MISS"),
1293 operator=_get_owner(job),
1294 run_summary=_get_run_summary(job),
1295 state=_htc_status_to_wms_state(job),
1296 jobs=[],
1297 total_number_jobs=total_jobs,
1298 job_state_counts=state_counts,
1299 )
1300 run_reports[report.global_wms_id] = report
1302 return run_reports, ""
1305def _add_run_info(wms_path, job):
1306 """Find BPS run information elsewhere for runs without bps attributes.
1308 Parameters
1309 ----------
1310 wms_path : `str`
1311 Path to submit files for the run.
1312 job : `dict` [`str`, `Any`]
1313 HTCondor dag job information.
1315 Raises
1316 ------
1317 StopIteration
1318 Raised if the file it is looking for cannot be found. Permission
1319 errors are caught and the job's run is marked with an error.
1320 """
1321 path = Path(wms_path) / "jobs"
1322 try:
1323 subfile = next(path.glob("**/*.sub"))
1324 except (StopIteration, PermissionError):
1325 job["bps_run"] = "Unavailable"
1326 else:
1327 _LOG.debug("_add_run_info: subfile = %s", subfile)
1328 try:
1329 with open(subfile, encoding="utf-8") as fh:
1330 for line in fh:
1331 if line.startswith("+bps_"):
1332 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
1333 if m:
1334 _LOG.debug("Matching line: %s", line)
1335 job[m.group(1)] = m.group(2).replace('"', "")
1336 else:
1337 _LOG.debug("Could not parse attribute: %s", line)
1338 except PermissionError:
1339 job["bps_run"] = "PermissionError"
1340 _LOG.debug("After adding job = %s", job)
1343def _get_owner(job):
1344 """Get the owner of a dag job.
1346 Parameters
1347 ----------
1348 job : `dict` [`str`, `Any`]
1349 HTCondor dag job information.
1351 Returns
1352 -------
1353 owner : `str`
1354 Owner of the dag job.
1355 """
1356 owner = job.get("bps_operator", None)
1357 if not owner:
1358 owner = job.get("Owner", None)
1359 if not owner:
1360 _LOG.warning("Could not get Owner from htcondor job: %s", job)
1361 owner = "MISS"
1362 return owner
1365def _get_run_summary(job):
1366 """Get the run summary for a job.
1368 Parameters
1369 ----------
1370 job : `dict` [`str`, `Any`]
1371 HTCondor dag job information.
1373 Returns
1374 -------
1375 summary : `str`
1376 Number of jobs per PipelineTask label in approximate pipeline order.
1377 Format: <label>:<count>[;<label>:<count>]+
1378 """
1379 summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
1380 if not summary:
1381 summary, _ = summary_from_dag(job["Iwd"])
1382 if not summary:
1383 _LOG.warning("Could not get run summary for htcondor job: %s", job)
1384 _LOG.debug("_get_run_summary: summary=%s", summary)
1386 # Workaround for sometimes using init instead of pipetaskInit.
1387 summary = summary.replace("init:", "pipetaskInit:")
1389 if "pegasus_version" in job and "pegasus" not in summary:
1390 summary += ";pegasus:0"
1392 return summary
1395def _get_state_counts_from_jobs(wms_workflow_id, jobs):
1396 """Count number of jobs per WMS state.
1398 Parameters
1399 ----------
1400 wms_workflow_id : `str`
1401 HTCondor job id.
1402 jobs : `dict` [`str`, `Any`]
1403 HTCondor dag job information.
1405 Returns
1406 -------
1407 total_count : `int`
1408 Total number of dag nodes.
1409 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
1410 Keys are the different WMS states and values are counts of jobs
1411 that are in that WMS state.
1412 """
1413 state_counts = dict.fromkeys(WmsStates, 0)
1415 for jid, jinfo in jobs.items():
1416 if jid != wms_workflow_id:
1417 state_counts[_htc_status_to_wms_state(jinfo)] += 1
1419 total_counted = sum(state_counts.values())
1420 if "NodesTotal" in jobs[wms_workflow_id]:
1421 total_count = jobs[wms_workflow_id]["NodesTotal"]
1422 else:
1423 total_count = total_counted
1425 state_counts[WmsStates.UNREADY] += total_count - total_counted
1427 return total_count, state_counts
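# A minimal sketch (not part of the original module) of the counting above:
# every job except the DAGMan job itself is mapped to a WMS state, and nodes
# not yet seen in the logs are counted as UNREADY. Ids and classads are
# hypothetical.
def _example_state_counts():
    jobs = {
        "1158.0": {"NodesTotal": 3},  # the DAGMan job itself
        "1159.0": {"ClusterId": 1159, "JobStatus": 4, "ExitCode": 0},  # -> SUCCEEDED
        "1160.0": {"ClusterId": 1160, "JobStatus": 2},  # -> RUNNING
    }
    return _get_state_counts_from_jobs("1158.0", jobs)
    # -> (3, {..., SUCCEEDED: 1, RUNNING: 1, UNREADY: 1, ...})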
1430def _get_state_counts_from_dag_job(job):
1431 """Count number of jobs per WMS state.
1433 Parameters
1434 ----------
1435 job : `dict` [`str`, `Any`]
1436 HTCondor dag job information.
1438 Returns
1439 -------
1440 total_count : `int`
1441 Total number of dag nodes.
1442 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
1443 Keys are the different WMS states and values are counts of jobs
1444 that are in that WMS state.
1445 """
1446 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
1447 state_counts = dict.fromkeys(WmsStates, 0)
1448 if "DAG_NodesReady" in job:
1449 state_counts = {
1450 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
1451 WmsStates.READY: job.get("DAG_NodesReady", 0),
1452 WmsStates.HELD: job.get("JobProcsHeld", 0),
1453 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
1454 WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
1455 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0),
1456 }
1457 total_jobs = job.get("DAG_NodesTotal")
1458 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
1459 elif "NodesFailed" in job:
1460 state_counts = {
1461 WmsStates.UNREADY: job.get("NodesUnready", 0),
1462 WmsStates.READY: job.get("NodesReady", 0),
1463 WmsStates.HELD: job.get("JobProcsHeld", 0),
1464 WmsStates.SUCCEEDED: job.get("NodesDone", 0),
1465 WmsStates.FAILED: job.get("NodesFailed", 0),
1466 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0),
1467 }
1468 try:
1469 total_jobs = job["NodesTotal"]
1470 except KeyError as ex:
1471 _LOG.error("Job missing %s. job = %s", str(ex), job)
1472 raise
1473 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
1474 else:
1475 # With Kerberos job auth and the Kerberos bug, a warning here would be
1476 # printed for every DAG, so log at debug level instead.
1477 _LOG.debug("Can't get job state counts %s", job["Iwd"])
1478 total_jobs = 0
1480 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
1481 return total_jobs, state_counts
1484def _htc_status_to_wms_state(job):
1485 """Convert HTCondor job status to generic wms state.
1487 Parameters
1488 ----------
1489 job : `dict` [`str`, `Any`]
1490 HTCondor job information.
1492 Returns
1493 -------
1494 wms_state : `WmsStates`
1495 The equivalent WmsState to given job's status.
1496 """
1497 wms_state = WmsStates.MISFIT
1498 if "JobStatus" in job:
1499 wms_state = _htc_job_status_to_wms_state(job)
1500 elif "NodeStatus" in job:
1501 wms_state = _htc_node_status_to_wms_state(job)
1502 return wms_state
1505def _htc_job_status_to_wms_state(job):
1506 """Convert HTCondor job status to generic wms state.
1508 Parameters
1509 ----------
1510 job : `dict` [`str`, `Any`]
1511 HTCondor job information.
1513 Returns
1514 -------
1515 wms_state : `lsst.ctrl.bps.WmsStates`
1516 The equivalent WmsState to given job's status.
1517 """
1518 _LOG.debug(
1519 "htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], type(job["JobStatus"])
1520 )
1521 job_status = int(job["JobStatus"])
1522 wms_state = WmsStates.MISFIT
1524 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
1525 if job_status == JobStatus.IDLE:
1526 wms_state = WmsStates.PENDING
1527 elif job_status == JobStatus.RUNNING:
1528 wms_state = WmsStates.RUNNING
1529 elif job_status == JobStatus.REMOVED:
1530 wms_state = WmsStates.DELETED
1531 elif job_status == JobStatus.COMPLETED:
1532 if (
1533 job.get("ExitBySignal", False)
1534 or job.get("ExitCode", 0)
1535 or job.get("ExitSignal", 0)
1536 or job.get("DAG_Status", 0)
1537 or job.get("ReturnValue", 0)
1538 ):
1539 wms_state = WmsStates.FAILED
1540 else:
1541 wms_state = WmsStates.SUCCEEDED
1542 elif job_status == JobStatus.HELD:
1543 wms_state = WmsStates.HELD
1545 return wms_state
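# Illustrative sketch (not part of the original module): a COMPLETED HTCondor
# job maps to SUCCEEDED only when none of the exit/status attributes indicate a
# failure. The classads below are hypothetical.
def _example_job_status_mapping():
    ok = {"ClusterId": 1161, "JobStatus": JobStatus.COMPLETED, "ExitCode": 0}
    bad = {"ClusterId": 1162, "JobStatus": JobStatus.COMPLETED, "ExitCode": 1}
    return _htc_job_status_to_wms_state(ok), _htc_job_status_to_wms_state(bad)
    # -> (WmsStates.SUCCEEDED, WmsStates.FAILED)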
1548def _htc_node_status_to_wms_state(job):
1549 """Convert HTCondor status to generic wms state.
1551 Parameters
1552 ----------
1553 job : `dict` [`str`, `Any`]
1554 HTCondor job information.
1556 Returns
1557 -------
1558 wms_state : `lsst.ctrl.bps.WmsStates`
1559 The equivalent WmsState to given node's status.
1560 """
1561 wms_state = WmsStates.MISFIT
1563 status = job["NodeStatus"]
1564 if status == NodeStatus.NOT_READY:
1565 wms_state = WmsStates.UNREADY
1566 elif status == NodeStatus.READY:
1567 wms_state = WmsStates.READY
1568 elif status == NodeStatus.PRERUN:
1569 wms_state = WmsStates.MISFIT
1570 elif status == NodeStatus.SUBMITTED:
1571 if job["JobProcsHeld"]:
1572 wms_state = WmsStates.HELD
1573 elif job["StatusDetails"] == "not_idle":
1574 wms_state = WmsStates.RUNNING
1575 elif job["JobProcsQueued"]:
1576 wms_state = WmsStates.PENDING
1577 elif status == NodeStatus.POSTRUN:
1578 wms_state = WmsStates.MISFIT
1579 elif status == NodeStatus.DONE:
1580 wms_state = WmsStates.SUCCEEDED
1581 elif status == NodeStatus.ERROR:
1582 # Use the job's exit status instead of the post script's exit status.
1583 if "DAGMAN error 0" in job["StatusDetails"]:
1584 wms_state = WmsStates.SUCCEEDED
1585 else:
1586 wms_state = WmsStates.FAILED
1588 return wms_state
1591def _update_jobs(jobs1, jobs2):
1592 """Update jobs1 with info in jobs2.
1594 (Basically an update for nested dictionaries.)
1596 Parameters
1597 ----------
1598 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
1599 HTCondor job information to be updated.
1600 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
1601 Additional HTCondor job information.
1602 """
1603 for jid, jinfo in jobs2.items():
1604 if jid in jobs1:
1605 jobs1[jid].update(jinfo)
1606 else:
1607 jobs1[jid] = jinfo
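# A minimal sketch (not part of the original module) of the nested update above;
# the job ids and classads are hypothetical.
def _example_update_jobs():
    jobs = {"1159.0": {"JobStatus": 2}}
    _update_jobs(jobs, {"1159.0": {"HoldReason": "memory"}, "1160.0": {"JobStatus": 1}})
    return jobs
    # -> {'1159.0': {'JobStatus': 2, 'HoldReason': 'memory'}, '1160.0': {'JobStatus': 1}}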
1610def _wms_id_type(wms_id):
1611 """Determine the type of the WMS id.
1613 Parameters
1614 ----------
1615 wms_id : `str`
1616 WMS id identifying a job.
1618 Returns
1619 -------
1620 id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
1621 Type of WMS id.
1622 """
1623 try:
1624 int(float(wms_id))
1625 except ValueError:
1626 wms_path = Path(wms_id)
1627 if wms_path.is_dir():
1628 id_type = WmsIdType.PATH
1629 else:
1630 id_type = WmsIdType.GLOBAL
1631 except TypeError:
1632 id_type = WmsIdType.UNKNOWN
1633 else:
1634 id_type = WmsIdType.LOCAL
1635 return id_type
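# Illustrative sketch (not part of the original module) of how different id
# flavors are classified above; the ids are hypothetical and '/tmp' is assumed
# to be an existing directory. Note that a non-numeric id which is not an
# existing directory falls back to GLOBAL.
def _example_wms_id_types():
    return (
        _wms_id_type("1158.0"),  # WmsIdType.LOCAL
        _wms_id_type("sched1.example.com#1158.0#1699999999"),  # WmsIdType.GLOBAL
        _wms_id_type("/tmp"),  # WmsIdType.PATH
        _wms_id_type(None),  # WmsIdType.UNKNOWN
    )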
1638def _wms_id_to_cluster(wms_id):
1639 """Convert WMS id to cluster id.
1641 Parameters
1642 ----------
1643 wms_id : `int` or `float` or `str`
1644 HTCondor job id or path.
1646 Returns
1647 -------
1648 schedd_ad : `classad.ClassAd`
1649 ClassAd describing the scheduler managing the job with the given id.
1650 cluster_id : `int`
1651 HTCondor cluster id.
1652 id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
1653 The type of the provided id.
1654 """
1655 coll = htcondor.Collector()
1657 schedd_ad = None
1658 cluster_id = None
1659 id_type = _wms_id_type(wms_id)
1660 if id_type == WmsIdType.LOCAL:
1661 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
1662 cluster_id = int(float(wms_id))
1663 elif id_type == WmsIdType.GLOBAL:
1664 constraint = f'GlobalJobId == "{wms_id}"'
1665 schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)}
1666 schedds = {name: htcondor.Schedd(ad) for name, ad in schedd_ads.items()}
1667 job_info = condor_q(constraint=constraint, schedds=schedds)
1668 if job_info:
1669 schedd_name, job_rec = job_info.popitem()
1670 job_id, _ = job_rec.popitem()
1671 schedd_ad = schedd_ads[schedd_name]
1672 cluster_id = int(float(job_id))
1673 elif id_type == WmsIdType.PATH:
1674 try:
1675 job_info = read_dag_info(wms_id)
1676 except (FileNotFoundError, PermissionError, OSError):
1677 pass
1678 else:
1679 schedd_name, job_rec = job_info.popitem()
1680 job_id, _ = job_rec.popitem()
1681 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name)
1682 cluster_id = int(float(job_id))
1683 else:
1684 pass
1685 return schedd_ad, cluster_id, id_type
1688def _wms_id_to_dir(wms_id):
1689 """Convert WMS id to a submit directory candidate.
1691 The function does not check if the directory exists or if it is a valid
1692 BPS submit directory.
1694 Parameters
1695 ----------
1696 wms_id : `int` or `float` or `str`
1697 HTCondor job id or path.
1699 Returns
1700 -------
1701 wms_path : `pathlib.Path` or None
1702 Submit directory candidate for the run with the given job id. If no
1703 directory can be associated with the provided WMS id, it will be set
1704 to None.
1705 id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
1706 The type of the provided id.
1708 Raises
1709 ------
1710 TypeError
1711 Raised if the provided WMS id has an invalid type.
1712 """
1713 coll = htcondor.Collector()
1714 schedd_ads = []
1716 constraint = None
1717 wms_path = None
1718 id_type = _wms_id_type(wms_id)
1719 match id_type:
1720 case WmsIdType.LOCAL:
1721 constraint = f"ClusterId == {int(float(wms_id))}"
1722 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
1723 case WmsIdType.GLOBAL:
1724 constraint = f'GlobalJobId == "{wms_id}"'
1725 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
1726 case WmsIdType.PATH:
1727 wms_path = Path(wms_id)
1728 case WmsIdType.UNKNOWN:
1729 raise TypeError(f"Invalid job id type: {wms_id}")
1730 if constraint is not None:
1731 schedds = {ad["name"]: htcondor.Schedd(ad) for ad in schedd_ads}
1732 job_info = condor_history(constraint=constraint, schedds=schedds, projection=["Iwd"])
1733 if job_info:
1734 _, job_rec = job_info.popitem()
1735 _, job_ad = job_rec.popitem()
1736 wms_path = Path(job_ad["Iwd"])
1737 return wms_path, id_type
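# Illustrative usage sketch (hypothetical inputs, not part of the original
# module):
#
#   wms_path, id_type = _wms_id_to_dir("/data/submit/u/user/run_1")
#   # id_type == WmsIdType.PATH; wms_path is Path("/data/submit/u/user/run_1")
#   # with no check that it is a valid BPS submit directory.
#
#   wms_path, id_type = _wms_id_to_dir("1234")
#   # id_type == WmsIdType.LOCAL; wms_path is the Iwd of cluster 1234 taken
#   # from the local Schedd's history, or None if the job cannot be found.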
1740def _create_periodic_release_expr(memory, multiplier, limit):
1741 """Construct an HTCondor ClassAd expression for releasing held jobs.
1743 The expression instructs HTCondor to release any job which was put on hold
1744 due to exceeding its memory requirements back to the job queue, provided it
1745 satisfies all of the conditions below:
1747 * the number of run attempts did not reach the allowable number of retries,
1748 * the memory requirements in the last failed run attempt did not reach
1749 the specified memory limit.
1751 Parameters
1752 ----------
1753 memory : `int`
1754 Requested memory in MB.
1755 multiplier : `float`
1756 Memory growth rate between retries.
1757 limit : `int`
1758 Memory limit in MB.
1760 Returns
1761 -------
1762 expr : `str`
1763 A string representing an HTCondor ClassAd expression for releasing jobs
1764 which have been held due to exceeding the memory requirements.
1765 """
1766 is_retry_allowed = "NumJobStarts <= JobMaxRetries"
1767 was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
1769 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
1770 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
1771 # The special comparison operators ensure that all comparisons below will
1772 # evaluate to FALSE in this case.
1773 #
1774 # Note:
1775 # May not be strictly necessary. Operators '&&' and '||' are not strict so
1776 # the entire expression should evaluate to FALSE when the job is not HELD.
1777 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
1778 # but better safe than sorry.
1779 was_mem_exceeded = (
1780 "JobStatus == 5 "
1781 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
1782 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
1783 )
1785 expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}"
1786 return expr
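# Illustrative sketch (example values, not from the original module): for
# memory=2048, multiplier=2.0, limit=8192 the function returns a string
# equivalent to the following ClassAd expression (reformatted for
# readability):
#
#   JobStatus == 5
#   && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#       || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#   && NumJobStarts <= JobMaxRetries
#   && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) < 8192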
1789def _create_periodic_remove_expr(memory, multiplier, limit):
1790 """Construct an HTCondor ClassAd expression for removing jobs from the queue.
1792 The expression instructs HTCondor to remove any job which was put on hold
1793 due to exceeding its memory requirements from the job queue, provided it
1794 satisfies any of the conditions below:
1796 * the allowable number of retries was reached,
1797 * the memory requirements during the last failed run attempt reached
1798 the specified memory limit.
1800 Parameters
1801 ----------
1802 memory : `int`
1803 Requested memory in MB.
1804 multiplier : `float`
1805 Memory growth rate between retries.
1806 limit : `int`
1807 Memory limit in MB.
1809 Returns
1810 -------
1811 expr : `str`
1812 A string representing an HTCondor ClassAd expression for removing jobs
1813 which were run at the maximal allowable memory and still exceeded
1814 the memory requirements.
1815 """
1816 is_retry_disallowed = "NumJobStarts > JobMaxRetries"
1817 was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
1819 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
1820 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
1821 # The special comparison operators ensure that all comparisons below will
1822 # evaluate to FALSE in this case.
1823 #
1824 # Note:
1825 # May not be strictly necessary. Operators '&&' and '||' are not strict so
1826 # the entire expression should evaluate to FALSE when the job is not HELD.
1827 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
1828 # but better safe than sorry.
1829 was_mem_exceeded = (
1830 "JobStatus == 5 "
1831 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
1832 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
1833 )
1835 expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})"
1836 return expr
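# Illustrative sketch (same example values as above: memory=2048,
# multiplier=2.0, limit=8192): the returned expression is equivalent to
#
#   JobStatus == 5
#   && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#       || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#   && (NumJobStarts > JobMaxRetries
#       || min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) == 8192)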
1839def _create_request_memory_expr(memory, multiplier, limit):
1840 """Construct an HTCondor ClassAd expression for safe memory scaling.
1842 Parameters
1843 ----------
1844 memory : `int`
1845 Requested memory in MB.
1846 multiplier : `float`
1847 Memory growth rate between retries.
1848 limit : `int`
1849 Memory limit in MB.
1851 Returns
1852 -------
1853 expr : `str`
1854 A string representing an HTCondor ClassAd expression enabling safe
1855 memory scaling between job retries.
1856 """
1857 # The check whether the job was held due to exceeding memory requirements
1858 # will be made *after* the job was released back to the job queue (i.e. it
1859 # is in the IDLE state), hence the need to use `Last*` job ClassAds instead
1860 # of the ones describing the job's current state.
1861 #
1862 # Also, 'Last*' job ClassAds attributes are UNDEFINED when a job is
1863 # initially put in the job queue. The special comparison operators ensure
1864 # that all comparisons below will evaluate to FALSE in this case.
1865 was_mem_exceeded = (
1866 "LastJobStatus =?= 5 "
1867 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 "
1868 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"
1869 )
1871 # If the job runs for the first time or was held for reasons other than
1872 # exceeding the memory, set the required memory to the requested value or
1873 # to the memory usage measured by HTCondor (MemoryUsage), whichever is
1874 # greater.
1875 expr = (
1876 f"({was_mem_exceeded}) "
1877 f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) "
1878 f": max({{{memory}, MemoryUsage ?: 0}})"
1879 )
1880 return expr
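# Illustrative sketch (example values, not from the original module): for
# memory=2048, multiplier=2.0, limit=8192 the returned expression (the name
# suggests it is meant for the job's request_memory) reads, reformatted for
# readability:
#
#   (LastJobStatus =?= 5
#    && (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0
#        || LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34))
#   ? min({int(2048 * pow(2.0, NumJobStarts)), 8192})
#   : max({2048, MemoryUsage ?: 0})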
1883def _locate_schedds(locate_all=False):
1884 """Locate Scheduler daemons in an HTCondor pool.
1886 Parameters
1887 ----------
1888 locate_all : `bool`, optional
1889 If True, all available schedulers in the HTCondor pool will be located.
1890 Defaults to False, which limits the search to the Scheduler running on
1891 the local host.
1893 Returns
1894 -------
1895 schedds : `dict` [`str`, `htcondor.Schedd`]
1896 A mapping between Scheduler names and Python objects allowing for
1897 interacting with them.
1898 """
1899 coll = htcondor.Collector()
1901 schedd_ads = []
1902 if locate_all:
1903 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
1904 else:
1905 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
1906 return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}
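# Illustrative usage sketch (requires a reachable HTCondor pool):
#
#   schedds = _locate_schedds()                  # local Schedd only
#   schedds = _locate_schedds(locate_all=True)   # every Schedd in the pool
#   # The mapping can then be passed to helpers such as condor_q(), e.g.
#   # condor_q(constraint="ClusterId == 1234", schedds=schedds)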
1909def _gather_site_values(config, compute_site):
1910 """Gather values specific to the given site.
1912 Parameters
1913 ----------
1914 config : `lsst.ctrl.bps.BpsConfig`
1915 BPS configuration that includes necessary submit/runtime
1916 information.
1917 compute_site : `str`
1918 Compute site name.
1920 Returns
1921 -------
1922 site_values : `dict` [`str`, `Any`]
1923 Values specific to the given site.
1924 """
1925 site_values = {"attrs": {}, "profile": {}}
1926 search_opts = {}
1927 if compute_site:
1928 search_opts["curvals"] = {"curr_site": compute_site}
1930 # Determine the hard limit for the memory requirement.
1931 found, limit = config.search("memoryLimit", opt=search_opts)
1932 if not found:
1933 search_opts["default"] = DEFAULT_HTC_EXEC_PATT
1934 _, patt = config.search("executeMachinesPattern", opt=search_opts)
1935 del search_opts["default"]
1937 # To reduce the amount of data, ignore dynamic slots (if any) as,
1938 # by definition, they cannot have more memory than
1939 # the partitionable slot they are part of.
1940 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)'
1941 pool_info = condor_status(constraint=constraint)
1942 try:
1943 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values())
1944 except ValueError:
1945 _LOG.debug("No execute machine in the pool matches %s", patt)
1946 if limit:
1947 config[".bps_defined.memory_limit"] = limit
1949 _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False})
1950 site_values["memoryLimit"] = limit
1952 found, value = config.search("accountingGroup", opt=search_opts)
1953 if found:
1954 site_values["accountingGroup"] = value
1955 found, value = config.search("accountingUser", opt=search_opts)
1956 if found:
1957 site_values["accountingUser"] = value
1959 key = f".site.{compute_site}.profile.condor"
1960 if key in config:
1961 for key, val in config[key].items():
1962 if key.startswith("+"):
1963 site_values["attrs"][key[1:]] = val
1964 else:
1965 site_values["profile"][key] = val
1967 return site_values
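# Illustrative sketch of the returned mapping (hypothetical values; the
# accounting keys appear only when found in the BPS configuration):
#
#   {
#       "attrs": {"MyCustomAttr": "value"},   # from '+'-prefixed condor keys
#       "profile": {"request_disk": "2GB"},   # remaining condor profile keys
#       "bpsUseShared": False,
#       "memoryLimit": 32768,
#       "accountingGroup": "some_group",
#       "accountingUser": "some_user",
#   }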