# python/lsst/ctrl/bps/wms/htcondor/htcondor_service.py
# This file is part of ctrl_bps.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

"""Interface between the generic workflow and the HTCondor workflow system.
"""

__all__ = ["HTCondorService", "HTCondorWorkflow"]


import dataclasses
import os
import re
import logging
from datetime import datetime, timedelta
from pathlib import Path

import htcondor

from ... import (
    BaseWmsWorkflow,
    BaseWmsService,
    GenericWorkflow,
    GenericWorkflowJob,
    WmsRunReport,
    WmsJobReport,
    WmsStates
)
from ...bps_utils import chdir
from .lssthtc import (
    HTCDag,
    HTCJob,
    MISSING_ID,
    JobStatus,
    NodeStatus,
    htc_check_dagman_output,
    htc_escape,
    htc_submit_dag,
    read_node_status,
    read_dag_log,
    read_dag_status,
    condor_q,
    condor_history,
    pegasus_name_to_label,
    summary_from_dag,
)


_LOG = logging.getLogger(__name__)


class HTCondorService(BaseWmsService):
    """HTCondor version of WMS service.
    """
    def prepare(self, config, generic_workflow, out_prefix=None):
        """Convert generic workflow to an HTCondor DAG ready for submission.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The generic workflow (e.g., has executable name and arguments).
        out_prefix : `str`
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow`
            HTCondor workflow ready to be run.
        """
        _LOG.debug("out_prefix = '%s'", out_prefix)
        workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix,
                                                          f"{self.__class__.__module__}."
                                                          f"{self.__class__.__name__}")
        workflow.write(out_prefix)
        return workflow

    def submit(self, workflow):
        """Submit a single HTCondor workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWorkflow`
            A single HTCondor workflow to submit. run_id is updated after
            successful submission to WMS.
        """
        # For workflow portability, internal paths are all relative. Hence
        # the DAG needs to be submitted to HTCondor from inside the submit
        # directory.
        with chdir(workflow.submit_path):
            _LOG.info("Submitting from directory: %s", os.getcwd())
            htc_submit_dag(workflow.dag, dict())
            workflow.run_id = workflow.dag.run_id
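
    # A minimal usage sketch (``config``, ``generic_workflow``, and the
    # submit directory are hypothetical; BpsConfig construction omitted):
    #
    #     service = HTCondorService(config)
    #     workflow = service.prepare(config, generic_workflow, out_prefix="submit/run1")
    #     service.submit(workflow)
    #     print(workflow.run_id)  # id assigned by HTCondor at submission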

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create a list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions. Typically
            this means top-level jobs (i.e., not child jobs).
        """
        _LOG.debug("list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s",
                   wms_id, user, require_bps, pass_thru)
        constraint = ""

        if wms_id is None:
            if user is not None:
                constraint = f'(Owner == "{user}")'
        else:
            cluster_id = _wms_id_to_cluster(wms_id)
            if cluster_id != 0:
                constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"

        if require_bps:
            constraint += ' && (bps_isjob == "True")'

        if pass_thru:
            if "-forcex" in pass_thru:
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f" && ({pass_thru_2})"
            else:
                constraint += f" && ({pass_thru})"

        _LOG.debug("constraint = %s", constraint)
        jobs = condor_q(constraint)

        # Prune child jobs where the DAG job is in the queue (i.e., they
        # aren't orphans).
        job_ids = []
        for job_id, job_info in jobs.items():
            _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_info.get("DAGManJobId", "None"))
            if "DAGManJobId" not in job_info:  # orphaned job
                job_ids.append(job_id)
            else:
                _LOG.debug("Looking for %s", f"{job_info['DAGManJobId']}.0")
                _LOG.debug("\tin jobs.keys() = %s", jobs.keys())
                if f"{job_info['DAGManJobId']}.0" not in jobs:
                    job_ids.append(job_id)

        _LOG.debug("job_ids = %s", job_ids)
        return job_ids
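
    # For illustration (hypothetical id): ``wms_id=1234`` with the default
    # ``require_bps=True`` and no ``pass_thru`` produces the constraint
    #
    #     (DAGManJobId == 1234 || ClusterId == 1234) && (bps_isjob == "True")
    #
    # which matches the DAGMan job and its children; the pruning loop then
    # keeps the DAGMan job itself (it has no DAGManJobId) plus any orphaned
    # children whose managing DAG is no longer in the queue.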

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None):
        """Return run information based upon given constraints.

        Parameters
        ----------
        wms_workflow_id : `str`
            Limit to specific run based on id.
        user : `str`
            Limit results to runs for this user.
        hist : `float`
            Limit history search to this many days.
        pass_thru : `str`
            Constraints to pass through to HTCondor.

        Returns
        -------
        runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
            Information about runs from given job information.
        message : `str`
            Extra message for report command to print. This could be pointers
            to documentation or to WMS specific commands.
        """
        message = ""

        if wms_workflow_id:
            # Explicitly checking if wms_workflow_id can be converted to a
            # float instead of using try/except to avoid catching a different
            # ValueError from _report_from_id
            try:
                float(wms_workflow_id)
                is_float = True
            except ValueError:  # Don't need TypeError here as None goes to else branch.
                is_float = False

            if is_float:
                run_reports, message = _report_from_id(float(wms_workflow_id), hist)
            else:
                run_reports, message = _report_from_path(wms_workflow_id)
        else:
            run_reports, message = _summary_report(user, hist, pass_thru)
        _LOG.debug("report: %s, %s", run_reports, message)

        return list(run_reports.values()), message

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            ID or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether the deletion was successful. Currently returns False if
            there is any doubt or if any individual job was not deleted.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        _LOG.debug("Canceling wms_id = %s", wms_id)

        cluster_id = _wms_id_to_cluster(wms_id)
        if cluster_id == 0:
            deleted = False
            message = "Invalid id"
        else:
            _LOG.debug("Canceling cluster_id = %s", cluster_id)
            schedd = htcondor.Schedd()
            constraint = f"ClusterId == {cluster_id}"
            if pass_thru is not None and "-forcex" in pass_thru:
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f" && ({pass_thru_2})"
                _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.RemoveX, constraint)
            else:
                if pass_thru:
                    constraint += f" && ({pass_thru})"
                _LOG.debug("JobAction.Remove constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.Remove, constraint)
            _LOG.debug("Remove results: %s", results)

            if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
                deleted = True
                message = ""
            else:
                deleted = False
                if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
                    message = "no such bps job in batch queue"
                else:
                    message = f"unknown problems deleting: {results}"

        _LOG.debug("deleted: %s; message = %s", deleted, message)
        return deleted, message


class HTCondorWorkflow(BaseWmsWorkflow):
    """Single HTCondor workflow.

    Parameters
    ----------
    name : `str`
        Unique name for Workflow used when naming files.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """
    def __init__(self, name, config=None):
        super().__init__(name, config)
        self.dag = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited
        htc_workflow = cls(generic_workflow.name, config)
        htc_workflow.dag = HTCDag(name=generic_workflow.name)

        _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs({"bps_wms_service": service_class,
                                      "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}"})

        # Create all DAG jobs
        for job_name in generic_workflow:
            gwjob = generic_workflow.get_job(job_name)
            htc_job = HTCondorWorkflow._create_job(config, generic_workflow, gwjob, out_prefix)
            htc_workflow.dag.add_job(htc_job)

        # Add job dependencies to the DAG
        for job_name in generic_workflow:
            htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))

        # If final job exists in generic workflow, create DAG final job
        final = generic_workflow.get_final()
        if final and isinstance(final, GenericWorkflowJob):
            final_htjob = HTCondorWorkflow._create_job(config, generic_workflow, final, out_prefix)
            if "post" not in final_htjob.dagcmds:
                final_htjob.dagcmds["post"] = f"{os.path.dirname(__file__)}/final_post.sh" \
                                              f" {final.name} $DAG_STATUS $RETURN"
            htc_workflow.dag.add_final_job(final_htjob)
        elif final and isinstance(final, GenericWorkflow):
            raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
        elif final:
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

        return htc_workflow

    @staticmethod
    def _create_job(config, generic_workflow, gwjob, out_prefix):
        """Convert GenericWorkflow job nodes to DAG jobs.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow that is being converted.
        gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
            The generic job to convert to an HTCondor job.
        out_prefix : `str`
            Directory prefix for HTCondor files.

        Returns
        -------
        htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
            The HTCondor job equivalent to the given generic job.
        """
        htc_job = HTCJob(gwjob.name, label=gwjob.label)

        curvals = dataclasses.asdict(gwjob)
        if gwjob.tags:
            curvals.update(gwjob.tags)
        found, subdir = config.search("subDirTemplate", opt={'curvals': curvals})
        if not found:
            subdir = "jobs"
        htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"

        htc_job_cmds = {
            "universe": "vanilla",
            "should_transfer_files": "YES",
            "when_to_transfer_output": "ON_EXIT_OR_EVICT",
            "transfer_executable": "False",
            "getenv": "True",

            # Exceeding memory sometimes triggers a SIGBUS error. Tell
            # HTCondor to put SIGBUS jobs on hold.
            "on_exit_hold": "(ExitBySignal == true) && (ExitSignal == 7)",
            "on_exit_hold_reason": '"Job raised a signal 7. Usually means job has gone over memory limit."',
            "on_exit_hold_subcode": "34"
        }

        htc_job_cmds.update(_translate_job_cmds(config, generic_workflow, gwjob))

        # Job stdout, stderr, and the HTCondor user log.
        for key in ("output", "error", "log"):
            htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
            _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])

        _, use_shared = config.search("bpsUseShared", opt={"default": False})
        htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwjob.name, use_shared, out_prefix))

        # Add the job cmds dict to the job object.
        htc_job.add_job_cmds(htc_job_cmds)

        htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))

        # Add run level attributes to job.
        htc_job.add_job_attrs(generic_workflow.run_attrs)

        # Add job attributes to job.
        _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
        htc_job.add_job_attrs(gwjob.attrs)
        if gwjob.tags:
            htc_job.add_job_attrs({"bps_job_quanta": gwjob.tags.get("quanta_summary", "")})
        htc_job.add_job_attrs({"bps_job_name": gwjob.name,
                               "bps_job_label": gwjob.label})

        return htc_job
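
    # A sketch of submit-description lines produced above for a job named
    # ``calibrate_1`` (job name and subdirectory are illustrative):
    #
    #     universe = vanilla
    #     on_exit_hold = (ExitBySignal == true) && (ExitSignal == 7)
    #     output = jobs/<subdir>/calibrate_1.$(Cluster).out
    #     error = jobs/<subdir>/calibrate_1.$(Cluster).err
    #     log = jobs/<subdir>/calibrate_1.$(Cluster).log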

    def write(self, out_prefix):
        """Output HTCondor DAGMan files needed for workflow submission.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for HTCondor files.
        """
        self.submit_path = out_prefix
        os.makedirs(out_prefix, exist_ok=True)

        # Write down the workflow in HTCondor format. The "jobs/{self.label}"
        # argument is a per-job subdirectory template that is formatted
        # downstream, hence deliberately not an f-string.
        self.dag.write(out_prefix, "jobs/{self.label}")


def _translate_job_cmds(config, generic_workflow, gwjob):
    """Translate the job data that are one-to-one mappings to HTCondor
    submit commands.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains the job being converted.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Generic workflow job to be converted.

    Returns
    -------
    htc_job_commands : `dict` [`str`, `Any`]
        Contains commands which can appear in the HTCondor submit description
        file.
    """
    # Values in the job script that are just name mappings.
    job_translation = {"mail_to": "notify_user",
                       "when_to_mail": "notification",
                       "request_cpus": "request_cpus",
                       "priority": "priority",
                       "category": "category"}

    jobcmds = {}
    for gwkey, htckey in job_translation.items():
        jobcmds[htckey] = getattr(gwjob, gwkey, None)

    # Job commands that need modification.
    if gwjob.request_disk:
        jobcmds["request_disk"] = f"{gwjob.request_disk}MB"

    if gwjob.request_memory:
        jobcmds["request_memory"] = f"{gwjob.request_memory}MB"

    # Assume concurrency_limit implemented using HTCondor concurrency limits.
    # May need to move to special site-specific implementation if sites use
    # other mechanisms.
    if gwjob.concurrency_limit:
        jobcmds["concurrency_limit"] = ",".join(gwjob.concurrency_limit)

    # Handle command line.
    if gwjob.executable.transfer_executable:
        jobcmds["transfer_executable"] = "True"
        jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
    else:
        jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)

    if gwjob.arguments:
        arguments = gwjob.arguments
        arguments = _replace_cmd_vars(arguments, gwjob)
        arguments = _replace_file_vars(config, arguments, generic_workflow, gwjob)
        arguments = _fix_env_var_syntax(arguments)
        jobcmds["arguments"] = arguments

    # Add extra "pass-thru" job commands.
    if gwjob.profile:
        for key, val in gwjob.profile.items():
            jobcmds[key] = htc_escape(val)

    return jobcmds


def _translate_dag_cmds(gwjob):
    """Translate job values into DAGMan commands.

    Parameters
    ----------
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job containing values to be translated.

    Returns
    -------
    dagcmds : `dict` [`str`, `Any`]
        DAGMan commands for the job.
    """
    # Values in the dag script that are just name mappings.
    dag_translation = {"number_of_retries": "retry",
                       "retry_unless_exit": "retry_unless_exit",
                       "abort_on_value": "abort_dag_on",
                       "abort_return_value": "abort_exit"}

    dagcmds = {}
    for gwkey, htckey in dag_translation.items():
        dagcmds[htckey] = getattr(gwjob, gwkey, None)

    # Still to be coded: vars "pre_cmdline", "post_cmdline"
    return dagcmds


def _fix_env_var_syntax(oldstr):
    """Change ENV placeholders to HTCondor environment variable syntax.

    Parameters
    ----------
    oldstr : `str`
        String in which environment variable syntax is to be fixed.

    Returns
    -------
    newstr : `str`
        Given string with environment variable syntax fixed.
    """
    newstr = oldstr
    for key in re.findall(r"<ENV:([^>]+)>", oldstr):
        newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
    return newstr
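

# For example, converting a BPS environment placeholder (variable name chosen
# for illustration):
#
#     _fix_env_var_syntax("<ENV:CTRL_MPEXEC_DIR>/bin/pipetask")
#     # -> "$ENV(CTRL_MPEXEC_DIR)/bin/pipetask"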


def _replace_file_vars(config, arguments, workflow, gwjob):
    """Replace file placeholders in command line arguments with correct
    physical file names.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information.
    arguments : `str`
        Arguments string in which to replace file placeholders.
    workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains file information.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        The job corresponding to the arguments.

    Returns
    -------
    arguments : `str`
        Given arguments string with file placeholders replaced.
    """
    _, use_shared = config.search("bpsUseShared", opt={"default": False})

    # Replace input file placeholders with paths. Use the basename when the
    # WMS transfers the file into the job sandbox (i.e., it is not reachable
    # via a shared filesystem).
    for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
        if gwfile.wms_transfer and (not use_shared or not gwfile.job_shared):
            uri = os.path.basename(gwfile.src_uri)
        else:
            uri = gwfile.src_uri
        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)

    # Replace output file placeholders with paths.
    for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
        if gwfile.wms_transfer and (not use_shared or not gwfile.job_shared):
            uri = os.path.basename(gwfile.src_uri)
        else:
            uri = gwfile.src_uri
        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
    return arguments


def _replace_cmd_vars(arguments, gwjob):
    """Replace format-style placeholders in arguments.

    Parameters
    ----------
    arguments : `str`
        Arguments string in which to replace placeholders.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job containing values to be used to replace placeholders
        (in particular gwjob.cmdvals).

    Returns
    -------
    arguments : `str`
        Given arguments string with placeholders replaced.
    """
    try:
        arguments = arguments.format(**gwjob.cmdvals)
    except (KeyError, TypeError):  # TypeError in case None instead of {}
        _LOG.error("Could not replace command variables:\n"
                   "arguments: %s\n"
                   "cmdvals: %s", arguments, gwjob.cmdvals)
        raise
    return arguments
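

# For example (hypothetical placeholder name): with
# ``gwjob.cmdvals == {"qgraphFile": "run.qgraph"}``, the arguments string
# "-g {qgraphFile}" becomes "-g run.qgraph"; a placeholder with no matching
# entry in cmdvals raises KeyError after the error is logged.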


def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str):
    """Add job input files from generic workflow to job.

    Parameters
    ----------
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        The generic workflow (e.g., has executable name and arguments).
    job_name : `str`
        Unique name for the job.
    use_shared : `bool`
        Whether job has access to files via shared filesystem.
    out_prefix : `str`
        The root directory into which all WMS-specific files are written.

    Returns
    -------
    htc_commands : `dict` [`str`, `str`]
        HTCondor commands for the job submission script.
    """
    htc_commands = {}
    inputs = []
    for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
        _LOG.debug("src_uri=%s", gwf_file.src_uri)
        if not use_shared or not gwf_file.job_shared:
            inputs.append(os.path.relpath(gwf_file.src_uri, out_prefix))

    if inputs:
        htc_commands["transfer_input_files"] = ",".join(inputs)
        _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
    return htc_commands
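

# For two transferable inputs (hypothetical paths, shown relative to the
# submit directory), the returned commands would be:
#
#     {"transfer_input_files": "inputs/butler.yaml,inputs/run.qgraph"}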


def _report_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        The directory containing the submit side files (e.g., HTCondor files).

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
    if wms_workflow_id == MISSING_ID:
        run_reports = {}
    else:
        run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
    return run_reports, message


def _report_from_id(wms_workflow_id, hist):
    """Gather run information for a given run id.

    Parameters
    ----------
    wms_workflow_id : `int` or `str`
        Limit to specific run based on id.
    hist : `float`
        Limit history search to this many days.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    constraint = f"(DAGManJobId == {int(float(wms_workflow_id))} || ClusterId == " \
                 f"{int(float(wms_workflow_id))})"
    jobs = condor_q(constraint)
    if hist:
        epoch = (datetime.now() - timedelta(days=hist)).timestamp()
        constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
        hist_jobs = condor_history(constraint)
        _update_jobs(jobs, hist_jobs)

    # Keys in the dictionary will be strings of format "ClusterId.ProcId".
    wms_workflow_id = str(wms_workflow_id)
    if not wms_workflow_id.endswith(".0"):
        wms_workflow_id += ".0"

    if wms_workflow_id in jobs:
        _, path_jobs, message = _get_info_from_path(jobs[wms_workflow_id]["Iwd"])
        _update_jobs(jobs, path_jobs)
        run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
    else:
        run_reports = {}
        message = f"Found 0 records for run id {wms_workflow_id}"
    return run_reports, message
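

# Id normalization example: a caller-supplied id of 1234 (or "1234.0") is
# used as the integer cluster id in the HTCondor constraint, then rewritten
# to the "ClusterId.ProcId" form "1234.0" for the dictionary lookups above.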


def _get_info_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        Directory containing HTCondor files.

    Returns
    -------
    wms_workflow_id : `str`
        The run id which is a DAGMan job id.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Information about jobs read from files in the given directory.
        The key is the HTCondor id and the value is a dictionary of HTCondor
        keys and values.
    message : `str`
        Message to be printed with the summary report.
    """
    try:
        wms_workflow_id, jobs = read_dag_log(wms_path)
        _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
        _update_jobs(jobs, read_node_status(wms_path))
        _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)

        # Add more info for the DAGMan job.
        job = jobs[wms_workflow_id]
        job.update(read_dag_status(wms_path))
        job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
        if "bps_run" not in job:
            _add_run_info(wms_path, job)

        message = htc_check_dagman_output(wms_path)
        _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id,
                   jobs[wms_workflow_id]["total_jobs"])
    except StopIteration:
        message = f"Could not find HTCondor files in {wms_path}"
        _LOG.warning(message)
        wms_workflow_id = MISSING_ID
        jobs = {}

    return wms_workflow_id, jobs, message


def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
    """Gather run information to be used in generating a detailed report.

    Parameters
    ----------
    wms_workflow_id : `str`
        The HTCondor id of the DAGMan job for the run.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Mapping of HTCondor job id to HTCondor job information for the run.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the given HTCondor
        id and the value is a collection of report information for that run.
    """
    _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
    dag_job = jobs[wms_workflow_id]
    if "total_jobs" not in dag_job or "DAGNodeName" in dag_job:
        _LOG.error("Job ID %s is not a DAG job.", wms_workflow_id)
        return {}
    report = WmsRunReport(wms_id=wms_workflow_id,
                          path=dag_job["Iwd"],
                          label=dag_job.get("bps_job_label", "MISS"),
                          run=dag_job.get("bps_run", "MISS"),
                          project=dag_job.get("bps_project", "MISS"),
                          campaign=dag_job.get("bps_campaign", "MISS"),
                          payload=dag_job.get("bps_payload", "MISS"),
                          operator=_get_owner(dag_job),
                          run_summary=_get_run_summary(dag_job),
                          state=_htc_status_to_wms_state(dag_job),
                          jobs=[],
                          total_number_jobs=dag_job["total_jobs"],
                          job_state_counts=dag_job["state_counts"])

    try:
        for job in jobs.values():
            if job["ClusterId"] != int(float(wms_workflow_id)):
                job_report = WmsJobReport(wms_id=job["ClusterId"],
                                          name=job.get("DAGNodeName", str(job["ClusterId"])),
                                          label=job.get("bps_job_label",
                                                        pegasus_name_to_label(job["DAGNodeName"])),
                                          state=_htc_status_to_wms_state(job))
                if job_report.label == "init":
                    job_report.label = "pipetaskInit"
                report.jobs.append(job_report)
    except KeyError as ex:
        _LOG.error("Job missing key '%s': %s", str(ex), job)
        raise

    run_reports = {report.wms_id: report}
    _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
    return run_reports


def _summary_report(user, hist, pass_thru):
    """Gather run information to be used in generating summary reports.

    Parameters
    ----------
    user : `str`
        Run lookup restricted to given user.
    hist : `float`
        How many previous days to search for run information.
    pass_thru : `str`
        Advanced users can define the HTCondor constraint to be used
        when searching queue and history.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the summary report. The keys are HTCondor ids and
        the values are collections of report information for each run.
    message : `str`
        Message to be printed with the summary report.
    """
    # Only doing a summary report, so only look for DAGMan jobs.
    if pass_thru:
        constraint = pass_thru
    else:
        # Notes:
        # * bps_isjob == 'True' isn't getting set for DAG jobs that are
        #   manually restarted.
        # * Any job with a DAGManJobId isn't a DAG job.
        constraint = 'bps_isjob == "True" && JobUniverse == 7'
        if user:
            constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'

    # Check runs in queue.
    jobs = condor_q(constraint)

    if hist:
        epoch = (datetime.now() - timedelta(days=hist)).timestamp()
        constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
        hist_jobs = condor_history(constraint)
        _update_jobs(jobs, hist_jobs)

    _LOG.debug("Job ids from queue and history %s", jobs.keys())

    # Have list of DAGMan jobs, need to get run_report info.
    run_reports = {}
    for job in jobs.values():
        total_jobs, state_counts = _get_state_counts_from_dag_job(job)
        # If didn't get the counts from the queue information (e.g., Kerberos
        # bug), try reading from file.
        if total_jobs == 0:
            try:
                job.update(read_dag_status(job["Iwd"]))
                total_jobs, state_counts = _get_state_counts_from_dag_job(job)
            except StopIteration:
                # Don't kill the entire report just because the HTCondor
                # files can't be found for this run.
                pass

        if "bps_run" not in job:
            _add_run_info(job["Iwd"], job)
        report = WmsRunReport(wms_id=str(job.get("ClusterId", MISSING_ID)),
                              path=job["Iwd"],
                              label=job.get("bps_job_label", "MISS"),
                              run=job.get("bps_run", "MISS"),
                              project=job.get("bps_project", "MISS"),
                              campaign=job.get("bps_campaign", "MISS"),
                              payload=job.get("bps_payload", "MISS"),
                              operator=_get_owner(job),
                              run_summary=_get_run_summary(job),
                              state=_htc_status_to_wms_state(job),
                              jobs=[],
                              total_number_jobs=total_jobs,
                              job_state_counts=state_counts)

        run_reports[report.wms_id] = report

    return run_reports, ""


def _add_run_info(wms_path, job):
    """Find BPS run information elsewhere for runs without bps attributes.

    Parameters
    ----------
    wms_path : `str`
        Path to submit files for the run.
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Raises
    ------
    StopIteration
        If the file it is looking for cannot be found. Permission errors are
        caught and the job's run is marked with an error instead.
    """
    path = Path(wms_path) / "jobs"
    try:
        jobdir = next(path.glob("*"), Path(wms_path))
        try:
            subfile = next(jobdir.glob("*.sub"))
            _LOG.debug("_add_run_info: subfile = %s", subfile)
            with open(subfile, "r") as fh:
                for line in fh:
                    if line.startswith("+bps_"):
                        m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
                        if m:
                            _LOG.debug("Matching line: %s", line)
                            job[m.group(1)] = m.group(2).replace('"', "")
                        else:
                            _LOG.debug("Could not parse attribute: %s", line)
        except StopIteration:
            job["bps_run"] = "Missing"
    except PermissionError:
        job["bps_run"] = "PermissionError"
    _LOG.debug("After adding job = %s", job)


def _get_owner(job):
    """Get the owner of a dag job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    owner : `str`
        Owner of the dag job.
    """
    owner = job.get("bps_operator", None)
    if not owner:
        owner = job.get("Owner", None)
        if not owner:
            _LOG.warning("Could not get Owner from htcondor job: %s", job)
            owner = "MISS"
    return owner


def _get_run_summary(job):
    """Get the run summary for a job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    summary : `str`
        Number of jobs per PipelineTask label in approximate pipeline order.
        Format: <label>:<count>[;<label>:<count>]+
    """
    summary = job.get("bps_run_summary", None)
    if not summary:
        summary, _ = summary_from_dag(job["Iwd"])
        if not summary:
            _LOG.warning("Could not get run summary for htcondor job: %s", job)
    _LOG.debug("_get_run_summary: summary=%s", summary)

    # Workaround for the summary sometimes using "init" instead of
    # "pipetaskInit".
    summary = summary.replace("init:", "pipetaskInit:")

    if "pegasus_version" in job and "pegasus" not in summary:
        summary += ";pegasus:0"

    return summary


def _get_state_counts_from_jobs(wms_workflow_id, jobs):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    wms_workflow_id : `str`
        HTCondor job id.
    jobs : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    state_counts = dict.fromkeys(WmsStates, 0)

    for jid, jinfo in jobs.items():
        if jid != wms_workflow_id:
            state_counts[_htc_status_to_wms_state(jinfo)] += 1

    total_counted = sum(state_counts.values())
    if "NodesTotal" in jobs[wms_workflow_id]:
        total_count = jobs[wms_workflow_id]["NodesTotal"]
    else:
        total_count = total_counted

    state_counts[WmsStates.UNREADY] += total_count - total_counted

    return total_count, state_counts
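

# Worked example (hypothetical counts): if the DAG reports NodesTotal = 10
# but only 7 child jobs appear in the queue/log information (say 5 SUCCEEDED
# and 2 RUNNING), the remaining 3 nodes are credited to WmsStates.UNREADY so
# the per-state counts still sum to the total.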


def _get_state_counts_from_dag_job(job):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
    state_counts = dict.fromkeys(WmsStates, 0)
    if "DAG_NodesReady" in job:
        state_counts = {
            WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
            WmsStates.READY: job.get("DAG_NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
            WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
            WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)}
        total_jobs = job.get("DAG_NodesTotal")
        _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
    elif "NodesFailed" in job:
        state_counts = {
            WmsStates.UNREADY: job.get("NodesUnready", 0),
            WmsStates.READY: job.get("NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("NodesDone", 0),
            WmsStates.FAILED: job.get("NodesFailed", 0),
            WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)}
        try:
            total_jobs = job["NodesTotal"]
        except KeyError as ex:
            _LOG.error("Job missing %s. job = %s", str(ex), job)
            raise
        _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
    else:
        # With Kerberos job auth and the Kerberos bug, a warning here would
        # be printed for every DAG, so log at debug level instead.
        _LOG.debug("Can't get job state counts %s", job["Iwd"])
        total_jobs = 0

    _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
    return total_jobs, state_counts


def _htc_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `WmsStates`
        The equivalent WmsState to given job's status.
    """
    wms_state = WmsStates.MISFIT
    if "JobStatus" in job:
        wms_state = _htc_job_status_to_wms_state(job)
    elif "NodeStatus" in job:
        wms_state = _htc_node_status_to_wms_state(job)
    return wms_state


def _htc_job_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given job's status.
    """
    _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"],
               type(job["JobStatus"]))
    job_status = int(job["JobStatus"])
    wms_state = WmsStates.MISFIT

    _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
    if job_status == JobStatus.IDLE:
        wms_state = WmsStates.PENDING
    elif job_status == JobStatus.RUNNING:
        wms_state = WmsStates.RUNNING
    elif job_status == JobStatus.REMOVED:
        wms_state = WmsStates.DELETED
    elif job_status == JobStatus.COMPLETED:
        if job.get("ExitBySignal", False) or job.get("ExitCode", 0) or \
                job.get("ExitSignal", 0) or job.get("DAG_Status", 0) or \
                job.get("ReturnValue", 0):
            wms_state = WmsStates.FAILED
        else:
            wms_state = WmsStates.SUCCEEDED
    elif job_status == JobStatus.HELD:
        wms_state = WmsStates.HELD

    return wms_state


def _htc_node_status_to_wms_state(job):
    """Convert HTCondor node status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given node's status.
    """
    wms_state = WmsStates.MISFIT

    status = job["NodeStatus"]
    if status == NodeStatus.NOT_READY:
        wms_state = WmsStates.UNREADY
    elif status == NodeStatus.READY:
        wms_state = WmsStates.READY
    elif status == NodeStatus.PRERUN:
        wms_state = WmsStates.MISFIT
    elif status == NodeStatus.SUBMITTED:
        if job["JobProcsHeld"]:
            wms_state = WmsStates.HELD
        elif job["StatusDetails"] == "not_idle":
            wms_state = WmsStates.RUNNING
        elif job["JobProcsQueued"]:
            wms_state = WmsStates.PENDING
    elif status == NodeStatus.POSTRUN:
        wms_state = WmsStates.MISFIT
    elif status == NodeStatus.DONE:
        wms_state = WmsStates.SUCCEEDED
    elif status == NodeStatus.ERROR:
        wms_state = WmsStates.FAILED

    return wms_state


def _update_jobs(jobs1, jobs2):
    """Update jobs1 with info in jobs2.

    (Basically an update for nested dictionaries.)

    Parameters
    ----------
    jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
        HTCondor job information to be updated.
    jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
        Additional HTCondor job information.
    """
    for jid, jinfo in jobs2.items():
        if jid in jobs1:
            jobs1[jid].update(jinfo)
        else:
            jobs1[jid] = jinfo
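

# A minimal illustration of the merge semantics (hypothetical values):
#
#     jobs1 = {"1234.0": {"JobStatus": 2}}
#     jobs2 = {"1234.0": {"bps_run": "my_run"}, "1235.0": {"JobStatus": 1}}
#     _update_jobs(jobs1, jobs2)
#     # jobs1 == {"1234.0": {"JobStatus": 2, "bps_run": "my_run"},
#     #           "1235.0": {"JobStatus": 1}}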


def _wms_id_to_cluster(wms_id):
    """Convert WMS ID to cluster ID.

    Parameters
    ----------
    wms_id : `int` or `float` or `str`
        HTCondor job id or path.

    Returns
    -------
    cluster_id : `int`
        HTCondor cluster id (0 if the id could not be determined).
    """
    # If wms_id represents a path, get the numeric id from the DAG log.
    try:
        cluster_id = int(float(wms_id))
    except ValueError:
        wms_path = Path(wms_id)
        if wms_path.exists():
            try:
                cluster_id, _ = read_dag_log(wms_id)
                cluster_id = int(float(cluster_id))
            except StopIteration:
                cluster_id = 0
        else:
            cluster_id = 0
    return cluster_id
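

# For example, _wms_id_to_cluster("1234.0") returns 1234, while passing a
# submit directory path (e.g., a hypothetical "submit/run1") makes it read
# the cluster id from that directory's DAGMan log, returning 0 if none can
# be found.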