Coverage for python/lsst/ctrl/bps/wms/htcondor/htcondor_service.py : 1%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
22"""Interface between generic workflow to HTCondor workflow system.
23"""
25__all__ = ["HTCondorService", "HTCondorWorkflow"]
28import os
29import re
30import logging
31from datetime import datetime, timedelta
32from pathlib import Path
34import htcondor
36from ... import (
37 BaseWmsWorkflow,
38 BaseWmsService,
39 GenericWorkflow,
40 WmsRunReport,
41 WmsJobReport,
42 WmsStates
43)
44from ...bps_utils import chdir
45from .lssthtc import (
46 HTCDag,
47 HTCJob,
48 MISSING_ID,
49 JobStatus,
50 NodeStatus,
51 htc_check_dagman_output,
52 htc_escape,
53 htc_submit_dag,
54 read_node_status,
55 read_dag_log,
56 read_dag_status,
57 condor_q,
58 condor_history,
59 pegasus_name_to_label,
60 summary_from_dag,
61)
# Module-level logger, named after this module per lsst.ctrl.bps convention.
_LOG = logging.getLogger(__name__)
class HTCondorService(BaseWmsService):
    """HTCondor version of WMS service.
    """
    def prepare(self, config, generic_workflow, out_prefix=None):
        """Convert generic workflow to an HTCondor DAG ready for submission.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BPSConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The generic workflow (e.g., has executable name and arguments).
        out_prefix : `str`
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow`
            HTCondor workflow ready to be run.
        """
        _LOG.debug("out_prefix = '%s'", out_prefix)
        # Pass the fully qualified service class name so it is recorded in
        # the DAG attributes.
        workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix,
                                                          f"{self.__class__.__module__}."
                                                          f"{self.__class__.__name__}")
        workflow.write(out_prefix)
        return workflow

    def submit(self, workflow):
        """Submit a single HTCondor workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWorkflow`
            A single HTCondor workflow to submit. run_id is updated after
            successful submission to WMS.
        """
        # For workflow portability, internal paths are all relative. Hence
        # the DAG needs to be submitted to HTCondor from inside the submit
        # directory.
        with chdir(workflow.submit_path):
            _LOG.info("Submitting from directory: %s", os.getcwd())
            htc_submit_dag(workflow.dag, dict())
            workflow.run_id = workflow.dag.run_id

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions. Typically
            this means top-level jobs (i.e., not children jobs).
        """
        _LOG.debug("list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s",
                   wms_id, user, require_bps, pass_thru)
        constraint = ""

        if wms_id is None:
            if user is not None:
                constraint = f'(Owner == "{user}")'
        else:
            # A cluster id of 0 means the id/path lookup failed.
            cluster_id = _wms_id_to_cluster(wms_id)
            if cluster_id != 0:
                constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"

        if require_bps:
            # NOTE(review): if both wms_id and user are None, constraint is
            # still empty here, so the leading " && " produces an invalid
            # ClassAd expression -- confirm callers always supply one of them.
            constraint += ' && (bps_isjob == "True")'

        if pass_thru:
            if "-forcex" in pass_thru:
                # "-forcex" is a cancel-only flag; strip it before using the
                # remainder as a ClassAd constraint.
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f"&& ({pass_thru_2})"
            else:
                constraint += f" && ({pass_thru})"

        _LOG.debug("constraint = %s", constraint)
        jobs = condor_q(constraint)

        # Prune child jobs where DAG job is in queue (i.e., aren't orphans).
        job_ids = []
        for job_id, job_info in jobs.items():
            _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_info.get("DAGManJobId", "None"))
            if "DAGManJobId" not in job_info:  # orphaned job
                job_ids.append(job_id)
            else:
                # Keep the child only if its parent DAG job is not in queue.
                _LOG.debug("Looking for %s", f"{job_info['DAGManJobId']}.0")
                _LOG.debug("\tin jobs.keys() = %s", jobs.keys())
                if f"{job_info['DAGManJobId']}.0" not in jobs:
                    job_ids.append(job_id)

        _LOG.debug("job_ids = %s", job_ids)
        return job_ids

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None):
        """Return run information based upon given constraints.

        Parameters
        ----------
        wms_workflow_id : `str`
            Limit to specific run based on id.
        user : `str`
            Limit results to runs for this user.
        hist : `float`
            Limit history search to this many days.
        pass_thru : `str`
            Constraints to pass through to HTCondor.

        Returns
        -------
        runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
            Information about runs from given job information.
        message : `str`
            Extra message for report command to print. This could be pointers
            to documentation or to WMS specific commands.
        """
        message = ""

        if wms_workflow_id:
            # Explicitly checking if wms_workflow_id can be converted to a
            # float instead of using try/except to avoid catching a different
            # ValueError from _report_from_id
            try:
                float(wms_workflow_id)
                is_float = True
            except ValueError:  # Don't need TypeError here as None goes to else branch.
                is_float = False

            if is_float:
                run_reports, message = _report_from_id(float(wms_workflow_id), hist)
            else:
                # Non-numeric ids are treated as submit directory paths.
                run_reports, message = _report_from_path(wms_workflow_id)
        else:
            run_reports, message = _summary_report(user, hist, pass_thru)
        _LOG.debug("report: %s, %s", run_reports, message)

        return list(run_reports.values()), message

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            ID or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether successful deletion or not. Currently, if any doubt or any
            individual jobs not deleted, return False.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        _LOG.debug("Canceling wms_id = %s", wms_id)

        cluster_id = _wms_id_to_cluster(wms_id)
        if cluster_id == 0:
            deleted = False
            message = "Invalid id"
        else:
            _LOG.debug("Canceling cluster_id = %s", cluster_id)
            schedd = htcondor.Schedd()
            constraint = f"ClusterId == {cluster_id}"
            if pass_thru is not None and "-forcex" in pass_thru:
                # RemoveX forcibly removes jobs already marked removed;
                # strip the flag before building the constraint.
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f"&& ({pass_thru_2})"
                _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.RemoveX, constraint)
            else:
                if pass_thru:
                    constraint += f"&& ({pass_thru})"
                _LOG.debug("JobAction.Remove constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.Remove, constraint)
            _LOG.debug("Remove results: %s", results)

            if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
                deleted = True
                message = ""
            else:
                deleted = False
                if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
                    message = "no such bps job in batch queue"
                else:
                    message = f"unknown problems deleting: {results}"

        _LOG.debug("deleted: %s; message = %s", deleted, message)
        return deleted, message
class HTCondorWorkflow(BaseWmsWorkflow):
    """Single HTCondor workflow.

    Parameters
    ----------
    name : `str`
        Unique name for Workflow used when naming files.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """
    def __init__(self, name, config=None):
        super().__init__(name, config)
        # DAG is built later by from_generic_workflow().
        self.dag = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited
        htc_workflow = cls(generic_workflow.name, config)
        htc_workflow.dag = HTCDag(name=generic_workflow.name)

        _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs({"bps_wms_service": service_class,
                                      "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}"})

        # Create all DAG jobs
        for job_name in generic_workflow:
            gwf_job = generic_workflow.get_job(job_name)
            htc_job = HTCondorWorkflow._create_job(generic_workflow, gwf_job, generic_workflow.run_attrs,
                                                   out_prefix)
            htc_workflow.dag.add_job(htc_job)

        # Add job dependencies to the DAG
        for job_name in generic_workflow:
            htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))
        return htc_workflow

    @staticmethod
    def _create_job(generic_workflow, gwf_job, run_attrs, out_prefix):
        """Convert GenericWorkflow job nodes to DAG jobs.

        Parameters
        ----------
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow that is being converted.
        gwf_job : `lsst.ctrl.bps.GenericWorkflowJob`
            The generic job to convert to a HTCondor job.
        run_attrs : `dict` [`str`, `str`]
            Attributes common to entire run that should be added to job.
        out_prefix : `str`
            Directory prefix for HTCondor files.

        Returns
        -------
        htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
            The HTCondor job equivalent to the given generic job.
        """
        htc_job = HTCJob(gwf_job.name, label=gwf_job.label)

        htc_job_cmds = {
            "universe": "vanilla",
            "should_transfer_files": "YES",
            "when_to_transfer_output": "ON_EXIT_OR_EVICT",
            "transfer_executable": "False",
            "getenv": "True",
        }

        htc_job_cmds.update(_translate_job_cmds(generic_workflow, gwf_job))

        # Job stdout, stderr, htcondor user log.  (The previous direct
        # assignments of these keys before this loop were dead code -- the
        # loop rebuilds the identical base names -- so they were removed.)
        for key in ("output", "error", "log"):
            htc_job_cmds[key] = f"{gwf_job.name}.$(Cluster).{key[:3]}"
            if gwf_job.label:
                htc_job_cmds[key] = os.path.join(gwf_job.label, htc_job_cmds[key])
            htc_job_cmds[key] = os.path.join("jobs", htc_job_cmds[key])
            _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])

        htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwf_job.name, out_prefix))

        # Add the job cmds dict to the job object.
        htc_job.add_job_cmds(htc_job_cmds)

        # Add run level attributes to job.
        htc_job.add_job_attrs(run_attrs)

        # Add job attributes to job.
        _LOG.debug("gwf_job.attrs = %s", gwf_job.attrs)
        htc_job.add_job_attrs(gwf_job.attrs)
        htc_job.add_job_attrs({"bps_job_name": gwf_job.name,
                               "bps_job_label": gwf_job.label,
                               "bps_job_quanta": gwf_job.tags.get("quanta_summary", "")})

        return htc_job

    def write(self, out_prefix):
        """Output HTCondor DAGMan files needed for workflow submission.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for HTCondor files.
        """
        self.submit_path = out_prefix
        os.makedirs(out_prefix, exist_ok=True)

        # Write down the workflow in HTCondor format.  The second argument
        # looks like a missing f-string but is presumably a template that
        # HTCDag.write formats per job -- TODO confirm against HTCDag.write.
        self.dag.write(out_prefix, "jobs/{self.label}")
def _translate_job_cmds(generic_workflow, generic_workflow_job):
    """Translate the job data that are one to one mapping

    Parameters
    ----------
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains job to being converted.
    generic_workflow_job : `lsst.ctrl.bps.GenericWorkflowJob`
        Generic workflow job to be converted.

    Returns
    -------
    htc_job_commands : `dict` [`str`, `Any`]
        Contains commands which can appear in the HTCondor submit description
        file.
    """
    jobcmds = {}

    # Simple one-to-one mappings; only set keys the job provides values for.
    if generic_workflow_job.mail_to:
        jobcmds["notify_user"] = generic_workflow_job.mail_to

    if generic_workflow_job.when_to_mail:
        jobcmds["notification"] = generic_workflow_job.when_to_mail

    if generic_workflow_job.request_cpus:
        jobcmds["request_cpus"] = generic_workflow_job.request_cpus

    if generic_workflow_job.request_disk:
        jobcmds["request_disk"] = f"{generic_workflow_job.request_disk}MB"

    if generic_workflow_job.request_memory:
        jobcmds["request_memory"] = f"{generic_workflow_job.request_memory}MB"

    if generic_workflow_job.priority:
        jobcmds["priority"] = generic_workflow_job.priority

    # Split executable from its arguments.
    try:
        cmd_parts = generic_workflow_job.cmdline.split(" ", 1)
    except AttributeError:
        # Log (instead of the previous bare print) so the offending job is
        # visible in the BPS log before the exception propagates.
        _LOG.error("Job missing cmdline: %s", generic_workflow_job)
        raise
    jobcmds["executable"] = _fix_env_var_syntax(cmd_parts[0])
    if len(cmd_parts) > 1:
        # (Removed a redundant assignment of the raw arguments that was
        # immediately overwritten by the substituted value below.)
        arguments = cmd_parts[1]
        arguments = _replace_cmd_vars(arguments, generic_workflow_job)
        arguments = _replace_file_vars(arguments, generic_workflow, generic_workflow_job)
        arguments = _fix_env_var_syntax(arguments)
        jobcmds["arguments"] = arguments

    # Add extra "pass-thru" job commands
    if generic_workflow_job.profile:
        for key, val in generic_workflow_job.profile.items():
            jobcmds[key] = htc_escape(val)

    return jobcmds
448def _fix_env_var_syntax(oldstr):
449 """Change ENV place holders to HTCondor Env var syntax.
451 Parameters
452 ----------
453 oldstr : `str`
454 String in which environment variable syntax is to be fixed.
456 Returns
457 -------
458 newstr : `str`
459 Given string with environment variable syntax fixed.
460 """
461 newstr = oldstr
462 for key in re.findall(r"<ENV:([^>]+)>", oldstr):
463 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
464 return newstr
467def _replace_file_vars(arguments, workflow, gwjob):
468 """Replace file placeholders in command line arguments with correct
469 physical file names.
471 Parameters
472 ----------
473 arguments : `str`
474 Arguments string in which to replace file placeholders.
475 workflow : `lsst.ctrl.bps.GenericWorkflow`
476 Generic workflow that contains file information.
477 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
478 The job corresponding to the arguments.
480 Returns
481 -------
482 arguments : `str`
483 Given arguments string with file placeholders replaced.
484 """
485 # Replace input file placeholders with paths.
486 for gwfile in workflow.get_job_inputs(gwjob.name):
487 if gwfile.wms_transfer:
488 uri = os.path.basename(gwfile.src_uri)
489 else:
490 uri = gwfile.src_uri
491 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
493 # Replace input file placeholders with paths.
494 for gwfile in workflow.get_job_outputs(gwjob.name):
495 if gwfile.wms_transfer:
496 uri = os.path.basename(gwfile.src_uri)
497 else:
498 uri = gwfile.src_uri
499 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
500 return arguments
503def _replace_cmd_vars(arguments, gwjob):
504 """Replace format-style placeholders in arguments.
506 Parameters
507 ----------
508 arguments : `str`
509 Arguments string in which to replace placeholders.
510 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
511 Job containing values to be used to replace placeholders
512 (in particular gwjob.cmdvals).
514 Returns
515 -------
516 arguments : `str`
517 Given arguments string with placeholders replaced.
518 """
519 arguments = arguments.format(**gwjob.cmdvals)
520 return arguments
def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, out_prefix):
    """Add job input files from generic workflow to job.

    Parameters
    ----------
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        The generic workflow (e.g., has executable name and arguments).
    job_name : `str`
        Unique name for the job.
    out_prefix : `str`
        The root directory into which all WMS-specific files are written.

    Returns
    -------
    htc_commands : `dict` [`str`, `str`]
        HTCondor commands for the job submission.
    """
    htc_commands = {}
    # Only files flagged for WMS transfer are listed; paths are made
    # relative to the submit directory for portability.
    inputs = [os.path.relpath(gwf_file.src_uri, out_prefix)
              for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True)]
    if inputs:
        htc_commands["transfer_input_files"] = ",".join(inputs)
        _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
    return htc_commands
def _report_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        The directory containing the submit side files (e.g., HTCondor files).

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
    # MISSING_ID means no usable HTCondor files were found in the directory.
    if wms_workflow_id == MISSING_ID:
        return {}, message
    return _create_detailed_report_from_jobs(wms_workflow_id, jobs), message
def _report_from_id(wms_workflow_id, hist):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_workflow_id : `int` or `str`
        Limit to specific run based on id.
    hist : `float`
        Limit history search to this many days.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    # Match both the DAGMan job itself and its child jobs.
    cluster = int(float(wms_workflow_id))
    constraint = f"(DAGManJobId == {cluster} || ClusterId == {cluster})"
    jobs = condor_q(constraint)
    if hist:
        # Extend the search into schedd history limited to the given window.
        epoch = (datetime.now() - timedelta(days=hist)).timestamp()
        constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
        _update_jobs(jobs, condor_history(constraint))

    # keys in dictionary will be strings of format "ClusterId.ProcId"
    wms_workflow_id = str(wms_workflow_id)
    if not wms_workflow_id.endswith(".0"):
        wms_workflow_id += ".0"

    if wms_workflow_id not in jobs:
        return {}, f"Found 0 records for run id {wms_workflow_id}"

    # Merge in whatever extra detail the submit directory files provide.
    _, path_jobs, message = _get_info_from_path(jobs[wms_workflow_id]["Iwd"])
    _update_jobs(jobs, path_jobs)
    return _create_detailed_report_from_jobs(wms_workflow_id, jobs), message
def _get_info_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        Directory containing HTCondor files.

    Returns
    -------
    wms_workflow_id : `str`
        The run id which is a DAGman job id.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Information about jobs read from files in the given directory.
        The key is the HTCondor id and the value is a dictionary of HTCondor
        keys and values.
    message : `str`
        Message to be printed with the summary report.
    """
    try:
        run_id, jobs = read_dag_log(wms_path)
        _LOG.debug("_get_info_from_path: from dag log %s = %s", run_id, jobs)
        _update_jobs(jobs, read_node_status(wms_path))
        _LOG.debug("_get_info_from_path: after node status %s = %s", run_id, jobs)

        # Attach DAG-level info (status, totals, bps attributes) to the
        # DAGMan job entry itself.
        dag_job = jobs[run_id]
        dag_job.update(read_dag_status(wms_path))
        dag_job["total_jobs"], dag_job["state_counts"] = _get_state_counts_from_jobs(run_id, jobs)
        if "bps_run" not in dag_job:
            _add_run_info(wms_path, dag_job)

        message = htc_check_dagman_output(wms_path)
        _LOG.debug("_get_info: id = %s, total_jobs = %s", run_id, jobs[run_id]["total_jobs"])
    except StopIteration:
        # The readers signal missing files via StopIteration.
        message = f"Could not find HTCondor files in {wms_path}"
        _LOG.warning(message)
        return MISSING_ID, {}, message

    return run_id, jobs, message
def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
    """Gather run information to be used in generating detailed reports.

    Parameters
    ----------
    wms_workflow_id : `str`
        HTCondor id of the DAGMan job for the run.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Mapping of HTCondor id to job information for the run; must contain
        an entry for ``wms_workflow_id``.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the given HTCondor
        id and the value is a collection of report information for that run.
    """
    _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
    dag_job = jobs[wms_workflow_id]
    # A DAG job carries aggregate counts and has no DAGNodeName of its own.
    if "total_jobs" not in dag_job or "DAGNodeName" in dag_job:
        _LOG.error("Job ID %s is not a DAG job.", wms_workflow_id)
        return {}
    report = WmsRunReport(wms_id=wms_workflow_id,
                          path=dag_job["Iwd"],
                          label=dag_job.get("bps_job_label", "MISS"),
                          run=dag_job.get("bps_run", "MISS"),
                          project=dag_job.get("bps_project", "MISS"),
                          campaign=dag_job.get("bps_campaign", "MISS"),
                          payload=dag_job.get("bps_payload", "MISS"),
                          operator=_get_owner(dag_job),
                          run_summary=_get_run_summary(dag_job),
                          state=_htc_status_to_wms_state(dag_job),
                          jobs=[],
                          total_number_jobs=dag_job["total_jobs"],
                          job_state_counts=dag_job["state_counts"])

    try:
        # Report every payload job, skipping the DAGMan job itself.
        for job in jobs.values():
            if job["ClusterId"] != int(float(wms_workflow_id)):
                job_report = WmsJobReport(wms_id=job["ClusterId"],
                                          name=job.get("DAGNodeName", str(job["ClusterId"])),
                                          label=job.get("bps_job_label",
                                                        pegasus_name_to_label(job["DAGNodeName"])),
                                          state=_htc_status_to_wms_state(job))
                # Workaround for runs that used "init" instead of
                # "pipetaskInit".
                if job_report.label == "init":
                    job_report.label = "pipetaskInit"
                report.jobs.append(job_report)
    except KeyError as ex:
        _LOG.error("Job missing key '%s': %s", str(ex), job)
        raise

    run_reports = {report.wms_id: report}
    _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
    return run_reports
def _summary_report(user, hist, pass_thru):
    """Gather run information to be used in generating summary reports.

    Parameters
    ----------
    user : `str`
        Run lookup restricted to given user.
    hist : `float`
        How many previous days to search for run information.
    pass_thru : `str`
        Advanced users can define the HTCondor constraint to be used
        when searching queue and history.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the summary report. The keys are HTCondor ids and
        the values are collections of report information for each run.
    message : `str`
        Message to be printed with the summary report.
    """
    # only doing summary report so only look for dagman jobs
    if pass_thru:
        constraint = pass_thru
    else:
        # Notes:
        # * bps_isjob == 'True' isn't getting set for DAG jobs that are
        #   manually restarted.
        # * Any job with DAGManJobID isn't a DAG job
        constraint = 'bps_isjob == "True" && JobUniverse == 7'
        if user:
            constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'

    # Check runs in queue.
    jobs = condor_q(constraint)

    if hist:
        # Extend the search into schedd history, limited to the given window.
        epoch = (datetime.now() - timedelta(days=hist)).timestamp()
        constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
        hist_jobs = condor_history(constraint)
        _update_jobs(jobs, hist_jobs)

    _LOG.debug("Job ids from queue and history %s", jobs.keys())

    # Have list of DAGMan jobs, need to get run_report info.
    run_reports = {}
    for job in jobs.values():
        total_jobs, state_counts = _get_state_counts_from_dag_job(job)
        # If didn't get from queue information (e.g., Kerberos bug),
        # try reading from file.
        if total_jobs == 0:
            try:
                job.update(read_dag_status(job["Iwd"]))
                total_jobs, state_counts = _get_state_counts_from_dag_job(job)
            except StopIteration:
                pass  # don't kill report can't find htcondor files

        if "bps_run" not in job:
            _add_run_info(job["Iwd"], job)
        report = WmsRunReport(wms_id=job.get("ClusterId", MISSING_ID),
                              path=job["Iwd"],
                              label=job.get("bps_job_label", "MISS"),
                              run=job.get("bps_run", "MISS"),
                              project=job.get("bps_project", "MISS"),
                              campaign=job.get("bps_campaign", "MISS"),
                              payload=job.get("bps_payload", "MISS"),
                              operator=_get_owner(job),
                              run_summary=_get_run_summary(job),
                              state=_htc_status_to_wms_state(job),
                              jobs=[],
                              total_number_jobs=total_jobs,
                              job_state_counts=state_counts)

        run_reports[report.wms_id] = report

    return run_reports, ""
def _add_run_info(wms_path, job):
    """Find BPS run information elsewhere for runs without bps attributes.

    Looks inside the run's "jobs" subdirectory for a submit file and copies
    any "+bps_*" attributes found there into the given job dictionary.

    Parameters
    ----------
    wms_path : `str`
        Path to submit files for the run.
    job : `dict` [`str`, `Any`]
        HTCondor dag job information; updated in place.

    Notes
    -----
    Missing submit files are handled internally by setting job["bps_run"]
    to "Missing"; permission problems set it to "PermissionError".
    """
    path = Path(wms_path) / "jobs"
    try:
        # Any single job directory will do; fall back to the run directory.
        jobdir = next(path.glob("*"), Path(wms_path))
        try:
            subfile = next(jobdir.glob("*.sub"))
            _LOG.debug("_add_run_info: subfile = %s", subfile)
            with open(subfile, "r") as fh:
                for line in fh:
                    if line.startswith("+bps_"):
                        # Lines look like: +bps_<name> = "<value>"
                        m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
                        if m:
                            _LOG.debug("Matching line: %s", line)
                            job[m.group(1)] = m.group(2).replace('"', "")
                        else:
                            _LOG.debug("Could not parse attribute: %s", line)
        except StopIteration:
            job["bps_run"] = "Missing"

    except PermissionError:
        job["bps_run"] = "PermissionError"
    _LOG.debug("After adding job = %s", job)
833def _get_owner(job):
834 """Get the owner of a dag job.
836 Parameters
837 ----------
838 job : `dict` [`str`, `Any`]
839 HTCondor dag job information.
841 Returns
842 -------
843 owner : `str`
844 Owner of the dag job.
845 """
846 owner = job.get("bps_operator", None)
847 if not owner:
848 owner = job.get("Owner", None)
849 if not owner:
850 _LOG.warning("Could not get Owner from htcondor job: %s", job)
851 owner = "MISS"
852 return owner
def _get_run_summary(job):
    """Get the run summary for a job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    summary : `str`
        Number of jobs per PipelineTask label in approximate pipeline order.
        Format: <label>:<count>[;<label>:<count>]+
        Empty string if no summary information could be found.
    """
    summary = job.get("bps_run_summary", None)
    if not summary:
        # Fall back to parsing the DAG file in the submit directory.
        summary, _ = summary_from_dag(job["Iwd"])
        if not summary:
            _LOG.warning("Could not get run summary for htcondor job: %s", job)
            # Guard so the string operations below cannot fail with
            # AttributeError if summary_from_dag yielded None.
            summary = ""
    _LOG.debug("_get_run_summary: summary=%s", summary)

    # Workaround sometimes using init vs pipetaskInit
    summary = summary.replace("init:", "pipetaskInit:")

    if "pegasus_version" in job and "pegasus" not in summary:
        summary += ";pegasus:0"

    return summary
def _get_state_counts_from_jobs(wms_workflow_id, jobs):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    wms_workflow_id : `str`
        HTCondor job id.
    jobs : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    state_counts = dict.fromkeys(WmsStates, 0)
    # Tally every payload job, skipping the DAGMan job itself.
    for jid, jinfo in jobs.items():
        if jid != wms_workflow_id:
            state_counts[_htc_status_to_wms_state(jinfo)] += 1
    total_counted = sum(state_counts.values())

    # Prefer the DAG's own node total when available.
    total_count = jobs[wms_workflow_id].get("NodesTotal", total_counted)

    # Nodes not seen in any file yet haven't started, i.e., are unready.
    state_counts[WmsStates.UNREADY] += total_count - total_counted

    return total_count, state_counts
def _get_state_counts_from_dag_job(job):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes (0 if unknown).
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
    state_counts = dict.fromkeys(WmsStates, 0)
    if "DAG_NodesReady" in job:
        # Counts published by DAGMan directly in the job ad.
        state_counts = {
            WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
            WmsStates.READY: job.get("DAG_NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
            WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
            WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)}
        total_jobs = job.get("DAG_NodesTotal")
        _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
    elif "NodesFailed" in job:
        # Counts read from the DAG status file.
        state_counts = {
            WmsStates.UNREADY: job.get("NodesUnready", 0),
            WmsStates.READY: job.get("NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("NodesDone", 0),
            WmsStates.FAILED: job.get("NodesFailed", 0),
            WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)}
        # dict.get never raises KeyError, so the old try/except around this
        # lookup was dead code and could leave total_jobs as None; default
        # to 0 to match the "unknown" branch below.
        total_jobs = job.get("NodesTotal", 0)
        _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
    else:
        # With Kerberos job auth and Kerberos bug, if warning would be printed
        # for every DAG.
        _LOG.debug("Can't get job state counts %s", job["Iwd"])
        total_jobs = 0

    _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
    return total_jobs, state_counts
def _htc_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `WmsStates`
        The equivalent WmsState to given job's status.
    """
    # Queue/history entries carry JobStatus; node-status entries carry
    # NodeStatus.  Anything with neither is a misfit.
    if "JobStatus" in job:
        return _htc_job_status_to_wms_state(job)
    if "NodeStatus" in job:
        return _htc_node_status_to_wms_state(job)
    return WmsStates.MISFIT
def _htc_job_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given job's status.
    """
    _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"],
               type(job["JobStatus"]))
    job_status = int(job["JobStatus"])

    _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
    if job_status == JobStatus.IDLE:
        return WmsStates.PENDING
    if job_status == JobStatus.RUNNING:
        return WmsStates.RUNNING
    if job_status == JobStatus.REMOVED:
        return WmsStates.DELETED
    if job_status == JobStatus.HELD:
        return WmsStates.HELD
    if job_status == JobStatus.COMPLETED:
        # Any nonzero exit/signal/DAG status indicator means failure.
        failed = (job.get("ExitBySignal", False) or job.get("ExitCode", 0)
                  or job.get("ExitSignal", 0) or job.get("DAG_Status", 0)
                  or job.get("ReturnValue", 0))
        return WmsStates.FAILED if failed else WmsStates.SUCCEEDED
    return WmsStates.MISFIT
def _htc_node_status_to_wms_state(job):
    """Convert HTCondor status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given node's status.
    """
    status = job["NodeStatus"]
    if status == NodeStatus.NOT_READY:
        return WmsStates.UNREADY
    if status == NodeStatus.READY:
        return WmsStates.READY
    if status == NodeStatus.SUBMITTED:
        # Distinguish held/running/queued for a submitted node.
        if job["JobProcsHeld"]:
            return WmsStates.HELD
        if job["StatusDetails"] == "not_idle":
            return WmsStates.RUNNING
        if job["JobProcsQueued"]:
            return WmsStates.PENDING
        return WmsStates.MISFIT
    if status == NodeStatus.DONE:
        return WmsStates.SUCCEEDED
    if status == NodeStatus.ERROR:
        return WmsStates.FAILED
    # PRERUN, POSTRUN, and anything unrecognized are misfits.
    return WmsStates.MISFIT
1070def _update_jobs(jobs1, jobs2):
1071 """Update jobs1 with info in jobs2.
1073 (Basically an update for nested dictionaries.)
1075 Parameters
1076 ----------
1077 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
1078 HTCondor job information to be updated.
1079 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
1080 Additional HTCondor job information.
1081 """
1082 for jid, jinfo in jobs2.items():
1083 if jid in jobs1:
1084 jobs1[jid].update(jinfo)
1085 else:
1086 jobs1[jid] = jinfo
1089def _wms_id_to_cluster(wms_id):
1090 """Convert WMS ID to cluster ID.
1092 Parameters
1093 ----------
1094 wms_id : `int` or `float` or `str`
1095 HTCondor job id or path.
1097 Returns
1098 -------
1099 cluster_id : `int`
1100 HTCondor cluster id.
1101 """
1102 # If wms_id represents path, get numeric id.
1103 try:
1104 cluster_id = int(float(wms_id))
1105 except ValueError:
1106 wms_path = Path(wms_id)
1107 if wms_path.exists():
1108 try:
1109 cluster_id, _ = read_dag_log(wms_id)
1110 cluster_id = int(float(cluster_id))
1111 except StopIteration:
1112 cluster_id = 0
1113 else:
1114 cluster_id = 0
1115 return cluster_id