# This file is part of ctrl_bps.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
22"""Interface between generic workflow to HTCondor workflow system.
23"""
25__all__ = ["HTCondorService", "HTCondorWorkflow"]

import dataclasses
import os
import re
import logging
from datetime import datetime, timedelta
from pathlib import Path

import htcondor

from ... import (
    BaseWmsWorkflow,
    BaseWmsService,
    GenericWorkflow,
    GenericWorkflowJob,
    WmsRunReport,
    WmsJobReport,
    WmsStates
)
from ...bps_utils import chdir
from .lssthtc import (
    HTCDag,
    HTCJob,
    MISSING_ID,
    JobStatus,
    NodeStatus,
    htc_check_dagman_output,
    htc_escape,
    htc_submit_dag,
    read_dag_log,
    read_dag_status,
    read_node_status,
    condor_history,
    condor_q,
    condor_status,
    pegasus_name_to_label,
    summary_from_dag,
)


DEFAULT_HTC_EXEC_PATT = ".*worker.*"
"""Default pattern for searching execute machines in an HTCondor pool.
"""

_LOG = logging.getLogger(__name__)


class HTCondorService(BaseWmsService):
    """HTCondor version of WMS service.
    """
    def prepare(self, config, generic_workflow, out_prefix=None):
        """Convert generic workflow to an HTCondor DAG ready for submission.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The generic workflow (e.g., has executable name and arguments).
        out_prefix : `str`
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow`
            HTCondor workflow ready to be run.
        """
        _LOG.debug("out_prefix = '%s'", out_prefix)
        workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix,
                                                          f"{self.__class__.__module__}."
                                                          f"{self.__class__.__name__}")
        workflow.write(out_prefix)
        return workflow

    def submit(self, workflow):
        """Submit a single HTCondor workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWorkflow`
            A single HTCondor workflow to submit. run_id is updated after
            successful submission to WMS.
        """
        # For workflow portability, internal paths are all relative. Hence
        # the DAG needs to be submitted to HTCondor from inside the submit
        # directory.
        with chdir(workflow.submit_path):
            _LOG.info("Submitting from directory: %s", os.getcwd())
            htc_submit_dag(workflow.dag, dict())
            workflow.run_id = workflow.dag.run_id

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create a list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions. Typically
            this means top-level jobs (i.e., not child jobs).
        """
        _LOG.debug("list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s",
                   wms_id, user, require_bps, pass_thru)
        constraint = ""

        if wms_id is None:
            if user is not None:
                constraint = f'(Owner == "{user}")'
        else:
            cluster_id = _wms_id_to_cluster(wms_id)
            if cluster_id != 0:
                constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"

        if require_bps:
            constraint += ' && (bps_isjob == "True")'

        if pass_thru:
            if "-forcex" in pass_thru:
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f" && ({pass_thru_2})"
            else:
                constraint += f" && ({pass_thru})"

        _LOG.debug("constraint = %s", constraint)
        jobs = condor_q(constraint)

        # Prune child jobs whose DAG job is in the queue (i.e., jobs that
        # aren't orphans).
        job_ids = []
        for job_id, job_info in jobs.items():
            _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_info.get("DAGManJobId", "None"))
            if "DAGManJobId" not in job_info:  # orphaned job
                job_ids.append(job_id)
            else:
                _LOG.debug("Looking for %s", f"{job_info['DAGManJobId']}.0")
                _LOG.debug("\tin jobs.keys() = %s", jobs.keys())
                if f"{job_info['DAGManJobId']}.0" not in jobs:
                    job_ids.append(job_id)

        _LOG.debug("job_ids = %s", job_ids)
        return job_ids

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None):
        """Return run information based upon given constraints.

        Parameters
        ----------
        wms_workflow_id : `str`
            Limit to specific run based on id.
        user : `str`
            Limit results to runs for this user.
        hist : `float`
            Limit history search to this many days.
        pass_thru : `str`
            Constraints to pass through to HTCondor.

        Returns
        -------
        runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
            Information about runs from given job information.
        message : `str`
            Extra message for report command to print. This could be pointers
            to documentation or to WMS specific commands.
        """
        message = ""

        if wms_workflow_id:
            # Check explicitly whether wms_workflow_id can be converted to a
            # float, rather than wrapping the _report_from_id call in
            # try/except, to avoid catching an unrelated ValueError raised
            # by _report_from_id.
            try:
                float(wms_workflow_id)
                is_float = True
            except ValueError:  # Don't need TypeError here as None goes to else branch.
                is_float = False

            if is_float:
                run_reports, message = _report_from_id(float(wms_workflow_id), hist)
            else:
                run_reports, message = _report_from_path(wms_workflow_id)
        else:
            run_reports, message = _summary_report(user, hist, pass_thru)
        _LOG.debug("report: %s, %s", run_reports, message)

        return list(run_reports.values()), message

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            ID or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether the deletion was successful or not. Currently, if there is
            any doubt or any individual job was not deleted, `False` is
            returned.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        _LOG.debug("Canceling wms_id = %s", wms_id)

        cluster_id = _wms_id_to_cluster(wms_id)
        if cluster_id == 0:
            deleted = False
            message = "Invalid id"
        else:
            _LOG.debug("Canceling cluster_id = %s", cluster_id)
            schedd = htcondor.Schedd()
            constraint = f"ClusterId == {cluster_id}"
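            # A pass_thru value containing "-forcex" maps to HTCondor's
            # JobAction.RemoveX (as with "condor_rm -forcex"), which forcibly
            # cleans up jobs already stuck in the removed ("X") state;
            # otherwise the regular JobAction.Remove is used.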
            if pass_thru is not None and "-forcex" in pass_thru:
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f" && ({pass_thru_2})"
                _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.RemoveX, constraint)
            else:
                if pass_thru:
                    constraint += f" && ({pass_thru})"
                _LOG.debug("JobAction.Remove constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.Remove, constraint)
            _LOG.debug("Remove results: %s", results)

            if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
                deleted = True
                message = ""
            else:
                deleted = False
                if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
                    message = "no such bps job in batch queue"
                else:
                    message = f"unknown problems deleting: {results}"

        _LOG.debug("deleted: %s; message = %s", deleted, message)
        return deleted, message


class HTCondorWorkflow(BaseWmsWorkflow):
    """Single HTCondor workflow.

    Parameters
    ----------
    name : `str`
        Unique name for Workflow used when naming files.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """
    def __init__(self, name, config=None):
        super().__init__(name, config)
        self.dag = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited
        htc_workflow = cls(generic_workflow.name, config)
        htc_workflow.dag = HTCDag(name=generic_workflow.name)

        _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs({"bps_wms_service": service_class,
                                      "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}"})

        # Determine the hard limit for the memory requirement.
        found, limit = config.search('memoryLimit')
        if not found:
            search_opts = {"default": DEFAULT_HTC_EXEC_PATT}
            _, site = config.search("computeSite")
            if site:
                search_opts["curvals"] = {"curr_site": site}
            _, patt = config.search("executeMachinesPattern", opt=search_opts)

            # To reduce the amount of data, ignore dynamic slots (if any) as,
            # by definition, they cannot have more memory than the
            # partitionable slot they are part of.
            constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)'
            pool_info = condor_status(constraint=constraint)
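            # The largest TotalSlotMemory value among the matching slots is
            # used as the ceiling for automatic memory scaling; if no machine
            # matches the pattern, the limit stays unset and memory
            # autoscaling will fail later with a RuntimeError.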
            try:
                limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values())
            except ValueError:
                _LOG.debug("No execute machine in the pool matches %s", patt)
        if limit:
            config[".bps_defined.memory_limit"] = limit

        # Create all DAG jobs
        for job_name in generic_workflow:
            gwjob = generic_workflow.get_job(job_name)
            htc_job = HTCondorWorkflow._create_job(config, generic_workflow, gwjob, out_prefix)
            htc_workflow.dag.add_job(htc_job)

        # Add job dependencies to the DAG
        for job_name in generic_workflow:
            htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))

        # If final job exists in generic workflow, create DAG final job
        final = generic_workflow.get_final()
        if final and isinstance(final, GenericWorkflowJob):
            final_htjob = HTCondorWorkflow._create_job(config, generic_workflow, final, out_prefix)
            if "post" not in final_htjob.dagcmds:
                final_htjob.dagcmds["post"] = f"{os.path.dirname(__file__)}/final_post.sh" \
                                              f" {final.name} $DAG_STATUS $RETURN"
            htc_workflow.dag.add_final_job(final_htjob)
        elif final and isinstance(final, GenericWorkflow):
            raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
        elif final:
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

        return htc_workflow

    @staticmethod
    def _create_job(config, generic_workflow, gwjob, out_prefix):
        """Convert GenericWorkflow job nodes to DAG jobs.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow that is being converted.
        gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
            The generic job to convert to an HTCondor job.
        out_prefix : `str`
            Directory prefix for HTCondor files.

        Returns
        -------
        htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
            The HTCondor job equivalent to the given generic job.
        """
        htc_job = HTCJob(gwjob.name, label=gwjob.label)

        curvals = dataclasses.asdict(gwjob)
        if gwjob.tags:
            curvals.update(gwjob.tags)
        found, subdir = config.search("subDirTemplate", opt={'curvals': curvals})
        if not found:
            subdir = "jobs"
        htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"
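        # The relative "jobs/" tree keeps the DAG relocatable; subDirTemplate
        # (when set) further groups the per-job submit files, e.g. by label.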

        htc_job_cmds = {
            "universe": "vanilla",
            "should_transfer_files": "YES",
            "when_to_transfer_output": "ON_EXIT_OR_EVICT",
            "transfer_executable": "False",
            "getenv": "True",

            # Exceeding memory sometimes triggers a SIGBUS error. Tell
            # HTCondor to put SIGBUS jobs on hold.
            "on_exit_hold": "(ExitBySignal == true) && (ExitSignal == 7)",
            "on_exit_hold_reason": '"Job raised a signal 7. Usually means job has gone over memory limit."',
            "on_exit_hold_subcode": "34"
        }

        htc_job_cmds.update(_translate_job_cmds(config, generic_workflow, gwjob))

        # job stdout, stderr, htcondor user log.
        for key in ("output", "error", "log"):
            htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
            _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])

        _, use_shared = config.search("bpsUseShared", opt={"default": False})
        htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwjob.name, use_shared, out_prefix))

        # Add the job cmds dict to the job object.
        htc_job.add_job_cmds(htc_job_cmds)

        htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))

        # Add run level attributes to job.
        htc_job.add_job_attrs(generic_workflow.run_attrs)

        # Add job attributes to job.
        _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
        htc_job.add_job_attrs(gwjob.attrs)
        if gwjob.tags:
            htc_job.add_job_attrs({"bps_job_quanta": gwjob.tags.get("quanta_summary", "")})
        htc_job.add_job_attrs({"bps_job_name": gwjob.name,
                               "bps_job_label": gwjob.label})

        return htc_job

    def write(self, out_prefix):
        """Output HTCondor DAGMan files needed for workflow submission.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for HTCondor files.
        """
        self.submit_path = out_prefix
        os.makedirs(out_prefix, exist_ok=True)

        # Write down the workflow in HTCondor format.
        self.dag.write(out_prefix, "jobs/{self.label}")


def _translate_job_cmds(config, generic_workflow, gwjob):
    """Translate the job data that are a one-to-one mapping.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains the job being converted.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Generic workflow job to be converted.

    Returns
    -------
    htc_job_commands : `dict` [`str`, `Any`]
        Contains commands which can appear in the HTCondor submit description
        file.
    """
    # Values in the job script that are just name mappings.
    job_translation = {"mail_to": "notify_user",
                       "when_to_mail": "notification",
                       "request_cpus": "request_cpus",
                       "priority": "priority",
                       "category": "category"}

    jobcmds = {}
    for gwkey, htckey in job_translation.items():
        jobcmds[htckey] = getattr(gwjob, gwkey, None)

    # Job commands that need modification.
    if gwjob.number_of_retries:
        jobcmds["max_retries"] = f"{gwjob.number_of_retries}"

    if gwjob.retry_unless_exit:
        jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"

    if gwjob.request_disk:
        jobcmds["request_disk"] = f"{gwjob.request_disk}MB"

    if gwjob.request_memory:
        jobcmds["request_memory"] = f"{gwjob.request_memory}"

    if gwjob.memory_multiplier:
        # Do not use try-except! At the moment, BpsConfig returns an empty
        # string if it does not contain the key.
        memory_limit = config[".bps_defined.memory_limit"]
        if not memory_limit:
            raise RuntimeError("Memory autoscaling enabled, but automatic detection of the memory limit "
                               "failed; setting it explicitly with 'memoryLimit' or changing worker node "
                               "search pattern 'executeMachinesPattern' might help.")
        jobcmds["request_memory"] = _create_request_memory_expr(gwjob.request_memory,
                                                                gwjob.memory_multiplier)

        # Periodically release jobs which are being held due to exceeding
        # memory. Stop doing that (by removing the job from the HTCondor
        # queue) after the maximal number of retries has been reached or the
        # memory requirements cannot be satisfied.
        jobcmds["periodic_release"] = \
            "NumJobStarts <= JobMaxRetries && (HoldReasonCode == 34 || HoldReasonSubCode == 34)"
        jobcmds["periodic_remove"] = \
            f"JobStatus == 1 && RequestMemory > {memory_limit} || " \
            f"JobStatus == 5 && NumJobStarts > JobMaxRetries"

    # Assume concurrency_limit implemented using HTCondor concurrency limits.
    # May need to move to special site-specific implementation if sites use
    # other mechanisms.
    if gwjob.concurrency_limit:
        jobcmds["concurrency_limit"] = ",".join(gwjob.concurrency_limit)

    # Handle command line
    if gwjob.executable.transfer_executable:
        jobcmds["transfer_executable"] = "True"
        jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
    else:
        jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)

    if gwjob.arguments:
        arguments = gwjob.arguments
        arguments = _replace_cmd_vars(arguments, gwjob)
        arguments = _replace_file_vars(config, arguments, generic_workflow, gwjob)
        arguments = _fix_env_var_syntax(arguments)
        jobcmds["arguments"] = arguments

    # Add extra "pass-thru" job commands
    if gwjob.profile:
        for key, val in gwjob.profile.items():
            jobcmds[key] = htc_escape(val)

    return jobcmds


def _translate_dag_cmds(gwjob):
    """Translate job values into DAGMan commands.

    Parameters
    ----------
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job containing values to be translated.

    Returns
    -------
    dagcmds : `dict` [`str`, `Any`]
        DAGMan commands for the job.
    """
    # Values in the dag script that are just name mappings.
    dag_translation = {"abort_on_value": "abort_dag_on",
                       "abort_return_value": "abort_exit"}

    dagcmds = {}
    for gwkey, htckey in dag_translation.items():
        dagcmds[htckey] = getattr(gwjob, gwkey, None)

    # Still to be coded: vars "pre_cmdline", "post_cmdline"
    return dagcmds


def _fix_env_var_syntax(oldstr):
    """Change ENV placeholders to HTCondor environment variable syntax.

    Parameters
    ----------
    oldstr : `str`
        String in which environment variable syntax is to be fixed.

    Returns
    -------
    newstr : `str`
        Given string with environment variable syntax fixed.
    """
    newstr = oldstr
    for key in re.findall(r"<ENV:([^>]+)>", oldstr):
        newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
    return newstr


def _replace_file_vars(config, arguments, workflow, gwjob):
    """Replace file placeholders in command line arguments with correct
    physical file names.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information.
    arguments : `str`
        Arguments string in which to replace file placeholders.
    workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains file information.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        The job corresponding to the arguments.

    Returns
    -------
    arguments : `str`
        Given arguments string with file placeholders replaced.
    """
    _, use_shared = config.search("bpsUseShared", opt={"default": False})

    # Replace input file placeholders with paths.
    for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
        if (gwfile.wms_transfer and not use_shared) or not gwfile.job_shared:
            uri = os.path.basename(gwfile.src_uri)
        else:
            uri = gwfile.src_uri
        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)

    # Replace output file placeholders with paths.
    for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
        if (gwfile.wms_transfer and not use_shared) or not gwfile.job_shared:
            uri = os.path.basename(gwfile.src_uri)
        else:
            uri = gwfile.src_uri
        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
    return arguments


def _replace_cmd_vars(arguments, gwjob):
    """Replace format-style placeholders in arguments.

    Parameters
    ----------
    arguments : `str`
        Arguments string in which to replace placeholders.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job containing values to be used to replace placeholders
        (in particular gwjob.cmdvals).

    Returns
    -------
    arguments : `str`
        Given arguments string with placeholders replaced.
    """
    try:
        arguments = arguments.format(**gwjob.cmdvals)
    except (KeyError, TypeError):  # TypeError in case None instead of {}
        _LOG.error("Could not replace command variables:\n"
                   "arguments: %s\n"
                   "cmdvals: %s", arguments, gwjob.cmdvals)
        raise
    return arguments


def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str):
    """Add job input files from generic workflow to job.

    Parameters
    ----------
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        The generic workflow (e.g., has executable name and arguments).
    job_name : `str`
        Unique name for the job.
    use_shared : `bool`
        Whether job has access to files via shared filesystem.
    out_prefix : `str`
        The root directory into which all WMS-specific files are written.

    Returns
    -------
    htc_commands : `dict` [`str`, `str`]
        HTCondor commands for the job submission script.
    """
    htc_commands = {}
    inputs = []
    for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
        _LOG.debug("src_uri=%s", gwf_file.src_uri)
        if not use_shared or not gwf_file.job_shared:
            inputs.append(os.path.relpath(gwf_file.src_uri, out_prefix))

    if inputs:
        htc_commands["transfer_input_files"] = ",".join(inputs)
        _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
    return htc_commands


def _report_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        The directory containing the submit side files (e.g., HTCondor files).

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
    if wms_workflow_id == MISSING_ID:
        run_reports = {}
    else:
        run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
    return run_reports, message


def _report_from_id(wms_workflow_id, hist):
    """Gather run information for a given HTCondor workflow id.

    Parameters
    ----------
    wms_workflow_id : `int` or `str`
        Limit to specific run based on id.
    hist : `float`
        Limit history search to this many days.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    constraint = f"(DAGManJobId == {int(float(wms_workflow_id))} || ClusterId == " \
                 f"{int(float(wms_workflow_id))})"
    jobs = condor_q(constraint)
    if hist:
        epoch = (datetime.now() - timedelta(days=hist)).timestamp()
        constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
        hist_jobs = condor_history(constraint)
        _update_jobs(jobs, hist_jobs)

    # Keys in the dictionary will be strings of the format "ClusterId.ProcId".
    wms_workflow_id = str(wms_workflow_id)
    if not wms_workflow_id.endswith(".0"):
        wms_workflow_id += ".0"

    if wms_workflow_id in jobs:
        _, path_jobs, message = _get_info_from_path(jobs[wms_workflow_id]["Iwd"])
        _update_jobs(jobs, path_jobs)
        run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
    else:
        run_reports = {}
        message = f"Found 0 records for run id {wms_workflow_id}"
    return run_reports, message


def _get_info_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        Directory containing HTCondor files.

    Returns
    -------
    wms_workflow_id : `str`
        The run id which is a DAGman job id.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Information about jobs read from files in the given directory.
        The key is the HTCondor id and the value is a dictionary of HTCondor
        keys and values.
    message : `str`
        Message to be printed with the summary report.
    """
    try:
        wms_workflow_id, jobs = read_dag_log(wms_path)
        _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
        _update_jobs(jobs, read_node_status(wms_path))
        _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)

        # Add more info for DAGman job
        job = jobs[wms_workflow_id]
        job.update(read_dag_status(wms_path))
        job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
        if "bps_run" not in job:
            _add_run_info(wms_path, job)

        message = htc_check_dagman_output(wms_path)
        _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id,
                   jobs[wms_workflow_id]["total_jobs"])
    except StopIteration:
        message = f"Could not find HTCondor files in {wms_path}"
        _LOG.warning(message)
        wms_workflow_id = MISSING_ID
        jobs = {}

    return wms_workflow_id, jobs, message


def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
    """Gather run information to be used in generating a detailed report.

    Parameters
    ----------
    wms_workflow_id : `str`
        The HTCondor id of the DAGMan job for the run.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Mapping of HTCondor job id to job information for the run.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the given HTCondor
        id and the value is a collection of report information for that run.
    """
    _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
    dag_job = jobs[wms_workflow_id]
    if "total_jobs" not in dag_job or "DAGNodeName" in dag_job:
        _LOG.error("Job ID %s is not a DAG job.", wms_workflow_id)
        return {}
    report = WmsRunReport(wms_id=wms_workflow_id,
                          path=dag_job["Iwd"],
                          label=dag_job.get("bps_job_label", "MISS"),
                          run=dag_job.get("bps_run", "MISS"),
                          project=dag_job.get("bps_project", "MISS"),
                          campaign=dag_job.get("bps_campaign", "MISS"),
                          payload=dag_job.get("bps_payload", "MISS"),
                          operator=_get_owner(dag_job),
                          run_summary=_get_run_summary(dag_job),
                          state=_htc_status_to_wms_state(dag_job),
                          jobs=[],
                          total_number_jobs=dag_job["total_jobs"],
                          job_state_counts=dag_job["state_counts"])

    try:
        for job in jobs.values():
            if job["ClusterId"] != int(float(wms_workflow_id)):
                job_report = WmsJobReport(wms_id=job["ClusterId"],
                                          name=job.get("DAGNodeName", str(job["ClusterId"])),
                                          label=job.get("bps_job_label",
                                                        pegasus_name_to_label(job["DAGNodeName"])),
                                          state=_htc_status_to_wms_state(job))
                if job_report.label == "init":
                    job_report.label = "pipetaskInit"
                report.jobs.append(job_report)
    except KeyError as ex:
        _LOG.error("Job missing key '%s': %s", str(ex), job)
        raise

    run_reports = {report.wms_id: report}
    _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
    return run_reports


def _summary_report(user, hist, pass_thru):
    """Gather run information to be used in generating summary reports.

    Parameters
    ----------
    user : `str`
        Run lookup restricted to given user.
    hist : `float`
        How many previous days to search for run information.
    pass_thru : `str`
        Advanced users can define the HTCondor constraint to be used
        when searching queue and history.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the summary report. The keys are HTCondor ids and
        the values are collections of report information for each run.
    message : `str`
        Message to be printed with the summary report.
    """
    # Only doing a summary report, so only look for DAGMan jobs.
    if pass_thru:
        constraint = pass_thru
    else:
        # Notes:
        # * bps_isjob == 'True' isn't getting set for DAG jobs that are
        #   manually restarted.
        # * Any job with DAGManJobID isn't a DAG job.
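        # JobUniverse == 7 is HTCondor's scheduler universe, where DAGMan
        # itself runs, so this constraint selects only top-level DAGMan jobs.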
        constraint = 'bps_isjob == "True" && JobUniverse == 7'
        if user:
            constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'

    # Check runs in queue.
    jobs = condor_q(constraint)

    if hist:
        epoch = (datetime.now() - timedelta(days=hist)).timestamp()
        constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
        hist_jobs = condor_history(constraint)
        _update_jobs(jobs, hist_jobs)

    _LOG.debug("Job ids from queue and history %s", jobs.keys())

    # Have list of DAGMan jobs, need to get run_report info.
    run_reports = {}
    for job in jobs.values():
        total_jobs, state_counts = _get_state_counts_from_dag_job(job)
        # If didn't get from queue information (e.g., Kerberos bug),
        # try reading from file.
        if total_jobs == 0:
            try:
                job.update(read_dag_status(job["Iwd"]))
                total_jobs, state_counts = _get_state_counts_from_dag_job(job)
            except StopIteration:
                # Don't fail the entire report just because the HTCondor
                # files for this run can't be found.
                pass

        if "bps_run" not in job:
            _add_run_info(job["Iwd"], job)
        report = WmsRunReport(wms_id=str(job.get("ClusterId", MISSING_ID)),
                              path=job["Iwd"],
                              label=job.get("bps_job_label", "MISS"),
                              run=job.get("bps_run", "MISS"),
                              project=job.get("bps_project", "MISS"),
                              campaign=job.get("bps_campaign", "MISS"),
                              payload=job.get("bps_payload", "MISS"),
                              operator=_get_owner(job),
                              run_summary=_get_run_summary(job),
                              state=_htc_status_to_wms_state(job),
                              jobs=[],
                              total_number_jobs=total_jobs,
                              job_state_counts=state_counts)

        run_reports[report.wms_id] = report

    return run_reports, ""


def _add_run_info(wms_path, job):
    """Find BPS run information elsewhere for runs without bps attributes.

    Parameters
    ----------
    wms_path : `str`
        Path to submit files for the run.
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Raises
    ------
    StopIteration
        Raised if the file it is looking for cannot be found. Permission
        errors are caught and the job's run is marked with an error.
    """
    path = Path(wms_path) / "jobs"
    try:
        subfile = next(path.glob("**/*.sub"))
    except (StopIteration, PermissionError):
        job["bps_run"] = "Unavailable"
    else:
        _LOG.debug("_add_run_info: subfile = %s", subfile)
        try:
            with open(subfile, "r") as fh:
                for line in fh:
                    if line.startswith("+bps_"):
                        m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
                        if m:
                            _LOG.debug("Matching line: %s", line)
                            job[m.group(1)] = m.group(2).replace('"', "")
                        else:
                            _LOG.debug("Could not parse attribute: %s", line)
        except PermissionError:
            job["bps_run"] = "PermissionError"
    _LOG.debug("After adding job = %s", job)


def _get_owner(job):
    """Get the owner of a dag job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    owner : `str`
        Owner of the dag job.
    """
    owner = job.get("bps_operator", None)
    if not owner:
        owner = job.get("Owner", None)
        if not owner:
            _LOG.warning("Could not get Owner from htcondor job: %s", job)
            owner = "MISS"
    return owner


def _get_run_summary(job):
    """Get the run summary for a job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    summary : `str`
        Number of jobs per PipelineTask label in approximate pipeline order.
        Format: <label>:<count>[;<label>:<count>]+
    """
    summary = job.get("bps_run_summary", None)
    if not summary:
        summary, _ = summary_from_dag(job["Iwd"])
        if not summary:
            _LOG.warning("Could not get run summary for htcondor job: %s", job)
    _LOG.debug("_get_run_summary: summary=%s", summary)

    # Work around the summary sometimes using "init" instead of
    # "pipetaskInit".
    summary = summary.replace("init:", "pipetaskInit:")

    if "pegasus_version" in job and "pegasus" not in summary:
        summary += ";pegasus:0"

    return summary


def _get_state_counts_from_jobs(wms_workflow_id, jobs):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    wms_workflow_id : `str`
        HTCondor job id.
    jobs : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    state_counts = dict.fromkeys(WmsStates, 0)

    for jid, jinfo in jobs.items():
        if jid != wms_workflow_id:
            state_counts[_htc_status_to_wms_state(jinfo)] += 1

    total_counted = sum(state_counts.values())
    if "NodesTotal" in jobs[wms_workflow_id]:
        total_count = jobs[wms_workflow_id]["NodesTotal"]
    else:
        total_count = total_counted
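
    # Nodes the DAG knows about but which have not yet shown up in the logs
    # are counted as UNREADY.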
    state_counts[WmsStates.UNREADY] += total_count - total_counted

    return total_count, state_counts


def _get_state_counts_from_dag_job(job):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
    state_counts = dict.fromkeys(WmsStates, 0)
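    # Node counts can come from two places: the DAG_* attributes DAGMan
    # publishes in its own job ClassAd, or (as a fallback) the Nodes* values
    # read from the dag status file via read_dag_status().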
1061 if "DAG_NodesReady" in job:
1062 state_counts = {
1063 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
1064 WmsStates.READY: job.get("DAG_NodesReady", 0),
1065 WmsStates.HELD: job.get("JobProcsHeld", 0),
1066 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
1067 WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
1068 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)}
1069 total_jobs = job.get("DAG_NodesTotal")
1070 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
1071 elif "NodesFailed" in job:
1072 state_counts = {
1073 WmsStates.UNREADY: job.get("NodesUnready", 0),
1074 WmsStates.READY: job.get("NodesReady", 0),
1075 WmsStates.HELD: job.get("JobProcsHeld", 0),
1076 WmsStates.SUCCEEDED: job.get("NodesDone", 0),
1077 WmsStates.FAILED: job.get("NodesFailed", 0),
1078 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)}
1079 try:
1080 total_jobs = job.get("NodesTotal")
1081 except KeyError as ex:
1082 _LOG.error("Job missing %s. job = %s", str(ex), job)
1083 raise
1084 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
1085 else:
1086 # With Kerberos job auth and Kerberos bug, if warning would be printed
1087 # for every DAG.
1088 _LOG.debug("Can't get job state counts %s", job["Iwd"])
1089 total_jobs = 0
1091 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
1092 return total_jobs, state_counts


def _htc_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `WmsStates`
        The equivalent WmsState to given job's status.
    """
    wms_state = WmsStates.MISFIT
    if "JobStatus" in job:
        wms_state = _htc_job_status_to_wms_state(job)
    elif "NodeStatus" in job:
        wms_state = _htc_node_status_to_wms_state(job)
    return wms_state


def _htc_job_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given job's status.
    """
    _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"],
               type(job["JobStatus"]))
    job_status = int(job["JobStatus"])
    wms_state = WmsStates.MISFIT

    _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
    if job_status == JobStatus.IDLE:
        wms_state = WmsStates.PENDING
    elif job_status == JobStatus.RUNNING:
        wms_state = WmsStates.RUNNING
    elif job_status == JobStatus.REMOVED:
        wms_state = WmsStates.DELETED
    elif job_status == JobStatus.COMPLETED:
        if job.get("ExitBySignal", False) or job.get("ExitCode", 0) or \
                job.get("ExitSignal", 0) or job.get("DAG_Status", 0) or \
                job.get("ReturnValue", 0):
            wms_state = WmsStates.FAILED
        else:
            wms_state = WmsStates.SUCCEEDED
    elif job_status == JobStatus.HELD:
        wms_state = WmsStates.HELD

    return wms_state


def _htc_node_status_to_wms_state(job):
    """Convert HTCondor node status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given node's status.
    """
    wms_state = WmsStates.MISFIT

    status = job["NodeStatus"]
    if status == NodeStatus.NOT_READY:
        wms_state = WmsStates.UNREADY
    elif status == NodeStatus.READY:
        wms_state = WmsStates.READY
    elif status == NodeStatus.PRERUN:
        wms_state = WmsStates.MISFIT
    elif status == NodeStatus.SUBMITTED:
        if job["JobProcsHeld"]:
            wms_state = WmsStates.HELD
        elif job["StatusDetails"] == "not_idle":
            wms_state = WmsStates.RUNNING
        elif job["JobProcsQueued"]:
            wms_state = WmsStates.PENDING
    elif status == NodeStatus.POSTRUN:
        wms_state = WmsStates.MISFIT
    elif status == NodeStatus.DONE:
        wms_state = WmsStates.SUCCEEDED
    elif status == NodeStatus.ERROR:
        wms_state = WmsStates.FAILED

    return wms_state


def _update_jobs(jobs1, jobs2):
    """Update jobs1 with info in jobs2.

    (Basically an update for nested dictionaries.)

    Parameters
    ----------
    jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
        HTCondor job information to be updated.
    jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
        Additional HTCondor job information.
    """
    for jid, jinfo in jobs2.items():
        if jid in jobs1:
            jobs1[jid].update(jinfo)
        else:
            jobs1[jid] = jinfo


def _wms_id_to_cluster(wms_id):
    """Convert WMS ID to cluster ID.

    Parameters
    ----------
    wms_id : `int` or `float` or `str`
        HTCondor job id or path.

    Returns
    -------
    cluster_id : `int`
        HTCondor cluster id.
    """
    # If wms_id represents path, get numeric id.
    try:
        cluster_id = int(float(wms_id))
    except ValueError:
        wms_path = Path(wms_id)
        if wms_path.exists():
            try:
                cluster_id, _ = read_dag_log(wms_id)
                cluster_id = int(float(cluster_id))
            except StopIteration:
                cluster_id = 0
        else:
            cluster_id = 0
    return cluster_id


def _create_request_memory_expr(memory, multiplier):
    """Construct an HTCondor ClassAd expression for safe memory scaling.

    Parameters
    ----------
    memory : `int`
        Requested memory in MB.
    multiplier : `float`
        Memory growth rate between retries.

    Returns
    -------
    ad : `str`
        A string representing an HTCondor ClassAd expression enabling safe
        memory scaling between job retries.
    """
    was_mem_exceeded = "LastJobStatus =?= 5 " \
                       "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " \
                       "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"

    # If job runs the first time ('MemoryUsage' is not defined), set the
    # required memory to a given value.
    ad = f"ifThenElse({was_mem_exceeded}, " \
         f"ifThenElse(isUndefined(MemoryUsage), {memory}, int({multiplier} * MemoryUsage)), " \
         f"ifThenElse(isUndefined(MemoryUsage), {memory}, max({memory}, MemoryUsage)))"
    return ad