# This file is part of ctrl_bps.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

"""Interface between the generic workflow and the HTCondor workflow system.
"""

__all__ = ["HTCondorService", "HTCondorWorkflow"]


import dataclasses
import os
import re
import logging
from datetime import datetime, timedelta
from pathlib import Path

import htcondor

from ... import (
    BaseWmsWorkflow,
    BaseWmsService,
    GenericWorkflow,
    GenericWorkflowJob,
    WmsRunReport,
    WmsJobReport,
    WmsStates
)
from ...bps_utils import (
    chdir,
    create_count_summary
)
from .lssthtc import (
    HTCDag,
    HTCJob,
    MISSING_ID,
    JobStatus,
    NodeStatus,
    htc_check_dagman_output,
    htc_escape,
    htc_submit_dag,
    read_dag_log,
    read_dag_status,
    read_node_status,
    condor_history,
    condor_q,
    condor_status,
    pegasus_name_to_label,
    summary_from_dag,
)


DEFAULT_HTC_EXEC_PATT = ".*worker.*"
"""Default pattern for searching execute machines in an HTCondor pool.
"""

_LOG = logging.getLogger(__name__)


class HTCondorService(BaseWmsService):
    """HTCondor version of WMS service.
    """
    def prepare(self, config, generic_workflow, out_prefix=None):
        """Convert generic workflow to an HTCondor DAG ready for submission.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The generic workflow (e.g., has executable name and arguments).
        out_prefix : `str`
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow`
            HTCondor workflow ready to be run.
        """
        _LOG.debug("out_prefix = '%s'", out_prefix)
        workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix,
                                                          f"{self.__class__.__module__}."
                                                          f"{self.__class__.__name__}")
        workflow.write(out_prefix)
        return workflow

    def submit(self, workflow):
        """Submit a single HTCondor workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWorkflow`
            A single HTCondor workflow to submit. run_id is updated after
            successful submission to WMS.
        """
        # For workflow portability, internal paths are all relative. Hence
        # the DAG needs to be submitted to HTCondor from inside the submit
        # directory.
        with chdir(workflow.submit_path):
            _LOG.info("Submitting from directory: %s", os.getcwd())
            htc_submit_dag(workflow.dag, {})
            workflow.run_id = workflow.dag.run_id

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions. Typically
            this means top-level jobs (i.e., not child jobs).
        """
        _LOG.debug("list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s",
                   wms_id, user, require_bps, pass_thru)
        constraint = ""

        if wms_id is None:
            if user is not None:
                constraint = f'(Owner == "{user}")'
        else:
            cluster_id = _wms_id_to_cluster(wms_id)
            if cluster_id != 0:
                constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"

        if require_bps:
            constraint += ' && (bps_isjob == "True")'

        if pass_thru:
            if "-forcex" in pass_thru:
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f" && ({pass_thru_2})"
            else:
                constraint += f" && ({pass_thru})"

        _LOG.debug("constraint = %s", constraint)
        jobs = condor_q(constraint)

        # Prune child jobs whose DAG job is in the queue (i.e., they are not orphans).
        job_ids = []
        for job_id, job_info in jobs.items():
            _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_info.get("DAGManJobId", "None"))
            if "DAGManJobId" not in job_info:  # orphaned job
                job_ids.append(job_id)
            else:
                _LOG.debug("Looking for %s", f"{job_info['DAGManJobId']}.0")
                _LOG.debug("\tin jobs.keys() = %s", jobs.keys())
                if f"{job_info['DAGManJobId']}.0" not in jobs:
                    job_ids.append(job_id)

        _LOG.debug("job_ids = %s", job_ids)
        return job_ids

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None):
        """Return run information based upon given constraints.

        Parameters
        ----------
        wms_workflow_id : `str`
            Limit to specific run based on id.
        user : `str`
            Limit results to runs for this user.
        hist : `float`
            Limit history search to this many days.
        pass_thru : `str`
            Constraints to pass through to HTCondor.

        Returns
        -------
        runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
            Information about runs from given job information.
        message : `str`
            Extra message for report command to print. This could be pointers
            to documentation or to WMS specific commands.
        """
        message = ""

        if wms_workflow_id:
            # Explicitly checking if wms_workflow_id can be converted to a
            # float instead of using try/except to avoid catching a different
            # ValueError from _report_from_id
            try:
                float(wms_workflow_id)
                is_float = True
            except ValueError:  # Don't need TypeError here as None goes to else branch.
                is_float = False

            if is_float:
                run_reports, message = _report_from_id(float(wms_workflow_id), hist)
            else:
                run_reports, message = _report_from_path(wms_workflow_id)
        else:
            run_reports, message = _summary_report(user, hist, pass_thru)
        _LOG.debug("report: %s, %s", run_reports, message)

        return list(run_reports.values()), message

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            ID or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether the deletion was successful or not. Currently, if there is
            any doubt or if any individual job was not deleted, return False.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        _LOG.debug("Canceling wms_id = %s", wms_id)

        cluster_id = _wms_id_to_cluster(wms_id)
        if cluster_id == 0:
            deleted = False
            message = "Invalid id"
        else:
            _LOG.debug("Canceling cluster_id = %s", cluster_id)
            schedd = htcondor.Schedd()
            constraint = f"ClusterId == {cluster_id}"
            if pass_thru is not None and "-forcex" in pass_thru:
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f" && ({pass_thru_2})"
                _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.RemoveX, constraint)
            else:
                if pass_thru:
                    constraint += f" && ({pass_thru})"
                _LOG.debug("JobAction.Remove constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.Remove, constraint)
            _LOG.debug("Remove results: %s", results)

            if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
                deleted = True
                message = ""
            else:
                deleted = False
                if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
                    message = "no such bps job in batch queue"
                else:
                    message = f"unknown problems deleting: {results}"

        _LOG.debug("deleted: %s; message = %s", deleted, message)
        return deleted, message


class HTCondorWorkflow(BaseWmsWorkflow):
    """Single HTCondor workflow.

    Parameters
    ----------
    name : `str`
        Unique name for Workflow used when naming files.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """
    def __init__(self, name, config=None):
        super().__init__(name, config)
        self.dag = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited
        htc_workflow = cls(generic_workflow.name, config)
        htc_workflow.dag = HTCDag(name=generic_workflow.name)

        _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs({"bps_wms_service": service_class,
                                      "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
                                      "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
                                      "bps_job_summary": create_count_summary(generic_workflow.job_counts)})

        # Determine the hard limit for the memory requirement.
        found, limit = config.search('memoryLimit')
        if not found:
            search_opts = {"default": DEFAULT_HTC_EXEC_PATT}
            _, site = config.search("computeSite")
            if site:
                search_opts["curvals"] = {"curr_site": site}
            _, patt = config.search("executeMachinesPattern", opt=search_opts)

            # To reduce the amount of data, ignore dynamic slots (if any) as,
            # by definition, they cannot have more memory than
            # the partitionable slot they are part of.
            constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)'
            pool_info = condor_status(constraint=constraint)
            try:
                limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values())
            except ValueError:
                _LOG.debug("No execute machine in the pool matches %s", patt)
        if limit:
            config[".bps_defined.memory_limit"] = limit

        # Create all DAG jobs
        for job_name in generic_workflow:
            gwjob = generic_workflow.get_job(job_name)
            htc_job = HTCondorWorkflow._create_job(config, generic_workflow, gwjob, out_prefix)
            htc_workflow.dag.add_job(htc_job)

        # Add job dependencies to the DAG
        for job_name in generic_workflow:
            htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))

        # If final job exists in generic workflow, create DAG final job
        final = generic_workflow.get_final()
        if final and isinstance(final, GenericWorkflowJob):
            final_htjob = HTCondorWorkflow._create_job(config, generic_workflow, final, out_prefix)
            if "post" not in final_htjob.dagcmds:
                final_htjob.dagcmds["post"] = f"{os.path.dirname(__file__)}/final_post.sh" \
                                              f" {final.name} $DAG_STATUS $RETURN"
            htc_workflow.dag.add_final_job(final_htjob)
        elif final and isinstance(final, GenericWorkflow):
            raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
        elif final:
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

        return htc_workflow

    @staticmethod
    def _create_job(config, generic_workflow, gwjob, out_prefix):
        """Convert GenericWorkflow job nodes to DAG jobs.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow that is being converted.
        gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
            The generic job to convert to an HTCondor job.
        out_prefix : `str`
            Directory prefix for HTCondor files.

        Returns
        -------
        htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
            The HTCondor job equivalent to the given generic job.
        """
        htc_job = HTCJob(gwjob.name, label=gwjob.label)

        curvals = dataclasses.asdict(gwjob)
        if gwjob.tags:
            curvals.update(gwjob.tags)
        found, subdir = config.search("subDirTemplate", opt={'curvals': curvals})
        if not found:
            subdir = "jobs"
        htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"

        htc_job_cmds = {
            "universe": "vanilla",
            "should_transfer_files": "YES",
            "when_to_transfer_output": "ON_EXIT_OR_EVICT",
            "transfer_executable": "False",
            "getenv": "True",

            # Exceeding memory sometimes triggers a SIGBUS error. Tell HTCondor
            # to put SIGBUS jobs on hold.
            "on_exit_hold": "(ExitBySignal == true) && (ExitSignal == 7)",
            "on_exit_hold_reason": '"Job raised a signal 7. Usually means job has gone over memory limit."',
            "on_exit_hold_subcode": "34"
        }

        htc_job_cmds.update(_translate_job_cmds(config, generic_workflow, gwjob))

        # job stdout, stderr, htcondor user log.
        for key in ("output", "error", "log"):
            htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
            _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])

        _, use_shared = config.search("bpsUseShared", opt={"default": False})
        htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwjob.name, use_shared, out_prefix))

        # Add the job cmds dict to the job object.
        htc_job.add_job_cmds(htc_job_cmds)

        htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))

        # Add job attributes to job.
        _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
        htc_job.add_job_attrs(gwjob.attrs)
        htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
        htc_job.add_job_attrs({"bps_job_name": gwjob.name,
                               "bps_job_label": gwjob.label})

        return htc_job

    def write(self, out_prefix):
        """Output HTCondor DAGMan files needed for workflow submission.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for HTCondor files.
        """
        self.submit_path = out_prefix
        os.makedirs(out_prefix, exist_ok=True)

        # Write down the workflow in HTCondor format.
        self.dag.write(out_prefix, "jobs/{self.label}")


def _translate_job_cmds(config, generic_workflow, gwjob):
    """Translate the job data that have a one-to-one mapping to HTCondor commands.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains the job being converted.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Generic workflow job to be converted.

    Returns
    -------
    htc_job_commands : `dict` [`str`, `Any`]
        Contains commands which can appear in the HTCondor submit description
        file.
    """
    # Values in the job script that are just name mappings.
    job_translation = {"mail_to": "notify_user",
                       "when_to_mail": "notification",
                       "request_cpus": "request_cpus",
                       "priority": "priority",
                       "category": "category"}

    jobcmds = {}
    for gwkey, htckey in job_translation.items():
        jobcmds[htckey] = getattr(gwjob, gwkey, None)

    # job commands that need modification
    if gwjob.number_of_retries:
        jobcmds["max_retries"] = f"{gwjob.number_of_retries}"

    if gwjob.retry_unless_exit:
        jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"

    if gwjob.request_disk:
        jobcmds["request_disk"] = f"{gwjob.request_disk}MB"

    if gwjob.request_memory:
        jobcmds["request_memory"] = f"{gwjob.request_memory}"

    if gwjob.memory_multiplier:
        # Do not use try-except! At the moment, BpsConfig returns an empty
        # string if it does not contain the key.
        memory_limit = config[".bps_defined.memory_limit"]
        if not memory_limit:
            raise RuntimeError("Memory autoscaling enabled, but automatic detection of the memory limit "
                               "failed; setting it explicitly with 'memoryLimit' or changing worker node "
                               "search pattern 'executeMachinesPattern' might help.")
        jobcmds["request_memory"] = _create_request_memory_expr(gwjob.request_memory,
                                                                gwjob.memory_multiplier)

        # Periodically release jobs which are being held due to exceeding
        # memory. Stop doing that (by removing the job from the HTCondor queue)
        # after the maximal number of retries has been reached or the memory
        # requirements cannot be satisfied.
        jobcmds["periodic_release"] = \
            "NumJobStarts <= JobMaxRetries && (HoldReasonCode == 34 || HoldReasonSubCode == 34)"
        jobcmds["periodic_remove"] = \
            f"JobStatus == 1 && RequestMemory > {memory_limit} || " \
            f"JobStatus == 5 && NumJobStarts > JobMaxRetries"

    # Assume concurrency_limit implemented using HTCondor concurrency limits.
    # May need to move to special site-specific implementation if sites use
    # other mechanisms.
    if gwjob.concurrency_limit:
        jobcmds["concurrency_limit"] = gwjob.concurrency_limit

    # Handle command line
    if gwjob.executable.transfer_executable:
        jobcmds["transfer_executable"] = "True"
        jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
    else:
        jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)

    if gwjob.arguments:
        arguments = gwjob.arguments
        arguments = _replace_cmd_vars(arguments, gwjob)
        arguments = _replace_file_vars(config, arguments, generic_workflow, gwjob)
        arguments = _fix_env_var_syntax(arguments)
        jobcmds["arguments"] = arguments

    # Add extra "pass-thru" job commands
    if gwjob.profile:
        for key, val in gwjob.profile.items():
            jobcmds[key] = htc_escape(val)

    return jobcmds
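
# Illustrative sketch only (the values are hypothetical, not from a real run):
# for a generic job with request_memory=2048, number_of_retries=3, and no
# memory multiplier, _translate_job_cmds() above would return entries such as
#     {"request_memory": "2048", "max_retries": "3", ...}
# in addition to the one-to-one name mappings listed in job_translation.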


def _translate_dag_cmds(gwjob):
    """Translate job values into DAGMan commands.

    Parameters
    ----------
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job containing values to be translated.

    Returns
    -------
    dagcmds : `dict` [`str`, `Any`]
        DAGMan commands for the job.
    """
    # Values in the dag script that are just name mappings.
    dag_translation = {"abort_on_value": "abort_dag_on",
                       "abort_return_value": "abort_exit"}

    dagcmds = {}
    for gwkey, htckey in dag_translation.items():
        dagcmds[htckey] = getattr(gwjob, gwkey, None)

    # Still to be coded: vars "pre_cmdline", "post_cmdline"
    return dagcmds


def _fix_env_var_syntax(oldstr):
    """Change ENV placeholders to HTCondor Env var syntax.

    Parameters
    ----------
    oldstr : `str`
        String in which environment variable syntax is to be fixed.

    Returns
    -------
    newstr : `str`
        Given string with environment variable syntax fixed.
    """
    newstr = oldstr
    for key in re.findall(r"<ENV:([^>]+)>", oldstr):
        newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
    return newstr
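
# Example (illustrative; the environment variable name is hypothetical):
#     _fix_env_var_syntax("<ENV:LSST_RUN_DIR>/run.sh") -> "$ENV(LSST_RUN_DIR)/run.sh"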


def _replace_file_vars(config, arguments, workflow, gwjob):
    """Replace file placeholders in command line arguments with correct
    physical file names.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information.
    arguments : `str`
        Arguments string in which to replace file placeholders.
    workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains file information.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        The job corresponding to the arguments.

    Returns
    -------
    arguments : `str`
        Given arguments string with file placeholders replaced.
    """
    _, use_shared = config.search("bpsUseShared", opt={"default": False})

    # Replace input file placeholders with paths.
    for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
        if gwfile.wms_transfer and not use_shared or not gwfile.job_shared:
            uri = os.path.basename(gwfile.src_uri)
        else:
            uri = gwfile.src_uri
        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)

    # Replace output file placeholders with paths.
    for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
        if gwfile.wms_transfer and not use_shared or not gwfile.job_shared:
            uri = os.path.basename(gwfile.src_uri)
        else:
            uri = gwfile.src_uri
        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
    return arguments


def _replace_cmd_vars(arguments, gwjob):
    """Replace format-style placeholders in arguments.

    Parameters
    ----------
    arguments : `str`
        Arguments string in which to replace placeholders.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job containing values to be used to replace placeholders
        (in particular gwjob.cmdvals).

    Returns
    -------
    arguments : `str`
        Given arguments string with placeholders replaced.
    """
    try:
        arguments = arguments.format(**gwjob.cmdvals)
    except (KeyError, TypeError):  # TypeError in case None instead of {}
        _LOG.error("Could not replace command variables:\n"
                   "arguments: %s\n"
                   "cmdvals: %s", arguments, gwjob.cmdvals)
        raise
    return arguments
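
# Example (illustrative; placeholder name and value are hypothetical): with
# gwjob.cmdvals = {"qgraphFile": "pipeline.qgraph"}, the arguments string
# "run -g {qgraphFile}" becomes "run -g pipeline.qgraph".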


def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str):
    """Add job input files from generic workflow to job.

    Parameters
    ----------
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        The generic workflow (e.g., has executable name and arguments).
    job_name : `str`
        Unique name for the job.
    use_shared : `bool`
        Whether job has access to files via shared filesystem.
    out_prefix : `str`
        The root directory into which all WMS-specific files are written.

    Returns
    -------
    htc_commands : `dict` [`str`, `str`]
        HTCondor commands for the job submission script.
    """
    htc_commands = {}
    inputs = []
    for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
        _LOG.debug("src_uri=%s", gwf_file.src_uri)
        if not use_shared or not gwf_file.job_shared:
            inputs.append(os.path.relpath(gwf_file.src_uri, out_prefix))

    if inputs:
        htc_commands["transfer_input_files"] = ",".join(inputs)
        _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
    return htc_commands
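
# Example (illustrative; file names are hypothetical): if two transfer-only
# inputs resolve to the relative paths "inputs/a.pickle" and "inputs/b.yaml",
# the returned dict is {"transfer_input_files": "inputs/a.pickle,inputs/b.yaml"}.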


def _report_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        The directory containing the submit side files (e.g., HTCondor files).

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
    if wms_workflow_id == MISSING_ID:
        run_reports = {}
    else:
        run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
    return run_reports, message


def _report_from_id(wms_workflow_id, hist):
    """Gather run information for a given run id.

    Parameters
    ----------
    wms_workflow_id : `int` or `str`
        Limit to specific run based on id.
    hist : `float`
        Limit history search to this many days.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    constraint = f"(DAGManJobId == {int(float(wms_workflow_id))} || ClusterId == " \
                 f"{int(float(wms_workflow_id))})"
    jobs = condor_q(constraint)
    if hist:
        epoch = (datetime.now() - timedelta(days=hist)).timestamp()
        constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
        hist_jobs = condor_history(constraint)
        _update_jobs(jobs, hist_jobs)

    # Keys in the dictionary are strings of the form "ClusterId.ProcId".
    wms_workflow_id = str(wms_workflow_id)
    if not wms_workflow_id.endswith(".0"):
        wms_workflow_id += ".0"

    if wms_workflow_id in jobs:
        _, path_jobs, message = _get_info_from_path(jobs[wms_workflow_id]["Iwd"])
        _update_jobs(jobs, path_jobs)
        run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
    else:
        run_reports = {}
        message = f"Found 0 records for run id {wms_workflow_id}"
    return run_reports, message


def _get_info_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        Directory containing HTCondor files.

    Returns
    -------
    wms_workflow_id : `str`
        The run id which is a DAGMan job id.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Information about jobs read from files in the given directory.
        The key is the HTCondor id and the value is a dictionary of HTCondor
        keys and values.
    message : `str`
        Message to be printed with the summary report.
    """
    try:
        wms_workflow_id, jobs = read_dag_log(wms_path)
        _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
        _update_jobs(jobs, read_node_status(wms_path))
        _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)

        # Add more info for the DAGMan job.
        job = jobs[wms_workflow_id]
        job.update(read_dag_status(wms_path))
        job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
        if "bps_run" not in job:
            _add_run_info(wms_path, job)

        message = htc_check_dagman_output(wms_path)
        _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id,
                   jobs[wms_workflow_id]["total_jobs"])
    except StopIteration:
        message = f"Could not find HTCondor files in {wms_path}"
        _LOG.warning(message)
        wms_workflow_id = MISSING_ID
        jobs = {}

    return wms_workflow_id, jobs, message


def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
    """Gather run information to be used in generating a detailed report.

    Parameters
    ----------
    wms_workflow_id : `str`
        The HTCondor id of the DAGMan job for the run.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Mapping of HTCondor id to job information for the run.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the given HTCondor
        id and the value is a collection of report information for that run.
    """
    _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
    dag_job = jobs[wms_workflow_id]
    if "total_jobs" not in dag_job or "DAGNodeName" in dag_job:
        _LOG.error("Job ID %s is not a DAG job.", wms_workflow_id)
        return {}
    report = WmsRunReport(wms_id=wms_workflow_id,
                          path=dag_job["Iwd"],
                          label=dag_job.get("bps_job_label", "MISS"),
                          run=dag_job.get("bps_run", "MISS"),
                          project=dag_job.get("bps_project", "MISS"),
                          campaign=dag_job.get("bps_campaign", "MISS"),
                          payload=dag_job.get("bps_payload", "MISS"),
                          operator=_get_owner(dag_job),
                          run_summary=_get_run_summary(dag_job),
                          state=_htc_status_to_wms_state(dag_job),
                          jobs=[],
                          total_number_jobs=dag_job["total_jobs"],
                          job_state_counts=dag_job["state_counts"])

    try:
        for job in jobs.values():
            if job["ClusterId"] != int(float(wms_workflow_id)):
                job_report = WmsJobReport(wms_id=job["ClusterId"],
                                          name=job.get("DAGNodeName", str(job["ClusterId"])),
                                          label=job.get("bps_job_label",
                                                        pegasus_name_to_label(job["DAGNodeName"])),
                                          state=_htc_status_to_wms_state(job))
                if job_report.label == "init":
                    job_report.label = "pipetaskInit"
                report.jobs.append(job_report)
    except KeyError as ex:
        _LOG.error("Job missing key '%s': %s", str(ex), job)
        raise

    run_reports = {report.wms_id: report}
    _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
    return run_reports


def _summary_report(user, hist, pass_thru):
    """Gather run information to be used in generating summary reports.

    Parameters
    ----------
    user : `str`
        Run lookup restricted to given user.
    hist : `float`
        How many previous days to search for run information.
    pass_thru : `str`
        Advanced users can define the HTCondor constraint to be used
        when searching queue and history.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the summary report. The keys are HTCondor ids and
        the values are collections of report information for each run.
    message : `str`
        Message to be printed with the summary report.
    """
    # Only doing a summary report, so only look for DAGMan jobs.
    if pass_thru:
        constraint = pass_thru
    else:
        # Notes:
        # * bps_isjob == 'True' isn't getting set for DAG jobs that are
        #   manually restarted.
        # * Any job with DAGManJobID isn't a DAG job.
        constraint = 'bps_isjob == "True" && JobUniverse == 7'
        if user:
            constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'

    # Check runs in queue.
    jobs = condor_q(constraint)

    if hist:
        epoch = (datetime.now() - timedelta(days=hist)).timestamp()
        constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
        hist_jobs = condor_history(constraint)
        _update_jobs(jobs, hist_jobs)

    _LOG.debug("Job ids from queue and history %s", jobs.keys())

    # Have list of DAGMan jobs, need to get run_report info.
    run_reports = {}
    for job in jobs.values():
        total_jobs, state_counts = _get_state_counts_from_dag_job(job)
        # If didn't get from queue information (e.g., Kerberos bug),
        # try reading from file.
        if total_jobs == 0:
            try:
                job.update(read_dag_status(job["Iwd"]))
                total_jobs, state_counts = _get_state_counts_from_dag_job(job)
            except StopIteration:
                pass  # Don't kill the report just because HTCondor files cannot be found.

        if "bps_run" not in job:
            _add_run_info(job["Iwd"], job)
        report = WmsRunReport(wms_id=str(job.get("ClusterId", MISSING_ID)),
                              path=job["Iwd"],
                              label=job.get("bps_job_label", "MISS"),
                              run=job.get("bps_run", "MISS"),
                              project=job.get("bps_project", "MISS"),
                              campaign=job.get("bps_campaign", "MISS"),
                              payload=job.get("bps_payload", "MISS"),
                              operator=_get_owner(job),
                              run_summary=_get_run_summary(job),
                              state=_htc_status_to_wms_state(job),
                              jobs=[],
                              total_number_jobs=total_jobs,
                              job_state_counts=state_counts)

        run_reports[report.wms_id] = report

    return run_reports, ""


def _add_run_info(wms_path, job):
    """Find BPS run information elsewhere for runs without bps attributes.

    Parameters
    ----------
    wms_path : `str`
        Path to submit files for the run.
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Raises
    ------
    StopIteration
        If it cannot find the file it is looking for. Permission errors are
        caught and the job's run is marked with an error.
    """
    path = Path(wms_path) / "jobs"
    try:
        subfile = next(path.glob("**/*.sub"))
    except (StopIteration, PermissionError):
        job["bps_run"] = "Unavailable"
    else:
        _LOG.debug("_add_run_info: subfile = %s", subfile)
        try:
            with open(subfile, "r") as fh:
                for line in fh:
                    if line.startswith("+bps_"):
                        m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
                        if m:
                            _LOG.debug("Matching line: %s", line)
                            job[m.group(1)] = m.group(2).replace('"', "")
                        else:
                            _LOG.debug("Could not parse attribute: %s", line)
        except PermissionError:
            job["bps_run"] = "PermissionError"
    _LOG.debug("After adding job = %s", job)


def _get_owner(job):
    """Get the owner of a dag job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    owner : `str`
        Owner of the dag job.
    """
    owner = job.get("bps_operator", None)
    if not owner:
        owner = job.get("Owner", None)
        if not owner:
            _LOG.warning("Could not get Owner from htcondor job: %s", job)
            owner = "MISS"
    return owner


def _get_run_summary(job):
    """Get the run summary for a job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    summary : `str`
        Number of jobs per PipelineTask label in approximate pipeline order.
        Format: <label>:<count>[;<label>:<count>]+
    """
    summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
    if not summary:
        summary, _ = summary_from_dag(job["Iwd"])
        if not summary:
            _LOG.warning("Could not get run summary for htcondor job: %s", job)
    _LOG.debug("_get_run_summary: summary=%s", summary)

    # Work around the summary sometimes using init instead of pipetaskInit.
    summary = summary.replace("init:", "pipetaskInit:")

    if "pegasus_version" in job and "pegasus" not in summary:
        summary += ";pegasus:0"

    return summary
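
# Example of the summary format documented above (labels and counts are
# hypothetical): "pipetaskInit:1;isr:10;characterizeImage:10".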


def _get_state_counts_from_jobs(wms_workflow_id, jobs):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    wms_workflow_id : `str`
        HTCondor job id.
    jobs : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    state_counts = dict.fromkeys(WmsStates, 0)

    for jid, jinfo in jobs.items():
        if jid != wms_workflow_id:
            state_counts[_htc_status_to_wms_state(jinfo)] += 1

    total_counted = sum(state_counts.values())
    if "NodesTotal" in jobs[wms_workflow_id]:
        total_count = jobs[wms_workflow_id]["NodesTotal"]
    else:
        total_count = total_counted

    state_counts[WmsStates.UNREADY] += total_count - total_counted

    return total_count, state_counts


def _get_state_counts_from_dag_job(job):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
    state_counts = dict.fromkeys(WmsStates, 0)
    if "DAG_NodesReady" in job:
        state_counts = {
            WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
            WmsStates.READY: job.get("DAG_NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
            WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
            WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)}
        total_jobs = job.get("DAG_NodesTotal")
        _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
    elif "NodesFailed" in job:
        state_counts = {
            WmsStates.UNREADY: job.get("NodesUnready", 0),
            WmsStates.READY: job.get("NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("NodesDone", 0),
            WmsStates.FAILED: job.get("NodesFailed", 0),
            WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)}
        try:
            total_jobs = job.get("NodesTotal")
        except KeyError as ex:
            _LOG.error("Job missing %s. job = %s", str(ex), job)
            raise
        _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
    else:
        # With Kerberos job auth and the Kerberos bug, a warning here would be
        # printed for every DAG, so log at debug level instead.
        _LOG.debug("Can't get job state counts %s", job["Iwd"])
        total_jobs = 0

    _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
    return total_jobs, state_counts


def _htc_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given job's status.
    """
    wms_state = WmsStates.MISFIT
    if "JobStatus" in job:
        wms_state = _htc_job_status_to_wms_state(job)
    elif "NodeStatus" in job:
        wms_state = _htc_node_status_to_wms_state(job)
    return wms_state


def _htc_job_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given job's status.
    """
    _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"],
               type(job["JobStatus"]))
    job_status = int(job["JobStatus"])
    wms_state = WmsStates.MISFIT

    _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
    if job_status == JobStatus.IDLE:
        wms_state = WmsStates.PENDING
    elif job_status == JobStatus.RUNNING:
        wms_state = WmsStates.RUNNING
    elif job_status == JobStatus.REMOVED:
        wms_state = WmsStates.DELETED
    elif job_status == JobStatus.COMPLETED:
        if job.get("ExitBySignal", False) or job.get("ExitCode", 0) or \
                job.get("ExitSignal", 0) or job.get("DAG_Status", 0) or \
                job.get("ReturnValue", 0):
            wms_state = WmsStates.FAILED
        else:
            wms_state = WmsStates.SUCCEEDED
    elif job_status == JobStatus.HELD:
        wms_state = WmsStates.HELD

    return wms_state


def _htc_node_status_to_wms_state(job):
    """Convert HTCondor node status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given node's status.
    """
    wms_state = WmsStates.MISFIT

    status = job["NodeStatus"]
    if status == NodeStatus.NOT_READY:
        wms_state = WmsStates.UNREADY
    elif status == NodeStatus.READY:
        wms_state = WmsStates.READY
    elif status == NodeStatus.PRERUN:
        wms_state = WmsStates.MISFIT
    elif status == NodeStatus.SUBMITTED:
        if job["JobProcsHeld"]:
            wms_state = WmsStates.HELD
        elif job["StatusDetails"] == "not_idle":
            wms_state = WmsStates.RUNNING
        elif job["JobProcsQueued"]:
            wms_state = WmsStates.PENDING
    elif status == NodeStatus.POSTRUN:
        wms_state = WmsStates.MISFIT
    elif status == NodeStatus.DONE:
        wms_state = WmsStates.SUCCEEDED
    elif status == NodeStatus.ERROR:
        # Use the job's exit status instead of the post script's exit status.
        if "DAGMAN error 0" in job["StatusDetails"]:
            wms_state = WmsStates.SUCCEEDED
        else:
            wms_state = WmsStates.FAILED

    return wms_state


def _update_jobs(jobs1, jobs2):
    """Update jobs1 with info in jobs2.

    (Basically an update for nested dictionaries.)

    Parameters
    ----------
    jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
        HTCondor job information to be updated.
    jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
        Additional HTCondor job information.
    """
    for jid, jinfo in jobs2.items():
        if jid in jobs1:
            jobs1[jid].update(jinfo)
        else:
            jobs1[jid] = jinfo


def _wms_id_to_cluster(wms_id):
    """Convert WMS ID to cluster ID.

    Parameters
    ----------
    wms_id : `int` or `float` or `str`
        HTCondor job id or path.

    Returns
    -------
    cluster_id : `int`
        HTCondor cluster id.
    """
    # If wms_id represents a path, get the numeric id from the DAG log.
    try:
        cluster_id = int(float(wms_id))
    except ValueError:
        wms_path = Path(wms_id)
        if wms_path.exists():
            try:
                cluster_id, _ = read_dag_log(wms_id)
                cluster_id = int(float(cluster_id))
            except StopIteration:
                cluster_id = 0
        else:
            cluster_id = 0
    return cluster_id
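
# Examples (illustrative): _wms_id_to_cluster("1234.0") -> 1234, while a path
# such as a submit directory is resolved by reading the DAG log found there
# (0 is returned when no cluster id can be determined).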


def _create_request_memory_expr(memory, multiplier):
    """Construct an HTCondor ClassAd expression for safe memory scaling.

    Parameters
    ----------
    memory : `int`
        Requested memory in MB.
    multiplier : `float`
        Memory growth rate between retries.

    Returns
    -------
    ad : `str`
        A string representing an HTCondor ClassAd expression enabling safe
        memory scaling between job retries.
    """
    was_mem_exceeded = "LastJobStatus =?= 5 " \
                       "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " \
                       "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"

    # If the job runs for the first time ('MemoryUsage' is not defined), set
    # the required memory to the given value; otherwise scale it up.
    ad = f"ifThenElse({was_mem_exceeded}, " \
         f"ifThenElse(isUndefined(MemoryUsage), {memory}, int({multiplier} * MemoryUsage)), " \
         f"ifThenElse(isUndefined(MemoryUsage), {memory}, max({memory}, MemoryUsage)))"
    return ad
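
# Illustrative expansion (with hypothetical memory=2048 MB and multiplier=2.0),
# the returned ClassAd expression is:
#     ifThenElse(<was_mem_exceeded>,
#                ifThenElse(isUndefined(MemoryUsage), 2048, int(2.0 * MemoryUsage)),
#                ifThenElse(isUndefined(MemoryUsage), 2048, max(2048, MemoryUsage)))
# where <was_mem_exceeded> is the LastJobStatus/LastHoldReasonCode test built above.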