Coverage for python/lsst/ctrl/bps/wms/htcondor/htcondor_service.py : 1%

1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
22"""Interface between generic workflow to HTCondor workflow system.
23"""
25__all__ = ["HTCondorService", "HTCondorWorkflow"]
28import dataclasses
29import os
30import re
31import logging
32from datetime import datetime, timedelta
33from pathlib import Path
35import htcondor
37from ... import (
38 BaseWmsWorkflow,
39 BaseWmsService,
40 GenericWorkflow,
41 GenericWorkflowJob,
42 WmsRunReport,
43 WmsJobReport,
44 WmsStates
45)
46from ...bps_utils import (
47 chdir,
48 create_count_summary
49)
50from .lssthtc import (
51 HTCDag,
52 HTCJob,
53 MISSING_ID,
54 JobStatus,
55 NodeStatus,
56 htc_check_dagman_output,
57 htc_escape,
58 htc_submit_dag,
59 read_dag_log,
60 read_dag_status,
61 read_node_status,
62 condor_history,
63 condor_q,
64 condor_status,
65 pegasus_name_to_label,
66 summary_from_dag,
67)
70DEFAULT_HTC_EXEC_PATT = ".*worker.*"
71"""Default pattern for searching execute machines in an HTCondor pool.
72"""
74_LOG = logging.getLogger(__name__)
77class HTCondorService(BaseWmsService):
78 """HTCondor version of WMS service.
79 """
80 def prepare(self, config, generic_workflow, out_prefix=None):
81 """Convert generic workflow to an HTCondor DAG ready for submission.
83 Parameters
84 ----------
85 config : `lsst.ctrl.bps.BpsConfig`
86 BPS configuration that includes necessary submit/runtime
87 information.
88 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
89 The generic workflow (e.g., has executable name and arguments).
90 out_prefix : `str`
91 The root directory into which all WMS-specific files are written.
93 Returns
94 -------
95 workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow`
96 HTCondor workflow ready to be run.
97 """
98 _LOG.debug("out_prefix = '%s'", out_prefix)
99 workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix,
100 f"{self.__class__.__module__}."
101 f"{self.__class__.__name__}")
102 workflow.write(out_prefix)
103 return workflow
105 def submit(self, workflow):
106 """Submit a single HTCondor workflow.
108 Parameters
109 ----------
110 workflow : `lsst.ctrl.bps.BaseWorkflow`
111 A single HTCondor workflow to submit. run_id is updated after
112 successful submission to WMS.
113 """
114 # For workflow portability, internal paths are all relative. Hence
115 # the DAG needs to be submitted to HTCondor from inside the submit
116 # directory.
117 with chdir(workflow.submit_path):
118 _LOG.info("Submitting from directory: %s", os.getcwd())
119 htc_submit_dag(workflow.dag, {})
120 workflow.run_id = workflow.dag.run_id
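    # Illustrative sketch (not part of the class): the typical prepare/submit
    # flow, assuming a BpsConfig and GenericWorkflow already exist and that
    # the service is constructed from the config. Names and paths here are
    # hypothetical.
    #
    #     >>> service = HTCondorService(config)
    #     >>> workflow = service.prepare(config, generic_workflow, out_prefix="submit/u/run1")
    #     >>> service.submit(workflow)
    #     >>> workflow.run_id   # HTCondor id assigned at submission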
122 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None):
123 """Query WMS for list of submitted WMS workflows/jobs.
125 This should be a quick lookup function to create list of jobs for
126 other functions.
128 Parameters
129 ----------
130 wms_id : `int` or `str`, optional
131 Id or path that can be used by WMS service to look up job.
132 user : `str`, optional
133 User whose submitted jobs should be listed.
134 require_bps : `bool`, optional
135 Whether to require jobs returned in list to be bps-submitted jobs.
136 pass_thru : `str`, optional
137 Information to pass through to WMS.
139 Returns
140 -------
141 job_ids : `list` [`Any`]
142 Only job ids to be used by cancel and other functions. Typically
143 this means top-level jobs (i.e., not child jobs).
144 """
145 _LOG.debug("list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s",
146 wms_id, user, require_bps, pass_thru)
147 constraint = ""
149 if wms_id is None:
150 if user is not None:
151 constraint = f'(Owner == "{user}")'
152 else:
153 cluster_id = _wms_id_to_cluster(wms_id)
154 if cluster_id != 0:
155 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"
157 if require_bps:
158 constraint += ' && (bps_isjob == "True")'
160 if pass_thru:
161 if "-forcex" in pass_thru:
162 pass_thru_2 = pass_thru.replace("-forcex", "")
163 if pass_thru_2 and not pass_thru_2.isspace():
164 constraint += f"&& ({pass_thru_2})"
165 else:
166 constraint += f" && ({pass_thru})"
168 _LOG.debug("constraint = %s", constraint)
169 jobs = condor_q(constraint)
171 # Prune child jobs where DAG job is in queue (i.e., aren't orphans).
172 job_ids = []
173 for job_id, job_info in jobs.items():
174 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_info.get("DAGManJobId", "None"))
175 if "DAGManJobId" not in job_info: # orphaned job
176 job_ids.append(job_id)
177 else:
178 _LOG.debug("Looking for %s", f"{job_info['DAGManJobId']}.0")
179 _LOG.debug("\tin jobs.keys() = %s", jobs.keys())
180 if f"{job_info['DAGManJobId']}.0" not in jobs:
181 job_ids.append(job_id)
183 _LOG.debug("job_ids = %s", job_ids)
184 return job_ids
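    # Illustrative sketch (not part of the class): for a numeric wms_id with
    # require_bps=True, the constraint passed to condor_q would look like
    # (cluster id 1234 is hypothetical)
    #
    #     (DAGManJobId == 1234 || ClusterId == 1234) && (bps_isjob == "True")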
186 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None):
187 """Return run information based upon given constraints.
189 Parameters
190 ----------
191 wms_workflow_id : `str`
192 Limit to specific run based on id.
193 user : `str`
194 Limit results to runs for this user.
195 hist : `float`
196 Limit history search to this many days.
197 pass_thru : `str`
198 Constraints to pass through to HTCondor.
200 Returns
201 -------
202 runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
203 Information about runs from given job information.
204 message : `str`
205 Extra message for report command to print. This could be pointers
206 to documentation or to WMS-specific commands.
207 """
208 message = ""
210 if wms_workflow_id:
211 # Explicitly checking if wms_workflow_id can be converted to a
212 # float instead of using try/except to avoid catching a different
213 # ValueError from _report_from_id
214 try:
215 float(wms_workflow_id)
216 is_float = True
217 except ValueError: # Don't need TypeError here as None goes to else branch.
218 is_float = False
220 if is_float:
221 run_reports, message = _report_from_id(float(wms_workflow_id), hist)
222 else:
223 run_reports, message = _report_from_path(wms_workflow_id)
224 else:
225 run_reports, message = _summary_report(user, hist, pass_thru)
226 _LOG.debug("report: %s, %s", run_reports, message)
228 return list(run_reports.values()), message
230 def cancel(self, wms_id, pass_thru=None):
231 """Cancel submitted workflows/jobs.
233 Parameters
234 ----------
235 wms_id : `str`
236 ID or path of job that should be canceled.
237 pass_thru : `str`, optional
238 Information to pass through to WMS.
240 Returns
241 -------
242 deleted : `bool`
243 Whether the deletion was successful. Currently, if there is any
244 doubt or any individual job was not deleted, False is returned.
245 message : `str`
246 Any message from WMS (e.g., error details).
247 """
248 _LOG.debug("Canceling wms_id = %s", wms_id)
250 cluster_id = _wms_id_to_cluster(wms_id)
251 if cluster_id == 0:
252 deleted = False
253 message = "Invalid id"
254 else:
255 _LOG.debug("Canceling cluster_id = %s", cluster_id)
256 schedd = htcondor.Schedd()
257 constraint = f"ClusterId == {cluster_id}"
258 if pass_thru is not None and "-forcex" in pass_thru:
259 pass_thru_2 = pass_thru.replace("-forcex", "")
260 if pass_thru_2 and not pass_thru_2.isspace():
261 constraint += f"&& ({pass_thru_2})"
262 _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
263 results = schedd.act(htcondor.JobAction.RemoveX, constraint)
264 else:
265 if pass_thru:
266 constraint += f"&& ({pass_thru})"
267 _LOG.debug("JobAction.Remove constraint = %s", constraint)
268 results = schedd.act(htcondor.JobAction.Remove, constraint)
269 _LOG.debug("Remove results: %s", results)
271 if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
272 deleted = True
273 message = ""
274 else:
275 deleted = False
276 if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
277 message = "no such bps job in batch queue"
278 else:
279 message = f"unknown problems deleting: {results}"
281 _LOG.debug("deleted: %s; message = %s", deleted, message)
282 return deleted, message
285class HTCondorWorkflow(BaseWmsWorkflow):
286 """Single HTCondor workflow.
288 Parameters
289 ----------
290 name : `str`
291 Unique name for Workflow used when naming files.
292 config : `lsst.ctrl.bps.BpsConfig`
293 BPS configuration that includes necessary submit/runtime information.
294 """
295 def __init__(self, name, config=None):
296 super().__init__(name, config)
297 self.dag = None
299 @classmethod
300 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
301 # Docstring inherited
302 htc_workflow = cls(generic_workflow.name, config)
303 htc_workflow.dag = HTCDag(name=generic_workflow.name)
305 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
306 htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
307 htc_workflow.dag.add_attribs({"bps_wms_service": service_class,
308 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
309 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
310 "bps_job_summary": create_count_summary(generic_workflow.job_counts)})
312 # Determine the hard limit for the memory requirement.
313 found, limit = config.search('memoryLimit')
314 if not found:
315 search_opts = {"default": DEFAULT_HTC_EXEC_PATT}
316 _, site = config.search("computeSite")
317 if site:
318 search_opts["curvals"] = {"curr_site": site}
319 _, patt = config.search("executeMachinesPattern", opt=search_opts)
321 # To reduce the amount of data, ignore dynamic slots (if any) as,
322 # by definition, they cannot have more memory than
323 # the partitionable slot they are part of.
324 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)'
325 pool_info = condor_status(constraint=constraint)
326 try:
327 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values())
328 except ValueError:
329 _LOG.debug("No execute machine in the pool matches %s", patt)
330 if limit:
331 config[".bps_defined.memory_limit"] = limit
333 # Create all DAG jobs
334 for job_name in generic_workflow:
335 gwjob = generic_workflow.get_job(job_name)
336 htc_job = HTCondorWorkflow._create_job(config, generic_workflow, gwjob, out_prefix)
337 htc_workflow.dag.add_job(htc_job)
339 # Add job dependencies to the DAG
340 for job_name in generic_workflow:
341 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))
343 # If final job exists in generic workflow, create DAG final job
344 final = generic_workflow.get_final()
345 if final and isinstance(final, GenericWorkflowJob):
346 final_htjob = HTCondorWorkflow._create_job(config, generic_workflow, final, out_prefix)
347 if "post" not in final_htjob.dagcmds:
348 final_htjob.dagcmds["post"] = f"{os.path.dirname(__file__)}/final_post.sh" \
349 f" {final.name} $DAG_STATUS $RETURN"
350 htc_workflow.dag.add_final_job(final_htjob)
351 elif final and isinstance(final, GenericWorkflow):
352 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
353 elif final:
354 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
356 return htc_workflow
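    # Illustrative sketch (not part of the class): how the memory hard limit
    # is derived from the pool when 'memoryLimit' is not set in the config.
    # The pool_info shape mirrors what condor_status is assumed to return
    # (slot name -> ClassAd dictionary); all values are hypothetical.
    #
    #     >>> pool_info = {
    #     ...     "slot1@worker01.example.edu": {"TotalSlotMemory": "128000"},
    #     ...     "slot1@worker02.example.edu": {"TotalSlotMemory": "256000"},
    #     ... }
    #     >>> max(int(info["TotalSlotMemory"]) for info in pool_info.values())
    #     256000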
358 @staticmethod
359 def _create_job(config, generic_workflow, gwjob, out_prefix):
360 """Convert GenericWorkflow job nodes to DAG jobs.
362 Parameters
363 ----------
364 config : `lsst.ctrl.bps.BpsConfig`
365 BPS configuration that includes necessary submit/runtime
366 information.
367 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
368 Generic workflow that is being converted.
369 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
370 The generic job to convert to an HTCondor job.
371 out_prefix : `str`
372 Directory prefix for HTCondor files.
374 Returns
375 -------
376 htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
377 The HTCondor job equivalent to the given generic job.
378 """
379 htc_job = HTCJob(gwjob.name, label=gwjob.label)
381 curvals = dataclasses.asdict(gwjob)
382 if gwjob.tags:
383 curvals.update(gwjob.tags)
384 found, subdir = config.search("subDirTemplate", opt={'curvals': curvals})
385 if not found:
386 subdir = "jobs"
387 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"
389 htc_job_cmds = {
390 "universe": "vanilla",
391 "should_transfer_files": "YES",
392 "when_to_transfer_output": "ON_EXIT_OR_EVICT",
393 "transfer_executable": "False",
394 "getenv": "True",
396 # Exceeding memory sometimes triggers a SIGBUS error. Tell HTCondor
397 # to put SIGBUS jobs on hold.
398 "on_exit_hold": "(ExitBySignal == true) && (ExitSignal == 7)",
399 "on_exit_hold_reason": '"Job raised a signal 7. Usually means job has gone over memory limit."',
400 "on_exit_hold_subcode": "34"
401 }
403 htc_job_cmds.update(_translate_job_cmds(config, generic_workflow, gwjob))
405 # job stdout, stderr, htcondor user log.
406 for key in ("output", "error", "log"):
407 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
408 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
410 _, use_shared = config.search("bpsUseShared", opt={"default": False})
411 htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwjob.name, use_shared, out_prefix))
413 # Add the job cmds dict to the job object.
414 htc_job.add_job_cmds(htc_job_cmds)
416 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))
418 # Add job attributes to job.
419 _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
420 htc_job.add_job_attrs(gwjob.attrs)
421 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
422 htc_job.add_job_attrs({"bps_job_name": gwjob.name,
423 "bps_job_label": gwjob.label})
425 return htc_job
427 def write(self, out_prefix):
428 """Output HTCondor DAGMan files needed for workflow submission.
430 Parameters
431 ----------
432 out_prefix : `str`
433 Directory prefix for HTCondor files.
434 """
435 self.submit_path = out_prefix
436 os.makedirs(out_prefix, exist_ok=True)
438 # Write out the workflow in HTCondor format.
439 self.dag.write(out_prefix, "jobs/{self.label}")
442def _translate_job_cmds(config, generic_workflow, gwjob):
443 """Translate the job data that are one to one mapping
445 Parameters
446 ----------
447 config : `lsst.ctrl.bps.BpsConfig`
448 BPS configuration that includes necessary submit/runtime
449 information.
450 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
451 Generic workflow that contains the job being converted.
452 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
453 Generic workflow job to be converted.
455 Returns
456 -------
457 htc_job_commands : `dict` [`str`, `Any`]
458 Contains commands which can appear in the HTCondor submit description
459 file.
460 """
461 # Values in the job script that are just name mappings.
462 job_translation = {"mail_to": "notify_user",
463 "when_to_mail": "notification",
464 "request_cpus": "request_cpus",
465 "priority": "priority",
466 "category": "category"}
468 jobcmds = {}
469 for gwkey, htckey in job_translation.items():
470 jobcmds[htckey] = getattr(gwjob, gwkey, None)
472 # job commands that need modification
473 if gwjob.number_of_retries:
474 jobcmds["max_retries"] = f"{gwjob.number_of_retries}"
476 if gwjob.retry_unless_exit:
477 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"
479 if gwjob.request_disk:
480 jobcmds["request_disk"] = f"{gwjob.request_disk}MB"
482 if gwjob.request_memory:
483 jobcmds["request_memory"] = f"{gwjob.request_memory}"
485 if gwjob.memory_multiplier:
486 # Do not use try-except! At the moment, BpsConfig returns an empty
487 # string if it does not contain the key.
488 memory_limit = config[".bps_defined.memory_limit"]
489 if not memory_limit:
490 raise RuntimeError("Memory autoscaling enabled, but automatic detection of the memory limit "
491 "failed; setting it explicitly with 'memoryLimit' or changing worker node "
492 "search pattern 'executeMachinesPattern' might help.")
493 jobcmds["request_memory"] = _create_request_memory_expr(gwjob.request_memory, gwjob.memory_multiplier)
495 # Periodically release jobs which are being held due to exceeding
496 # memory. Stop doing that (by removing the job from the HTCondor queue)
497 # after the maximal number of retries has been reached or the memory
498 # requirements cannot be satisfied.
499 jobcmds["periodic_release"] = \
500 "NumJobStarts <= JobMaxRetries && (HoldReasonCode == 34 || HoldReasonSubCode == 34)"
501 jobcmds["periodic_remove"] = \
502 f"JobStatus == 1 && RequestMemory > {memory_limit} || " \
503 f"JobStatus == 5 && NumJobStarts > JobMaxRetries"
505 # Assume concurrency_limit implemented using HTCondor concurrency limits.
506 # May need to move to special site-specific implementation if sites use
507 # other mechanisms.
508 if gwjob.concurrency_limit:
509 jobcmds["concurrency_limit"] = gwjob.concurrency_limit
511 # Handle command line
512 if gwjob.executable.transfer_executable:
513 jobcmds["transfer_executable"] = "True"
514 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
515 else:
516 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)
518 if gwjob.arguments:
519 arguments = gwjob.arguments
520 arguments = _replace_cmd_vars(arguments, gwjob)
521 arguments = _replace_file_vars(config, arguments, generic_workflow, gwjob)
522 arguments = _fix_env_var_syntax(arguments)
523 jobcmds["arguments"] = arguments
525 # Add extra "pass-thru" job commands
526 if gwjob.profile:
527 for key, val in gwjob.profile.items():
528 jobcmds[key] = htc_escape(val)
530 return jobcmds
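# Illustrative sketch (not part of the module): with memory autoscaling
# enabled, a job requesting 2048 MB with memory_multiplier 2.0 and a detected
# memory limit of 256000 MB ends up with submit commands along these lines
# (the was_mem_exceeded condition is abbreviated; all numbers hypothetical):
#
#     request_memory = (<was_mem_exceeded>) ? int(2048 * pow(2.0, NumJobStarts)) : max({2048, MemoryUsage ?: 0})
#     periodic_release = NumJobStarts <= JobMaxRetries && (HoldReasonCode == 34 || HoldReasonSubCode == 34)
#     periodic_remove = JobStatus == 1 && RequestMemory > 256000 || JobStatus == 5 && NumJobStarts > JobMaxRetries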
533def _translate_dag_cmds(gwjob):
534 """Translate job values into DAGMan commands.
536 Parameters
537 ----------
538 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
539 Job containing values to be translated.
541 Returns
542 -------
543 dagcmds : `dict` [`str`, `Any`]
544 DAGMan commands for the job.
545 """
546 # Values in the dag script that are just name mappings.
547 dag_translation = {"abort_on_value": "abort_dag_on",
548 "abort_return_value": "abort_exit"}
550 dagcmds = {}
551 for gwkey, htckey in dag_translation.items():
552 dagcmds[htckey] = getattr(gwjob, gwkey, None)
554 # Still to be coded: vars "pre_cmdline", "post_cmdline"
555 return dagcmds
558def _fix_env_var_syntax(oldstr):
559 """Change ENV place holders to HTCondor Env var syntax.
561 Parameters
562 ----------
563 oldstr : `str`
564 String in which environment variable syntax is to be fixed.
566 Returns
567 -------
568 newstr : `str`
569 Given string with environment variable syntax fixed.
570 """
571 newstr = oldstr
572 for key in re.findall(r"<ENV:([^>]+)>", oldstr):
573 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
574 return newstr
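# Illustrative example (not part of the module):
#
#     >>> _fix_env_var_syntax("<ENV:HOME>/repo/butler.yaml --log <ENV:LOGDIR>/run.log")
#     '$ENV(HOME)/repo/butler.yaml --log $ENV(LOGDIR)/run.log'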
577def _replace_file_vars(config, arguments, workflow, gwjob):
578 """Replace file placeholders in command line arguments with correct
579 physical file names.
581 Parameters
582 ----------
583 config : `lsst.ctrl.bps.BpsConfig`
584 BPS configuration that includes necessary submit/runtime
585 information.
586 arguments : `str`
587 Arguments string in which to replace file placeholders.
588 workflow : `lsst.ctrl.bps.GenericWorkflow`
589 Generic workflow that contains file information.
590 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
591 The job corresponding to the arguments.
593 Returns
594 -------
595 arguments : `str`
596 Given arguments string with file placeholders replaced.
597 """
598 _, use_shared = config.search("bpsUseShared", opt={"default": False})
600 # Replace input file placeholders with paths.
601 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
602 if not gwfile.wms_transfer:
603 # Must assume full URI if in command line and told WMS is not
604 # responsible for transferring file.
605 uri = gwfile.src_uri
606 elif use_shared:
607 if gwfile.job_shared:
608 # Have shared filesystems and jobs can share file.
609 uri = gwfile.src_uri
610 else:
611 # Taking advantage of inside knowledge. Not future-proof.
612 # Temporary fix until have job wrapper that pulls files
613 # within job.
614 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml":
615 uri = "butler.yaml"
616 else:
617 uri = os.path.basename(gwfile.src_uri)
618 else: # Using push transfer
619 uri = os.path.basename(gwfile.src_uri)
620 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
622 # Replace output file placeholders with paths.
623 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
624 if not gwfile.wms_transfer:
625 # Must assume full URI if in command line and told WMS is not
626 # responsible for transferring file.
627 uri = gwfile.src_uri
628 elif use_shared:
629 if gwfile.job_shared:
630 # Have shared filesystems and jobs can share file.
631 uri = gwfile.src_uri
632 else:
633 uri = os.path.basename(gwfile.src_uri)
634 else: # Using push transfer
635 uri = os.path.basename(gwfile.src_uri)
636 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
637 return arguments
640def _replace_cmd_vars(arguments, gwjob):
641 """Replace format-style placeholders in arguments.
643 Parameters
644 ----------
645 arguments : `str`
646 Arguments string in which to replace placeholders.
647 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
648 Job containing values to be used to replace placeholders
649 (in particular gwjob.cmdvals).
651 Returns
652 -------
653 arguments : `str`
654 Given arguments string with placeholders replaced.
655 """
656 try:
657 arguments = arguments.format(**gwjob.cmdvals)
658 except (KeyError, TypeError): # TypeError in case None instead of {}
659 _LOG.error("Could not replace command variables:\n"
660 "arguments: %s\n"
661 "cmdvals: %s", arguments, gwjob.cmdvals)
662 raise
663 return arguments
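# Illustrative example (not part of the module): the replacement is plain
# str.format() with gwjob.cmdvals as the mapping. The placeholder names and
# values here are hypothetical.
#
#     >>> "run --qgraph {qgraphFile} --qgraph-node-id {qgraphNodeId}".format(
#     ...     **{"qgraphFile": "job.qgraph", "qgraphNodeId": "42"})
#     'run --qgraph job.qgraph --qgraph-node-id 42'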
666def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str):
667 """Add job input files from generic workflow to job.
669 Parameters
670 ----------
671 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
672 The generic workflow (e.g., has executable name and arguments).
673 job_name : `str`
674 Unique name for the job.
675 use_shared : `bool`
676 Whether job has access to files via shared filesystem.
677 out_prefix : `str`
678 The root directory into which all WMS-specific files are written.
680 Returns
681 -------
682 htc_commands : `dict` [`str`, `str`]
683 HTCondor commands for the job submission script.
684 """
685 htc_commands = {}
686 inputs = []
687 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
688 _LOG.debug("src_uri=%s", gwf_file.src_uri)
690 uri = Path(gwf_file.src_uri)
692 # Note if use_shared and job_shared, don't need to transfer file.
694 if not use_shared: # Copy file using push to job
695 inputs.append(str(uri.relative_to(out_prefix)))
696 elif not gwf_file.job_shared: # Jobs require own copy
698 # if using shared filesystem, but still need copy in job. Use
699 # HTCondor's curl plugin for a local copy.
701 # Execution butler is represented as a directory which the
702 # curl plugin does not handle. Taking advantage of inside
703 # knowledge for temporary fix until have job wrapper that pulls
704 # files within job.
705 if gwf_file.name == "butlerConfig":
706 # The execution butler directory doesn't normally exist until
707 # the submit phase so checking for suffix instead of using
708 # is_dir(). If other non-yaml files exist, they would have a
709 # different gwf_file.name.
710 if uri.suffix == ".yaml": # Single file, so just copy.
711 inputs.append(f"file://{uri}")
712 else:
713 inputs.append(f"file://{uri / 'butler.yaml'}")
714 inputs.append(f"file://{uri / 'gen3.sqlite3'}")
715 elif uri.is_dir():
716 raise RuntimeError("HTCondor plugin cannot transfer directories locally within job "
717 f"({gwf_file.src_uri})")
718 else:
719 inputs.append(f"file://{uri}")
721 if inputs:
722 htc_commands["transfer_input_files"] = ",".join(inputs)
723 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
724 return htc_commands
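# Illustrative sketch (not part of the module): with a shared filesystem
# (use_shared=True) but a per-job copy required, an execution butler
# directory input would be expanded into the two files the curl plugin can
# handle (the path is hypothetical):
#
#     transfer_input_files = file:///repo/submit/run1/EXEC_REPO-run1/butler.yaml,file:///repo/submit/run1/EXEC_REPO-run1/gen3.sqlite3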
727def _report_from_path(wms_path):
728 """Gather run information from a given run directory.
730 Parameters
731 ----------
732 wms_path : `str`
733 The directory containing the submit side files (e.g., HTCondor files).
735 Returns
736 -------
737 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
738 Run information for the detailed report. The key is the HTCondor id
739 and the value is a collection of report information for that run.
740 message : `str`
741 Message to be printed with the summary report.
742 """
743 wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
744 if wms_workflow_id == MISSING_ID:
745 run_reports = {}
746 else:
747 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
748 return run_reports, message
751def _report_from_id(wms_workflow_id, hist):
752 """Gather run information from a given run directory.
754 Parameters
755 ----------
756 wms_workflow_id : `int` or `str`
757 Limit to specific run based on id.
758 hist : `float`
759 Limit history search to this many days.
761 Returns
762 -------
763 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
764 Run information for the detailed report. The key is the HTCondor id
765 and the value is a collection of report information for that run.
766 message : `str`
767 Message to be printed with the summary report.
768 """
769 constraint = f"(DAGManJobId == {int(float(wms_workflow_id))} || ClusterId == " \
770 f"{int(float(wms_workflow_id))})"
771 jobs = condor_q(constraint)
772 if hist:
773 epoch = (datetime.now() - timedelta(days=hist)).timestamp()
774 constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
775 hist_jobs = condor_history(constraint)
776 _update_jobs(jobs, hist_jobs)
778 # keys in dictionary will be strings of format "ClusterId.ProcId"
779 wms_workflow_id = str(wms_workflow_id)
780 if not wms_workflow_id.endswith(".0"):
781 wms_workflow_id += ".0"
783 if wms_workflow_id in jobs:
784 _, path_jobs, message = _get_info_from_path(jobs[wms_workflow_id]["Iwd"])
785 _update_jobs(jobs, path_jobs)
786 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
787 else:
788 run_reports = {}
789 message = f"Found 0 records for run id {wms_workflow_id}"
790 return run_reports, message
793def _get_info_from_path(wms_path):
794 """Gather run information from a given run directory.
796 Parameters
797 ----------
798 wms_path : `str`
799 Directory containing HTCondor files.
801 Returns
802 -------
803 wms_workflow_id : `str`
804 The run id which is a DAGman job id.
805 jobs : `dict` [`str`, `dict` [`str`, `Any`]]
806 Information about jobs read from files in the given directory.
807 The key is the HTCondor id and the value is a dictionary of HTCondor
808 keys and values.
809 message : `str`
810 Message to be printed with the summary report.
811 """
812 try:
813 wms_workflow_id, jobs = read_dag_log(wms_path)
814 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
815 _update_jobs(jobs, read_node_status(wms_path))
816 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)
818 # Add more info for DAGman job
819 job = jobs[wms_workflow_id]
820 job.update(read_dag_status(wms_path))
821 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
822 if "bps_run" not in job:
823 _add_run_info(wms_path, job)
825 message = htc_check_dagman_output(wms_path)
826 _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id,
827 jobs[wms_workflow_id]["total_jobs"])
828 except StopIteration:
829 message = f"Could not find HTCondor files in {wms_path}"
830 _LOG.warning(message)
831 wms_workflow_id = MISSING_ID
832 jobs = {}
834 return wms_workflow_id, jobs, message
837def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
838 """Gather run information to be used in generating summary reports.
840 Parameters
841 ----------
842 wms_workflow_id : `str`
843 The run id (i.e., the DAGMan job id) the detailed report is for.
844 jobs : `dict` [`str`, `dict` [`str`, `Any`]]
845 Mapping of HTCondor job id to job information.
847 Returns
848 -------
849 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
850 Run information for the detailed report. The key is the given HTCondor
851 id and the value is a collection of report information for that run.
852 """
853 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
854 dag_job = jobs[wms_workflow_id]
855 if "total_jobs" not in dag_job or "DAGNodeName" in dag_job:
856 _LOG.error("Job ID %s is not a DAG job.", wms_workflow_id)
857 return {}
858 report = WmsRunReport(wms_id=wms_workflow_id,
859 path=dag_job["Iwd"],
860 label=dag_job.get("bps_job_label", "MISS"),
861 run=dag_job.get("bps_run", "MISS"),
862 project=dag_job.get("bps_project", "MISS"),
863 campaign=dag_job.get("bps_campaign", "MISS"),
864 payload=dag_job.get("bps_payload", "MISS"),
865 operator=_get_owner(dag_job),
866 run_summary=_get_run_summary(dag_job),
867 state=_htc_status_to_wms_state(dag_job),
868 jobs=[],
869 total_number_jobs=dag_job["total_jobs"],
870 job_state_counts=dag_job["state_counts"])
872 try:
873 for job in jobs.values():
874 if job["ClusterId"] != int(float(wms_workflow_id)):
875 job_report = WmsJobReport(wms_id=job["ClusterId"],
876 name=job.get("DAGNodeName", str(job["ClusterId"])),
877 label=job.get("bps_job_label",
878 pegasus_name_to_label(job["DAGNodeName"])),
879 state=_htc_status_to_wms_state(job))
880 if job_report.label == "init":
881 job_report.label = "pipetaskInit"
882 report.jobs.append(job_report)
883 except KeyError as ex:
884 _LOG.error("Job missing key '%s': %s", str(ex), job)
885 raise
887 run_reports = {report.wms_id: report}
888 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
889 return run_reports
892def _summary_report(user, hist, pass_thru):
893 """Gather run information to be used in generating summary reports.
895 Parameters
896 ----------
897 user : `str`
898 Run lookup restricted to given user.
899 hist : `float`
900 How many previous days to search for run information.
901 pass_thru : `str`
902 Advanced users can define the HTCondor constraint to be used
903 when searching queue and history.
905 Returns
906 -------
907 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
908 Run information for the summary report. The keys are HTCondor ids and
909 the values are collections of report information for each run.
910 message : `str`
911 Message to be printed with the summary report.
912 """
913 # only doing summary report so only look for dagman jobs
914 if pass_thru:
915 constraint = pass_thru
916 else:
917 # Notes:
918 # * bps_isjob == 'True' isn't getting set for DAG jobs that are
919 # manually restarted.
920 # * Any job with DAGManJobID isn't a DAG job
921 constraint = 'bps_isjob == "True" && JobUniverse == 7'
922 if user:
923 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'
925 # Check runs in queue.
926 jobs = condor_q(constraint)
928 if hist:
929 epoch = (datetime.now() - timedelta(days=hist)).timestamp()
930 constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
931 hist_jobs = condor_history(constraint)
932 _update_jobs(jobs, hist_jobs)
934 _LOG.debug("Job ids from queue and history %s", jobs.keys())
936 # Have list of DAGMan jobs, need to get run_report info.
937 run_reports = {}
938 for job in jobs.values():
939 total_jobs, state_counts = _get_state_counts_from_dag_job(job)
940 # If didn't get from queue information (e.g., Kerberos bug),
941 # try reading from file.
942 if total_jobs == 0:
943 try:
944 job.update(read_dag_status(job["Iwd"]))
945 total_jobs, state_counts = _get_state_counts_from_dag_job(job)
946 except StopIteration:
947 pass # Don't kill the report if the HTCondor files cannot be found.
949 if "bps_run" not in job:
950 _add_run_info(job["Iwd"], job)
951 report = WmsRunReport(wms_id=str(job.get("ClusterId", MISSING_ID)),
952 path=job["Iwd"],
953 label=job.get("bps_job_label", "MISS"),
954 run=job.get("bps_run", "MISS"),
955 project=job.get("bps_project", "MISS"),
956 campaign=job.get("bps_campaign", "MISS"),
957 payload=job.get("bps_payload", "MISS"),
958 operator=_get_owner(job),
959 run_summary=_get_run_summary(job),
960 state=_htc_status_to_wms_state(job),
961 jobs=[],
962 total_number_jobs=total_jobs,
963 job_state_counts=state_counts)
965 run_reports[report.wms_id] = report
967 return run_reports, ""
970def _add_run_info(wms_path, job):
971 """Find BPS run information elsewhere for runs without bps attributes.
973 Parameters
974 ----------
975 wms_path : `str`
976 Path to submit files for the run.
977 job : `dict` [`str`, `Any`]
978 HTCondor dag job information.
980 Raises
981 ------
982 StopIteration
983 If the file being looked for cannot be found. Permission errors
984 are caught and the job's run is marked with an error.
985 """
986 path = Path(wms_path) / "jobs"
987 try:
988 subfile = next(path.glob("**/*.sub"))
989 except (StopIteration, PermissionError):
990 job["bps_run"] = "Unavailable"
991 else:
992 _LOG.debug("_add_run_info: subfile = %s", subfile)
993 try:
994 with open(subfile, "r") as fh:
995 for line in fh:
996 if line.startswith("+bps_"):
997 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
998 if m:
999 _LOG.debug("Matching line: %s", line)
1000 job[m.group(1)] = m.group(2).replace('"', "")
1001 else:
1002 _LOG.debug("Could not parse attribute: %s", line)
1003 except PermissionError:
1004 job["bps_run"] = "PermissionError"
1005 _LOG.debug("After adding job = %s", job)
1008def _get_owner(job):
1009 """Get the owner of a dag job.
1011 Parameters
1012 ----------
1013 job : `dict` [`str`, `Any`]
1014 HTCondor dag job information.
1016 Returns
1017 -------
1018 owner : `str`
1019 Owner of the dag job.
1020 """
1021 owner = job.get("bps_operator", None)
1022 if not owner:
1023 owner = job.get("Owner", None)
1024 if not owner:
1025 _LOG.warning("Could not get Owner from htcondor job: %s", job)
1026 owner = "MISS"
1027 return owner
1030def _get_run_summary(job):
1031 """Get the run summary for a job.
1033 Parameters
1034 ----------
1035 job : `dict` [`str`, `Any`]
1036 HTCondor dag job information.
1038 Returns
1039 -------
1040 summary : `str`
1041 Number of jobs per PipelineTask label in approximate pipeline order.
1042 Format: <label>:<count>[;<label>:<count>]+
1043 """
1044 summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
1045 if not summary:
1046 summary, _ = summary_from_dag(job["Iwd"])
1047 if not summary:
1048 _LOG.warning("Could not get run summary for htcondor job: %s", job)
1049 _LOG.debug("_get_run_summary: summary=%s", summary)
1051 # Work around runs that label the init job "init" instead of "pipetaskInit".
1052 summary = summary.replace("init:", "pipetaskInit:")
1054 if "pegasus_version" in job and "pegasus" not in summary:
1055 summary += ";pegasus:0"
1057 return summary
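# Illustrative example (not part of the module): a run summary string follows
# the <label>:<count>[;<label>:<count>]+ format described above, e.g.
# (labels and counts hypothetical)
#
#     pipetaskInit:1;isr:180;characterizeImage:180;calibrate:180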
1060def _get_state_counts_from_jobs(wms_workflow_id, jobs):
1061 """Count number of jobs per WMS state.
1063 Parameters
1064 ----------
1065 wms_workflow_id : `str`
1066 HTCondor job id.
1067 jobs : `dict` [`str`, `Any`]
1068 HTCondor dag job information.
1070 Returns
1071 -------
1072 total_count : `int`
1073 Total number of dag nodes.
1074 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
1075 Keys are the different WMS states and values are counts of jobs
1076 that are in that WMS state.
1077 """
1078 state_counts = dict.fromkeys(WmsStates, 0)
1080 for jid, jinfo in jobs.items():
1081 if jid != wms_workflow_id:
1082 state_counts[_htc_status_to_wms_state(jinfo)] += 1
1084 total_counted = sum(state_counts.values())
1085 if "NodesTotal" in jobs[wms_workflow_id]:
1086 total_count = jobs[wms_workflow_id]["NodesTotal"]
1087 else:
1088 total_count = total_counted
1090 state_counts[WmsStates.UNREADY] += total_count - total_counted
1092 return total_count, state_counts
1095def _get_state_counts_from_dag_job(job):
1096 """Count number of jobs per WMS state.
1098 Parameters
1099 ----------
1100 job : `dict` [`str`, `Any`]
1101 HTCondor dag job information.
1103 Returns
1104 -------
1105 total_count : `int`
1106 Total number of dag nodes.
1107 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
1108 Keys are the different WMS states and values are counts of jobs
1109 that are in that WMS state.
1110 """
1111 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
1112 state_counts = dict.fromkeys(WmsStates, 0)
1113 if "DAG_NodesReady" in job:
1114 state_counts = {
1115 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
1116 WmsStates.READY: job.get("DAG_NodesReady", 0),
1117 WmsStates.HELD: job.get("JobProcsHeld", 0),
1118 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
1119 WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
1120 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)}
1121 total_jobs = job.get("DAG_NodesTotal")
1122 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
1123 elif "NodesFailed" in job:
1124 state_counts = {
1125 WmsStates.UNREADY: job.get("NodesUnready", 0),
1126 WmsStates.READY: job.get("NodesReady", 0),
1127 WmsStates.HELD: job.get("JobProcsHeld", 0),
1128 WmsStates.SUCCEEDED: job.get("NodesDone", 0),
1129 WmsStates.FAILED: job.get("NodesFailed", 0),
1130 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)}
1131 try:
1132 total_jobs = job["NodesTotal"]
1133 except KeyError as ex:
1134 _LOG.error("Job missing %s. job = %s", str(ex), job)
1135 raise
1136 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
1137 else:
1138 # With Kerberos job auth and the Kerberos bug, a warning here would
1139 # be printed for every DAG, so only log at debug level.
1140 _LOG.debug("Can't get job state counts %s", job["Iwd"])
1141 total_jobs = 0
1143 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
1144 return total_jobs, state_counts
1147def _htc_status_to_wms_state(job):
1148 """Convert HTCondor job status to generic wms state.
1150 Parameters
1151 ----------
1152 job : `dict` [`str`, `Any`]
1153 HTCondor job information.
1155 Returns
1156 -------
1157 wms_state : `WmsStates`
1158 The equivalent WmsState to given job's status.
1159 """
1160 wms_state = WmsStates.MISFIT
1161 if "JobStatus" in job:
1162 wms_state = _htc_job_status_to_wms_state(job)
1163 elif "NodeStatus" in job:
1164 wms_state = _htc_node_status_to_wms_state(job)
1165 return wms_state
1168def _htc_job_status_to_wms_state(job):
1169 """Convert HTCondor job status to generic wms state.
1171 Parameters
1172 ----------
1173 job : `dict` [`str`, `Any`]
1174 HTCondor job information.
1176 Returns
1177 -------
1178 wms_state : `lsst.ctrl.bps.WmsStates`
1179 The equivalent WmsState to given job's status.
1180 """
1181 _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"],
1182 type(job["JobStatus"]))
1183 job_status = int(job["JobStatus"])
1184 wms_state = WmsStates.MISFIT
1186 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
1187 if job_status == JobStatus.IDLE:
1188 wms_state = WmsStates.PENDING
1189 elif job_status == JobStatus.RUNNING:
1190 wms_state = WmsStates.RUNNING
1191 elif job_status == JobStatus.REMOVED:
1192 wms_state = WmsStates.DELETED
1193 elif job_status == JobStatus.COMPLETED:
1194 if job.get("ExitBySignal", False) or job.get("ExitCode", 0) or \
1195 job.get("ExitSignal", 0) or job.get("DAG_Status", 0) or \
1196 job.get("ReturnValue", 0):
1197 wms_state = WmsStates.FAILED
1198 else:
1199 wms_state = WmsStates.SUCCEEDED
1200 elif job_status == JobStatus.HELD:
1201 wms_state = WmsStates.HELD
1203 return wms_state
1206def _htc_node_status_to_wms_state(job):
1207 """Convert HTCondor status to generic wms state.
1209 Parameters
1210 ----------
1211 job : `dict` [`str`, `Any`]
1212 HTCondor job information.
1214 Returns
1215 -------
1216 wms_state : `lsst.ctrl.bps.WmsStates`
1217 The equivalent WmsState to given node's status.
1218 """
1219 wms_state = WmsStates.MISFIT
1221 status = job["NodeStatus"]
1222 if status == NodeStatus.NOT_READY:
1223 wms_state = WmsStates.UNREADY
1224 elif status == NodeStatus.READY:
1225 wms_state = WmsStates.READY
1226 elif status == NodeStatus.PRERUN:
1227 wms_state = WmsStates.MISFIT
1228 elif status == NodeStatus.SUBMITTED:
1229 if job["JobProcsHeld"]:
1230 wms_state = WmsStates.HELD
1231 elif job["StatusDetails"] == "not_idle":
1232 wms_state = WmsStates.RUNNING
1233 elif job["JobProcsQueued"]:
1234 wms_state = WmsStates.PENDING
1235 elif status == NodeStatus.POSTRUN:
1236 wms_state = WmsStates.MISFIT
1237 elif status == NodeStatus.DONE:
1238 wms_state = WmsStates.SUCCEEDED
1239 elif status == NodeStatus.ERROR:
1240 # Use the job exit status instead of the post script exit status.
1241 if "DAGMAN error 0" in job["StatusDetails"]:
1242 wms_state = WmsStates.SUCCEEDED
1243 else:
1244 wms_state = WmsStates.FAILED
1246 return wms_state
1249def _update_jobs(jobs1, jobs2):
1250 """Update jobs1 with info in jobs2.
1252 (Basically an update for nested dictionaries.)
1254 Parameters
1255 ----------
1256 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
1257 HTCondor job information to be updated.
1258 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
1259 Additional HTCondor job information.
1260 """
1261 for jid, jinfo in jobs2.items():
1262 if jid in jobs1:
1263 jobs1[jid].update(jinfo)
1264 else:
1265 jobs1[jid] = jinfo
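# Illustrative example (not part of the module):
#
#     >>> jobs1 = {"100.0": {"JobStatus": 2}}
#     >>> jobs2 = {"100.0": {"Iwd": "/submit/run1"}, "101.0": {"JobStatus": 1}}
#     >>> _update_jobs(jobs1, jobs2)
#     >>> jobs1
#     {'100.0': {'JobStatus': 2, 'Iwd': '/submit/run1'}, '101.0': {'JobStatus': 1}}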
1268def _wms_id_to_cluster(wms_id):
1269 """Convert WMS ID to cluster ID.
1271 Parameters
1272 ----------
1273 wms_id : `int` or `float` or `str`
1274 HTCondor job id or path.
1276 Returns
1277 -------
1278 cluster_id : `int`
1279 HTCondor cluster id.
1280 """
1281 # If wms_id represents path, get numeric id.
1282 try:
1283 cluster_id = int(float(wms_id))
1284 except ValueError:
1285 wms_path = Path(wms_id)
1286 if wms_path.exists():
1287 try:
1288 cluster_id, _ = read_dag_log(wms_id)
1289 cluster_id = int(float(cluster_id))
1290 except StopIteration:
1291 cluster_id = 0
1292 else:
1293 cluster_id = 0
1294 return cluster_id
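# Illustrative examples (not part of the module): a numeric id is truncated
# to its cluster id, while a nonexistent path yields 0 (the path is
# hypothetical).
#
#     >>> _wms_id_to_cluster("1234.0")
#     1234
#     >>> _wms_id_to_cluster("/no/such/submit/dir")
#     0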
1297def _create_request_memory_expr(memory, multiplier):
1298 """Construct an HTCondor ClassAd expression for safe memory scaling.
1300 Parameters
1301 ----------
1302 memory : `int`
1303 Requested memory in MB.
1304 multiplier : `float`
1305 Memory growth rate between retries.
1307 Returns
1308 -------
1309 ad : `str`
1310 A string representing an HTCondor ClassAd expression enabling safe
1311 memory scaling between job retries.
1312 """
1313 # ClassAds 'Last*' are UNDEFINED when a job is put in the job queue.
1314 # The special comparison operators ensure that all comparisons below will
1315 # evaluate to FALSE in this case.
1316 was_mem_exceeded = "LastJobStatus =?= 5 " \
1317 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " \
1318 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"
1320 # If the job runs for the first time or was held for reasons other
1321 # than exceeding memory, set the required memory to the requested
1322 # value or to the memory value measured by HTCondor (MemoryUsage),
1323 # whichever is greater.
1324 ad = f"({was_mem_exceeded}) " \
1325 f"? int({memory} * pow({multiplier}, NumJobStarts)) " \
1326 f": max({{{memory}, MemoryUsage ?: 0}}))"
1327 return ad
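# Illustrative example (not part of the module): for memory=2048 and
# multiplier=2.0 the returned request_memory expression is, reformatted for
# readability,
#
#     (LastJobStatus =?= 5
#      && (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0
#          || LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34))
#     ? int(2048 * pow(2.0, NumJobStarts))
#     : max({2048, MemoryUsage ?: 0})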