Coverage for python/lsst/ctrl/bps/wms/htcondor/htcondor_service.py : 1%

# This file is part of ctrl_bps.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
22"""Interface between generic workflow to HTCondor workflow system.
23"""

__all__ = ["HTCondorService", "HTCondorWorkflow"]


import dataclasses
import os
import re
import logging
from datetime import datetime, timedelta
from pathlib import Path

import htcondor

from ... import (
    BaseWmsWorkflow,
    BaseWmsService,
    GenericWorkflow,
    GenericWorkflowJob,
    WmsRunReport,
    WmsJobReport,
    WmsStates
)
from ...bps_utils import (
    chdir,
    create_count_summary
)
from .lssthtc import (
    HTCDag,
    HTCJob,
    MISSING_ID,
    JobStatus,
    NodeStatus,
    htc_check_dagman_output,
    htc_escape,
    htc_submit_dag,
    read_dag_log,
    read_dag_status,
    read_node_status,
    condor_history,
    condor_q,
    condor_status,
    pegasus_name_to_label,
    summary_from_dag,
)


DEFAULT_HTC_EXEC_PATT = ".*worker.*"
"""Default pattern for searching execute machines in an HTCondor pool.
"""

_LOG = logging.getLogger(__name__)


class HTCondorService(BaseWmsService):
    """HTCondor version of WMS service.
    """
    def prepare(self, config, generic_workflow, out_prefix=None):
        """Convert generic workflow to an HTCondor DAG ready for submission.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The generic workflow (e.g., has executable name and arguments).
        out_prefix : `str`
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow`
            HTCondor workflow ready to be run.
        """
        _LOG.debug("out_prefix = '%s'", out_prefix)
        workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix,
                                                          f"{self.__class__.__module__}."
                                                          f"{self.__class__.__name__}")
        workflow.write(out_prefix)
        return workflow

    def submit(self, workflow):
        """Submit a single HTCondor workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWorkflow`
            A single HTCondor workflow to submit.  run_id is updated after
            successful submission to WMS.
        """
        # For workflow portability, internal paths are all relative. Hence
        # the DAG needs to be submitted to HTCondor from inside the submit
        # directory.
        with chdir(workflow.submit_path):
            _LOG.info("Submitting from directory: %s", os.getcwd())
            htc_submit_dag(workflow.dag, {})
            workflow.run_id = workflow.dag.run_id

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions.  Typically
            this means top-level jobs (i.e., not children jobs).
        """
        _LOG.debug("list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s",
                   wms_id, user, require_bps, pass_thru)
        constraint = ""

        if wms_id is None:
            if user is not None:
                constraint = f'(Owner == "{user}")'
        else:
            cluster_id = _wms_id_to_cluster(wms_id)
            if cluster_id != 0:
                constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"

        if require_bps:
            constraint += ' && (bps_isjob == "True")'

        if pass_thru:
            if "-forcex" in pass_thru:
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f"&& ({pass_thru_2})"
            else:
                constraint += f" && ({pass_thru})"

        _LOG.debug("constraint = %s", constraint)
        jobs = condor_q(constraint)

        # Prune child jobs where DAG job is in queue (i.e., aren't orphans).
        job_ids = []
        for job_id, job_info in jobs.items():
            _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_info.get("DAGManJobId", "None"))
            if "DAGManJobId" not in job_info:  # orphaned job
                job_ids.append(job_id)
            else:
                _LOG.debug("Looking for %s", f"{job_info['DAGManJobId']}.0")
                _LOG.debug("\tin jobs.keys() = %s", jobs.keys())
                if f"{job_info['DAGManJobId']}.0" not in jobs:
                    job_ids.append(job_id)

        _LOG.debug("job_ids = %s", job_ids)
        return job_ids

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None):
        """Return run information based upon given constraints.

        Parameters
        ----------
        wms_workflow_id : `str`
            Limit to specific run based on id.
        user : `str`
            Limit results to runs for this user.
        hist : `float`
            Limit history search to this many days.
        pass_thru : `str`
            Constraints to pass through to HTCondor.

        Returns
        -------
        runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
            Information about runs from given job information.
        message : `str`
            Extra message for report command to print.  This could be pointers
            to documentation or to WMS specific commands.
        """
        message = ""

        if wms_workflow_id:
            # Explicitly checking if wms_workflow_id can be converted to a
            # float instead of using try/except to avoid catching a different
            # ValueError from _report_from_id
            try:
                float(wms_workflow_id)
                is_float = True
            except ValueError:  # Don't need TypeError here as None goes to else branch.
                is_float = False

            if is_float:
                run_reports, message = _report_from_id(float(wms_workflow_id), hist)
            else:
                run_reports, message = _report_from_path(wms_workflow_id)
        else:
            run_reports, message = _summary_report(user, hist, pass_thru)
        _LOG.debug("report: %s, %s", run_reports, message)

        return list(run_reports.values()), message

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            ID or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether successful deletion or not.  Currently, if any doubt or any
            individual jobs not deleted, return False.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        _LOG.debug("Canceling wms_id = %s", wms_id)

        cluster_id = _wms_id_to_cluster(wms_id)
        if cluster_id == 0:
            deleted = False
            message = "Invalid id"
        else:
            _LOG.debug("Canceling cluster_id = %s", cluster_id)
            schedd = htcondor.Schedd()
            constraint = f"ClusterId == {cluster_id}"
            if pass_thru is not None and "-forcex" in pass_thru:
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f"&& ({pass_thru_2})"
                _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.RemoveX, constraint)
            else:
                if pass_thru:
                    constraint += f"&& ({pass_thru})"
                _LOG.debug("JobAction.Remove constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.Remove, constraint)
            _LOG.debug("Remove results: %s", results)

            if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
                deleted = True
                message = ""
            else:
                deleted = False
                if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
                    message = "no such bps job in batch queue"
                else:
                    message = f"unknown problems deleting: {results}"

        _LOG.debug("deleted: %s; message = %s", deleted, message)
        return deleted, message


class HTCondorWorkflow(BaseWmsWorkflow):
    """Single HTCondor workflow.

    Parameters
    ----------
    name : `str`
        Unique name for Workflow used when naming files.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """
    def __init__(self, name, config=None):
        super().__init__(name, config)
        self.dag = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited
        htc_workflow = cls(generic_workflow.name, config)
        htc_workflow.dag = HTCDag(name=generic_workflow.name)

        _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs({"bps_wms_service": service_class,
                                      "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
                                      "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
                                      "bps_job_summary": create_count_summary(generic_workflow.job_counts)})

        # Determine the hard limit for the memory requirement.
        found, limit = config.search('memoryLimit')
        if not found:
            search_opts = {"default": DEFAULT_HTC_EXEC_PATT}
            _, site = config.search("computeSite")
            if site:
                search_opts["curvals"] = {"curr_site": site}
            _, patt = config.search("executeMachinesPattern", opt=search_opts)

            # To reduce the amount of data, ignore dynamic slots (if any) as,
            # by definition, they cannot have more memory than
            # the partitionable slot they are part of.
            constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)'
            pool_info = condor_status(constraint=constraint)
            try:
                limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values())
            except ValueError:
                _LOG.debug("No execute machine in the pool matches %s", patt)
        if limit:
            config[".bps_defined.memory_limit"] = limit

        # Create all DAG jobs
        for job_name in generic_workflow:
            gwjob = generic_workflow.get_job(job_name)
            htc_job = HTCondorWorkflow._create_job(config, generic_workflow, gwjob, out_prefix)
            htc_workflow.dag.add_job(htc_job)

        # Add job dependencies to the DAG
        for job_name in generic_workflow:
            htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))

        # If final job exists in generic workflow, create DAG final job
        final = generic_workflow.get_final()
        if final and isinstance(final, GenericWorkflowJob):
            final_htjob = HTCondorWorkflow._create_job(config, generic_workflow, final, out_prefix)
            if "post" not in final_htjob.dagcmds:
                final_htjob.dagcmds["post"] = f"{os.path.dirname(__file__)}/final_post.sh" \
                                              f" {final.name} $DAG_STATUS $RETURN"
            htc_workflow.dag.add_final_job(final_htjob)
        elif final and isinstance(final, GenericWorkflow):
            raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
        elif final:
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

        return htc_workflow

    @staticmethod
    def _create_job(config, generic_workflow, gwjob, out_prefix):
        """Convert GenericWorkflow job nodes to DAG jobs.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow that is being converted.
        gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
            The generic job to convert to an HTCondor job.
        out_prefix : `str`
            Directory prefix for HTCondor files.

        Returns
        -------
        htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
            The HTCondor job equivalent to the given generic job.
        """
        htc_job = HTCJob(gwjob.name, label=gwjob.label)

        curvals = dataclasses.asdict(gwjob)
        if gwjob.tags:
            curvals.update(gwjob.tags)
        found, subdir = config.search("subDirTemplate", opt={'curvals': curvals})
        if not found:
            subdir = "jobs"
        htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"

        htc_job_cmds = {
            "universe": "vanilla",
            "should_transfer_files": "YES",
            "when_to_transfer_output": "ON_EXIT_OR_EVICT",
            "transfer_executable": "False",
            "getenv": "True",

            # Exceeding memory sometimes triggers a SIGBUS error. Tell HTCondor
            # to put SIGBUS jobs on hold.
            "on_exit_hold": "(ExitBySignal == true) && (ExitSignal == 7)",
            "on_exit_hold_reason": '"Job raised a signal 7. Usually means job has gone over memory limit."',
            "on_exit_hold_subcode": "34"
        }
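
        # The hold expression above works together with the periodic_release
        # and periodic_remove expressions added by _translate_job_cmds: on
        # Linux, signal 7 is SIGBUS, and the subcode 34 is chosen to match the
        # HoldReasonCode/HoldReasonSubCode value those expressions look for,
        # so holds caused by the signal are handled the same way as HTCondor's
        # own memory-exceeded holds.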

        htc_job_cmds.update(_translate_job_cmds(config, generic_workflow, gwjob))

        # job stdout, stderr, htcondor user log.
        for key in ("output", "error", "log"):
            htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
            _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])

        _, use_shared = config.search("bpsUseShared", opt={"default": False})
        htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwjob.name, use_shared, out_prefix))

        # Add the job cmds dict to the job object.
        htc_job.add_job_cmds(htc_job_cmds)

        htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))

        # Add job attributes to job.
        _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
        htc_job.add_job_attrs(gwjob.attrs)
        htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
        htc_job.add_job_attrs({"bps_job_name": gwjob.name,
                               "bps_job_label": gwjob.label})

        return htc_job

    def write(self, out_prefix):
        """Output HTCondor DAGMan files needed for workflow submission.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for HTCondor files.
        """
        self.submit_path = out_prefix
        os.makedirs(out_prefix, exist_ok=True)

        # Write down the workflow in HTCondor format.
        self.dag.write(out_prefix, "jobs/{self.label}")


def _translate_job_cmds(config, generic_workflow, gwjob):
    """Translate the job data that have a one-to-one mapping to HTCondor
    submit commands.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains the job being converted.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Generic workflow job to be converted.

    Returns
    -------
    htc_job_commands : `dict` [`str`, `Any`]
        Contains commands which can appear in the HTCondor submit description
        file.
    """
    # Values in the job script that just are name mappings.
    job_translation = {"mail_to": "notify_user",
                       "when_to_mail": "notification",
                       "request_cpus": "request_cpus",
                       "priority": "priority",
                       "category": "category"}

    jobcmds = {}
    for gwkey, htckey in job_translation.items():
        jobcmds[htckey] = getattr(gwjob, gwkey, None)

    # job commands that need modification
    if gwjob.number_of_retries:
        jobcmds["max_retries"] = f"{gwjob.number_of_retries}"

    if gwjob.retry_unless_exit:
        jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"

    if gwjob.request_disk:
        jobcmds["request_disk"] = f"{gwjob.request_disk}MB"

    if gwjob.request_memory:
        jobcmds["request_memory"] = f"{gwjob.request_memory}"

    if gwjob.memory_multiplier:
        # Do not use try-except! At the moment, BpsConfig returns an empty
        # string if it does not contain the key.
        memory_limit = config[".bps_defined.memory_limit"]
        if not memory_limit:
            raise RuntimeError("Memory autoscaling enabled, but automatic detection of the memory limit "
                               "failed; setting it explicitly with 'memoryLimit' or changing worker node "
                               "search pattern 'executeMachinesPattern' might help.")
        jobcmds["request_memory"] = _create_request_memory_expr(gwjob.request_memory, gwjob.memory_multiplier)

        # Periodically release jobs which are being held due to exceeding
        # memory. Stop doing that (by removing the job from the HTCondor
        # queue) after the maximal number of retries has been reached or the
        # memory requirements cannot be satisfied.
        jobcmds["periodic_release"] = \
            "NumJobStarts <= JobMaxRetries && (HoldReasonCode == 34 || HoldReasonSubCode == 34)"
        jobcmds["periodic_remove"] = \
            f"JobStatus == 1 && RequestMemory > {memory_limit} || " \
            f"JobStatus == 5 && NumJobStarts > JobMaxRetries"

    # Assume concurrency_limit implemented using HTCondor concurrency limits.
    # May need to move to special site-specific implementation if sites use
    # other mechanisms.
    if gwjob.concurrency_limit:
        jobcmds["concurrency_limit"] = gwjob.concurrency_limit

    # Handle command line
    if gwjob.executable.transfer_executable:
        jobcmds["transfer_executable"] = "True"
        jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
    else:
        jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)

    if gwjob.arguments:
        arguments = gwjob.arguments
        arguments = _replace_cmd_vars(arguments, gwjob)
        arguments = _replace_file_vars(config, arguments, generic_workflow, gwjob)
        arguments = _fix_env_var_syntax(arguments)
        jobcmds["arguments"] = arguments

    # Add extra "pass-thru" job commands
    if gwjob.profile:
        for key, val in gwjob.profile.items():
            jobcmds[key] = htc_escape(val)

    return jobcmds
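
# For illustration only (the job values here are hypothetical): for a generic
# job requesting 2048 MB of memory and allowing 3 retries, _translate_job_cmds
# returns entries along the lines of
#     {"request_memory": "2048", "max_retries": "3", "request_cpus": 1,
#      "executable": "/path/to/some_script.sh", "arguments": "..."}
# which _create_job then merges with the fixed submit commands defined there.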


def _translate_dag_cmds(gwjob):
    """Translate job values into DAGMan commands.

    Parameters
    ----------
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job containing values to be translated.

    Returns
    -------
    dagcmds : `dict` [`str`, `Any`]
        DAGMan commands for the job.
    """
    # Values in the dag script that just are name mappings.
    dag_translation = {"abort_on_value": "abort_dag_on",
                       "abort_return_value": "abort_exit"}

    dagcmds = {}
    for gwkey, htckey in dag_translation.items():
        dagcmds[htckey] = getattr(gwjob, gwkey, None)

    # Still to be coded: vars "pre_cmdline", "post_cmdline"
    return dagcmds


def _fix_env_var_syntax(oldstr):
    """Change ENV placeholders to HTCondor Env var syntax.

    Parameters
    ----------
    oldstr : `str`
        String in which environment variable syntax is to be fixed.

    Returns
    -------
    newstr : `str`
        Given string with environment variable syntax fixed.
    """
    newstr = oldstr
    for key in re.findall(r"<ENV:([^>]+)>", oldstr):
        newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
    return newstr
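
# For example, _fix_env_var_syntax("<ENV:HOME>/repo/butler.yaml") returns
# "$ENV(HOME)/repo/butler.yaml"; strings without <ENV:...> placeholders are
# returned unchanged.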


def _replace_file_vars(config, arguments, workflow, gwjob):
    """Replace file placeholders in command line arguments with correct
    physical file names.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information.
    arguments : `str`
        Arguments string in which to replace file placeholders.
    workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains file information.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        The job corresponding to the arguments.

    Returns
    -------
    arguments : `str`
        Given arguments string with file placeholders replaced.
    """
    _, use_shared = config.search("bpsUseShared", opt={"default": False})

    # Replace input file placeholders with paths.
    for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
        if not gwfile.wms_transfer:
            # Must assume full URI if in command line and told WMS is not
            # responsible for transferring file.
            uri = gwfile.src_uri
        elif use_shared:
            if gwfile.job_shared:
                # Have shared filesystems and jobs can share file.
                uri = gwfile.src_uri
            else:
                # Taking advantage of inside knowledge.  Not future-proof.
                # Temporary fix until have job wrapper that pulls files
                # within job.
                if gwfile.name == "butlerConfig" and not Path(gwfile.src_uri).suffix:
                    uri = "butler.yaml"
                else:
                    uri = os.path.basename(gwfile.src_uri)
        else:  # Using push transfer
            uri = os.path.basename(gwfile.src_uri)
        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)

    # Replace output file placeholders with paths.
    for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
        if not gwfile.wms_transfer:
            # Must assume full URI if in command line and told WMS is not
            # responsible for transferring file.
            uri = gwfile.src_uri
        elif use_shared:
            if gwfile.job_shared:
                # Have shared filesystems and jobs can share file.
                uri = gwfile.src_uri
            else:
                uri = os.path.basename(gwfile.src_uri)
        else:  # Using push transfer
            uri = os.path.basename(gwfile.src_uri)
        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
    return arguments


def _replace_cmd_vars(arguments, gwjob):
    """Replace format-style placeholders in arguments.

    Parameters
    ----------
    arguments : `str`
        Arguments string in which to replace placeholders.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job containing values to be used to replace placeholders
        (in particular gwjob.cmdvals).

    Returns
    -------
    arguments : `str`
        Given arguments string with placeholders replaced.
    """
    try:
        arguments = arguments.format(**gwjob.cmdvals)
    except (KeyError, TypeError):  # TypeError in case None instead of {}
        _LOG.error("Could not replace command variables:\n"
                   "arguments: %s\n"
                   "cmdvals: %s", arguments, gwjob.cmdvals)
        raise
    return arguments
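
# For example, with hypothetical cmdvals {"butlerConfig": "repo",
# "qgraphFile": "job.qgraph"}, the arguments string
# "-b {butlerConfig} -g {qgraphFile}" becomes "-b repo -g job.qgraph"; a
# placeholder missing from cmdvals triggers the KeyError handling above.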


def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str):
    """Add job input files from generic workflow to job.

    Parameters
    ----------
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        The generic workflow (e.g., has executable name and arguments).
    job_name : `str`
        Unique name for the job.
    use_shared : `bool`
        Whether job has access to files via shared filesystem.
    out_prefix : `str`
        The root directory into which all WMS-specific files are written.

    Returns
    -------
    htc_commands : `dict` [`str`, `str`]
        HTCondor commands for the job submission script.
    """
    htc_commands = {}
    inputs = []
    for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
        _LOG.debug("src_uri=%s", gwf_file.src_uri)

        uri = Path(gwf_file.src_uri)

        # Note if use_shared and job_shared, don't need to transfer file.

        if not use_shared:  # Copy file using push to job
            inputs.append(str(uri.relative_to(out_prefix)))
        elif not gwf_file.job_shared:  # Jobs require own copy

            # if using shared filesystem, but still need copy in job. Use
            # HTCondor's curl plugin for a local copy.

            # Execution butler is represented as a directory which the
            # curl plugin does not handle. Taking advantage of inside
            # knowledge for temporary fix until have job wrapper that pulls
            # files within job.
            if gwf_file.name == "butlerConfig":
                # The execution butler directory doesn't normally exist
                # until the submit phase so checking for suffix instead
                # of using is_dir().
                if uri.suffix:  # Single file, so just copy.
                    inputs.append(f"file://{uri}")
                else:
                    inputs.append(f"file://{uri / 'butler.yaml'}")
                    inputs.append(f"file://{uri / 'gen3.sqlite3'}")
            elif uri.is_dir():
                raise RuntimeError("HTCondor plugin cannot transfer directories locally within job "
                                   f"({gwf_file.src_uri})")
            else:
                inputs.append(f"file://{uri}")

    if inputs:
        htc_commands["transfer_input_files"] = ",".join(inputs)
        _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
    return htc_commands
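
# The resulting submit command is a comma-separated list, e.g. (hypothetical
# file names):
#     transfer_input_files = jobs/label1/quantum.qgraph,butler.yaml
# Paths are relative to out_prefix when files are pushed to the job
# (use_shared=False); with a shared filesystem but per-job copies they are
# file:// URLs handled by HTCondor's curl plugin.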


def _report_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        The directory containing the submit side files (e.g., HTCondor files).

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report.  The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
    if wms_workflow_id == MISSING_ID:
        run_reports = {}
    else:
        run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
    return run_reports, message


def _report_from_id(wms_workflow_id, hist):
    """Gather run information for a given run id.

    Parameters
    ----------
    wms_workflow_id : `int` or `str`
        Limit to specific run based on id.
    hist : `float`
        Limit history search to this many days.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report.  The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    constraint = f"(DAGManJobId == {int(float(wms_workflow_id))} || ClusterId == " \
                 f"{int(float(wms_workflow_id))})"
    jobs = condor_q(constraint)
    if hist:
        epoch = (datetime.now() - timedelta(days=hist)).timestamp()
        constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
        hist_jobs = condor_history(constraint)
        _update_jobs(jobs, hist_jobs)

    # keys in dictionary will be strings of format "ClusterId.ProcId"
    wms_workflow_id = str(wms_workflow_id)
    if not wms_workflow_id.endswith(".0"):
        wms_workflow_id += ".0"

    if wms_workflow_id in jobs:
        _, path_jobs, message = _get_info_from_path(jobs[wms_workflow_id]["Iwd"])
        _update_jobs(jobs, path_jobs)
        run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
    else:
        run_reports = {}
        message = f"Found 0 records for run id {wms_workflow_id}"
    return run_reports, message


def _get_info_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        Directory containing HTCondor files.

    Returns
    -------
    wms_workflow_id : `str`
        The run id which is a DAGman job id.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Information about jobs read from files in the given directory.
        The key is the HTCondor id and the value is a dictionary of HTCondor
        keys and values.
    message : `str`
        Message to be printed with the summary report.
    """
    try:
        wms_workflow_id, jobs = read_dag_log(wms_path)
        _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
        _update_jobs(jobs, read_node_status(wms_path))
        _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)

        # Add more info for DAGman job
        job = jobs[wms_workflow_id]
        job.update(read_dag_status(wms_path))
        job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
        if "bps_run" not in job:
            _add_run_info(wms_path, job)

        message = htc_check_dagman_output(wms_path)
        _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id,
                   jobs[wms_workflow_id]["total_jobs"])
    except StopIteration:
        message = f"Could not find HTCondor files in {wms_path}"
        _LOG.warning(message)
        wms_workflow_id = MISSING_ID
        jobs = {}

    return wms_workflow_id, jobs, message


def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
    """Gather run information to be used in generating a detailed report.

    Parameters
    ----------
    wms_workflow_id : `str`
        The HTCondor id of the DAGMan job for the run.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Mapping of HTCondor job id to HTCondor job information for the run.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report.  The key is the given HTCondor
        id and the value is a collection of report information for that run.
    """
    _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
    dag_job = jobs[wms_workflow_id]
    if "total_jobs" not in dag_job or "DAGNodeName" in dag_job:
        _LOG.error("Job ID %s is not a DAG job.", wms_workflow_id)
        return {}
    report = WmsRunReport(wms_id=wms_workflow_id,
                          path=dag_job["Iwd"],
                          label=dag_job.get("bps_job_label", "MISS"),
                          run=dag_job.get("bps_run", "MISS"),
                          project=dag_job.get("bps_project", "MISS"),
                          campaign=dag_job.get("bps_campaign", "MISS"),
                          payload=dag_job.get("bps_payload", "MISS"),
                          operator=_get_owner(dag_job),
                          run_summary=_get_run_summary(dag_job),
                          state=_htc_status_to_wms_state(dag_job),
                          jobs=[],
                          total_number_jobs=dag_job["total_jobs"],
                          job_state_counts=dag_job["state_counts"])

    try:
        for job in jobs.values():
            if job["ClusterId"] != int(float(wms_workflow_id)):
                job_report = WmsJobReport(wms_id=job["ClusterId"],
                                          name=job.get("DAGNodeName", str(job["ClusterId"])),
                                          label=job.get("bps_job_label",
                                                        pegasus_name_to_label(job["DAGNodeName"])),
                                          state=_htc_status_to_wms_state(job))
                if job_report.label == "init":
                    job_report.label = "pipetaskInit"
                report.jobs.append(job_report)
    except KeyError as ex:
        _LOG.error("Job missing key '%s': %s", str(ex), job)
        raise

    run_reports = {report.wms_id: report}
    _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
    return run_reports


def _summary_report(user, hist, pass_thru):
    """Gather run information to be used in generating summary reports.

    Parameters
    ----------
    user : `str`
        Run lookup restricted to given user.
    hist : `float`
        How many previous days to search for run information.
    pass_thru : `str`
        Advanced users can define the HTCondor constraint to be used
        when searching queue and history.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the summary report.  The keys are HTCondor ids and
        the values are collections of report information for each run.
    message : `str`
        Message to be printed with the summary report.
    """
    # only doing summary report so only look for dagman jobs
    if pass_thru:
        constraint = pass_thru
    else:
        # Notes:
        # * bps_isjob == 'True' isn't getting set for DAG jobs that are
        #   manually restarted.
        # * Any job with DAGManJobID isn't a DAG job
        constraint = 'bps_isjob == "True" && JobUniverse == 7'
        if user:
            constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'

    # Check runs in queue.
    jobs = condor_q(constraint)

    if hist:
        epoch = (datetime.now() - timedelta(days=hist)).timestamp()
        constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
        hist_jobs = condor_history(constraint)
        _update_jobs(jobs, hist_jobs)

    _LOG.debug("Job ids from queue and history %s", jobs.keys())

    # Have list of DAGMan jobs, need to get run_report info.
    run_reports = {}
    for job in jobs.values():
        total_jobs, state_counts = _get_state_counts_from_dag_job(job)
        # If didn't get from queue information (e.g., Kerberos bug),
        # try reading from file.
        if total_jobs == 0:
            try:
                job.update(read_dag_status(job["Iwd"]))
                total_jobs, state_counts = _get_state_counts_from_dag_job(job)
            except StopIteration:
                pass  # don't kill the report if the HTCondor files can't be found

        if "bps_run" not in job:
            _add_run_info(job["Iwd"], job)
        report = WmsRunReport(wms_id=str(job.get("ClusterId", MISSING_ID)),
                              path=job["Iwd"],
                              label=job.get("bps_job_label", "MISS"),
                              run=job.get("bps_run", "MISS"),
                              project=job.get("bps_project", "MISS"),
                              campaign=job.get("bps_campaign", "MISS"),
                              payload=job.get("bps_payload", "MISS"),
                              operator=_get_owner(job),
                              run_summary=_get_run_summary(job),
                              state=_htc_status_to_wms_state(job),
                              jobs=[],
                              total_number_jobs=total_jobs,
                              job_state_counts=state_counts)

        run_reports[report.wms_id] = report

    return run_reports, ""


def _add_run_info(wms_path, job):
    """Find BPS run information elsewhere for runs without bps attributes.

    Parameters
    ----------
    wms_path : `str`
        Path to submit files for the run.
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Raises
    ------
    StopIteration
        If it cannot find the file it is looking for.  Permission errors are
        caught and the job's run is marked with an error.
    """
    path = Path(wms_path) / "jobs"
    try:
        subfile = next(path.glob("**/*.sub"))
    except (StopIteration, PermissionError):
        job["bps_run"] = "Unavailable"
    else:
        _LOG.debug("_add_run_info: subfile = %s", subfile)
        try:
            with open(subfile, "r") as fh:
                for line in fh:
                    if line.startswith("+bps_"):
                        m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
                        if m:
                            _LOG.debug("Matching line: %s", line)
                            job[m.group(1)] = m.group(2).replace('"', "")
                        else:
                            _LOG.debug("Could not parse attribute: %s", line)
        except PermissionError:
            job["bps_run"] = "PermissionError"
    _LOG.debug("After adding job = %s", job)


def _get_owner(job):
    """Get the owner of a dag job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    owner : `str`
        Owner of the dag job.
    """
    owner = job.get("bps_operator", None)
    if not owner:
        owner = job.get("Owner", None)
        if not owner:
            _LOG.warning("Could not get Owner from htcondor job: %s", job)
            owner = "MISS"
    return owner


def _get_run_summary(job):
    """Get the run summary for a job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    summary : `str`
        Number of jobs per PipelineTask label in approximate pipeline order.
        Format: <label>:<count>[;<label>:<count>]+
    """
    summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
    if not summary:
        summary, _ = summary_from_dag(job["Iwd"])
        if not summary:
            _LOG.warning("Could not get run summary for htcondor job: %s", job)
    _LOG.debug("_get_run_summary: summary=%s", summary)

    # Workaround sometimes using init vs pipetaskInit
    summary = summary.replace("init:", "pipetaskInit:")

    if "pegasus_version" in job and "pegasus" not in summary:
        summary += ";pegasus:0"

    return summary
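
# A typical summary string (task labels here are only illustrative) looks like
# "pipetaskInit:1;isr:30;characterizeImage:30;calibrate:30", i.e. the
# <label>:<count> pairs described in the docstring joined by semicolons.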


def _get_state_counts_from_jobs(wms_workflow_id, jobs):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    wms_workflow_id : `str`
        HTCondor job id.
    jobs : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    state_counts = dict.fromkeys(WmsStates, 0)

    for jid, jinfo in jobs.items():
        if jid != wms_workflow_id:
            state_counts[_htc_status_to_wms_state(jinfo)] += 1

    total_counted = sum(state_counts.values())
    if "NodesTotal" in jobs[wms_workflow_id]:
        total_count = jobs[wms_workflow_id]["NodesTotal"]
    else:
        total_count = total_counted
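
    # Any difference between the DAG's reported node total and the jobs
    # counted above corresponds to nodes with no job information yet, so they
    # are counted as UNREADY.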
    state_counts[WmsStates.UNREADY] += total_count - total_counted

    return total_count, state_counts


def _get_state_counts_from_dag_job(job):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
    state_counts = dict.fromkeys(WmsStates, 0)
    if "DAG_NodesReady" in job:
        state_counts = {
            WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
            WmsStates.READY: job.get("DAG_NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
            WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
            WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)}
        total_jobs = job.get("DAG_NodesTotal")
        _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
    elif "NodesFailed" in job:
        state_counts = {
            WmsStates.UNREADY: job.get("NodesUnready", 0),
            WmsStates.READY: job.get("NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("NodesDone", 0),
            WmsStates.FAILED: job.get("NodesFailed", 0),
            WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)}
        try:
            total_jobs = job["NodesTotal"]
        except KeyError as ex:
            _LOG.error("Job missing %s. job = %s", str(ex), job)
            raise
        _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
    else:
        # With Kerberos job auth and the Kerberos bug, a warning here would be
        # emitted for every DAG, so log at debug level instead.
        _LOG.debug("Can't get job state counts %s", job["Iwd"])
        total_jobs = 0

    _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
    return total_jobs, state_counts


def _htc_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `WmsStates`
        The equivalent WmsState to given job's status.
    """
    wms_state = WmsStates.MISFIT
    if "JobStatus" in job:
        wms_state = _htc_job_status_to_wms_state(job)
    elif "NodeStatus" in job:
        wms_state = _htc_node_status_to_wms_state(job)
    return wms_state


def _htc_job_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given job's status.
    """
    _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"],
               type(job["JobStatus"]))
    job_status = int(job["JobStatus"])
    wms_state = WmsStates.MISFIT

    _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
    if job_status == JobStatus.IDLE:
        wms_state = WmsStates.PENDING
    elif job_status == JobStatus.RUNNING:
        wms_state = WmsStates.RUNNING
    elif job_status == JobStatus.REMOVED:
        wms_state = WmsStates.DELETED
    elif job_status == JobStatus.COMPLETED:
        if job.get("ExitBySignal", False) or job.get("ExitCode", 0) or \
                job.get("ExitSignal", 0) or job.get("DAG_Status", 0) or \
                job.get("ReturnValue", 0):
            wms_state = WmsStates.FAILED
        else:
            wms_state = WmsStates.SUCCEEDED
    elif job_status == JobStatus.HELD:
        wms_state = WmsStates.HELD

    return wms_state


def _htc_node_status_to_wms_state(job):
    """Convert HTCondor node status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given node's status.
    """
    wms_state = WmsStates.MISFIT

    status = job["NodeStatus"]
    if status == NodeStatus.NOT_READY:
        wms_state = WmsStates.UNREADY
    elif status == NodeStatus.READY:
        wms_state = WmsStates.READY
    elif status == NodeStatus.PRERUN:
        wms_state = WmsStates.MISFIT
    elif status == NodeStatus.SUBMITTED:
        if job["JobProcsHeld"]:
            wms_state = WmsStates.HELD
        elif job["StatusDetails"] == "not_idle":
            wms_state = WmsStates.RUNNING
        elif job["JobProcsQueued"]:
            wms_state = WmsStates.PENDING
    elif status == NodeStatus.POSTRUN:
        wms_state = WmsStates.MISFIT
    elif status == NodeStatus.DONE:
        wms_state = WmsStates.SUCCEEDED
    elif status == NodeStatus.ERROR:
        # Use the job's exit status instead of the post script's exit status.
        if "DAGMAN error 0" in job["StatusDetails"]:
            wms_state = WmsStates.SUCCEEDED
        else:
            wms_state = WmsStates.FAILED

    return wms_state


def _update_jobs(jobs1, jobs2):
    """Update jobs1 with info in jobs2.

    (Basically an update for nested dictionaries.)

    Parameters
    ----------
    jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
        HTCondor job information to be updated.
    jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
        Additional HTCondor job information.
    """
    for jid, jinfo in jobs2.items():
        if jid in jobs1:
            jobs1[jid].update(jinfo)
        else:
            jobs1[jid] = jinfo
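
# For example, given jobs1 = {"1.0": {"JobStatus": 2}} and
# jobs2 = {"1.0": {"Iwd": "/submit/run1"}, "2.0": {"JobStatus": 1}},
# _update_jobs(jobs1, jobs2) leaves jobs1 as
# {"1.0": {"JobStatus": 2, "Iwd": "/submit/run1"}, "2.0": {"JobStatus": 1}}.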


def _wms_id_to_cluster(wms_id):
    """Convert WMS ID to cluster ID.

    Parameters
    ----------
    wms_id : `int` or `float` or `str`
        HTCondor job id or path.

    Returns
    -------
    cluster_id : `int`
        HTCondor cluster id.
    """
    # If wms_id represents path, get numeric id.
    try:
        cluster_id = int(float(wms_id))
    except ValueError:
        wms_path = Path(wms_id)
        if wms_path.exists():
            try:
                cluster_id, _ = read_dag_log(wms_id)
                cluster_id = int(float(cluster_id))
            except StopIteration:
                cluster_id = 0
        else:
            cluster_id = 0
    return cluster_id
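
# For example, _wms_id_to_cluster("1234.0") returns 1234.  If wms_id is a path
# to an existing submit directory, the cluster id is read from the DAGMan log
# via read_dag_log(); anything that is neither a number nor a readable submit
# directory yields 0.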


def _create_request_memory_expr(memory, multiplier):
    """Construct an HTCondor ClassAd expression for safe memory scaling.

    Parameters
    ----------
    memory : `int`
        Requested memory in MB.
    multiplier : `float`
        Memory growth rate between retries.

    Returns
    -------
    ad : `str`
        A string representing an HTCondor ClassAd expression enabling safe
        memory scaling between job retries.
    """
    was_mem_exceeded = "LastJobStatus =?= 5 " \
                       "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " \
                       "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"

    # If the job runs for the first time ('MemoryUsage' is not defined), set
    # the required memory to the given value.
    ad = f"ifThenElse({was_mem_exceeded}, " \
         f"ifThenElse(isUndefined(MemoryUsage), {memory}, int({multiplier} * MemoryUsage)), " \
         f"ifThenElse(isUndefined(MemoryUsage), {memory}, max({memory}, MemoryUsage)))"
    return ad
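
# As an illustration, for memory=2048 and multiplier=2.0 the expression above
# requests 2048 MB on the first run, int(2.0 * MemoryUsage) after a hold that
# looks memory related (LastHoldReasonCode 34, or 3 with subcode 34), and
# max(2048, MemoryUsage) otherwise, so a retry never asks for less memory than
# the job has already been observed to use.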