# This file is part of ctrl_bps.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

"""Interface between the generic workflow and the HTCondor workflow system.
"""
__all__ = ["HTCondorService", "HTCondorWorkflow"]


import dataclasses
import os
import re
import logging
from datetime import datetime, timedelta
from pathlib import Path

import htcondor

from ... import (
    BaseWmsWorkflow,
    BaseWmsService,
    GenericWorkflow,
    GenericWorkflowJob,
    WmsRunReport,
    WmsJobReport,
    WmsStates
)
from ...bps_utils import (
    chdir,
    create_count_summary
)
from .lssthtc import (
    HTCDag,
    HTCJob,
    MISSING_ID,
    JobStatus,
    NodeStatus,
    htc_check_dagman_output,
    htc_escape,
    htc_submit_dag,
    read_dag_log,
    read_dag_status,
    read_node_status,
    condor_history,
    condor_q,
    condor_status,
    pegasus_name_to_label,
    summary_from_dag,
)


DEFAULT_HTC_EXEC_PATT = ".*worker.*"
"""Default pattern for searching execute machines in an HTCondor pool.
"""

_LOG = logging.getLogger(__name__)


class HTCondorService(BaseWmsService):
    """HTCondor version of WMS service.
    """
    def prepare(self, config, generic_workflow, out_prefix=None):
        """Convert generic workflow to an HTCondor DAG ready for submission.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The generic workflow (e.g., has executable name and arguments).
        out_prefix : `str`
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow`
            HTCondor workflow ready to be run.
        """
        _LOG.debug("out_prefix = '%s'", out_prefix)
        workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix,
                                                          f"{self.__class__.__module__}."
                                                          f"{self.__class__.__name__}")
        workflow.write(out_prefix)
        return workflow

    def submit(self, workflow):
        """Submit a single HTCondor workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWorkflow`
            A single HTCondor workflow to submit. run_id is updated after
            successful submission to WMS.
        """
        # For workflow portability, internal paths are all relative. Hence
        # the DAG needs to be submitted to HTCondor from inside the submit
        # directory.
        with chdir(workflow.submit_path):
            _LOG.info("Submitting from directory: %s", os.getcwd())
            htc_submit_dag(workflow.dag, {})
            workflow.run_id = workflow.dag.run_id

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions. Typically
            this means top-level jobs (i.e., not children jobs).
        """
        _LOG.debug("list_submitted_jobs params: wms_id=%s, user=%s, require_bps=%s, pass_thru=%s",
                   wms_id, user, require_bps, pass_thru)
        constraint = ""

        if wms_id is None:
            if user is not None:
                constraint = f'(Owner == "{user}")'
        else:
            cluster_id = _wms_id_to_cluster(wms_id)
            if cluster_id != 0:
                constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"
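
        # bps_isjob is the job ClassAd attribute used to identify
        # bps-submitted jobs, so requiring it filters out other jobs.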
        if require_bps:
            constraint += ' && (bps_isjob == "True")'

        if pass_thru:
            if "-forcex" in pass_thru:
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f"&& ({pass_thru_2})"
            else:
                constraint += f" && ({pass_thru})"

        _LOG.debug("constraint = %s", constraint)
        jobs = condor_q(constraint)

        # Prune child jobs where DAG job is in queue (i.e., aren't orphans).
        job_ids = []
        for job_id, job_info in jobs.items():
            _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_info.get("DAGManJobId", "None"))
            if "DAGManJobId" not in job_info:  # orphaned job
                job_ids.append(job_id)
            else:
                _LOG.debug("Looking for %s", f"{job_info['DAGManJobId']}.0")
                _LOG.debug("\tin jobs.keys() = %s", jobs.keys())
                if f"{job_info['DAGManJobId']}.0" not in jobs:
                    job_ids.append(job_id)

        _LOG.debug("job_ids = %s", job_ids)
        return job_ids

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None):
        """Return run information based upon given constraints.

        Parameters
        ----------
        wms_workflow_id : `str`
            Limit to specific run based on id.
        user : `str`
            Limit results to runs for this user.
        hist : `float`
            Limit history search to this many days.
        pass_thru : `str`
            Constraints to pass through to HTCondor.

        Returns
        -------
        runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
            Information about runs from given job information.
        message : `str`
            Extra message for report command to print. This could be pointers
            to documentation or to WMS specific commands.
        """
        message = ""

        if wms_workflow_id:
            # Explicitly checking if wms_workflow_id can be converted to a
            # float instead of using try/except to avoid catching a different
            # ValueError from _report_from_id
            try:
                float(wms_workflow_id)
                is_float = True
            except ValueError:  # Don't need TypeError here as None goes to else branch.
                is_float = False

            if is_float:
                run_reports, message = _report_from_id(float(wms_workflow_id), hist)
            else:
                run_reports, message = _report_from_path(wms_workflow_id)
        else:
            run_reports, message = _summary_report(user, hist, pass_thru)
        _LOG.debug("report: %s, %s", run_reports, message)

        return list(run_reports.values()), message

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            ID or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether the deletion was successful. Currently returns False if
            there is any doubt or if any individual job was not deleted.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        _LOG.debug("Canceling wms_id = %s", wms_id)

        cluster_id = _wms_id_to_cluster(wms_id)
        if cluster_id == 0:
            deleted = False
            message = "Invalid id"
        else:
            _LOG.debug("Canceling cluster_id = %s", cluster_id)
            schedd = htcondor.Schedd()
            constraint = f"ClusterId == {cluster_id}"
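            # The "-forcex" pass-thru option switches to JobAction.RemoveX,
            # which forces removal of jobs already in the removed ("X") state
            # instead of waiting for a normal removal to finish.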
            if pass_thru is not None and "-forcex" in pass_thru:
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f"&& ({pass_thru_2})"
                _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.RemoveX, constraint)
            else:
                if pass_thru:
                    constraint += f"&& ({pass_thru})"
                _LOG.debug("JobAction.Remove constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.Remove, constraint)
            _LOG.debug("Remove results: %s", results)

            if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
                deleted = True
                message = ""
            else:
                deleted = False
                if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
                    message = "no such bps job in batch queue"
                else:
                    message = f"unknown problems deleting: {results}"

        _LOG.debug("deleted: %s; message = %s", deleted, message)
        return deleted, message


class HTCondorWorkflow(BaseWmsWorkflow):
    """Single HTCondor workflow.

    Parameters
    ----------
    name : `str`
        Unique name for Workflow used when naming files.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """
    def __init__(self, name, config=None):
        super().__init__(name, config)
        self.dag = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited
        htc_workflow = cls(generic_workflow.name, config)
        htc_workflow.dag = HTCDag(name=generic_workflow.name)

        _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs({"bps_wms_service": service_class,
                                      "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
                                      "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
                                      "bps_job_summary": create_count_summary(generic_workflow.job_counts)})

        # Determine the hard limit for the memory requirement.
        found, limit = config.search('memoryLimit')
        if not found:
            search_opts = {"default": DEFAULT_HTC_EXEC_PATT}
            _, site = config.search("computeSite")
            if site:
                search_opts["curvals"] = {"curr_site": site}
            _, patt = config.search("executeMachinesPattern", opt=search_opts)

            # To reduce the amount of data, ignore dynamic slots (if any) as,
            # by definition, they cannot have more memory than
            # the partitionable slot they are part of.
            constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)'
            pool_info = condor_status(constraint=constraint)
            try:
                limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values())
            except ValueError:
                _LOG.debug("No execute machine in the pool matches %s", patt)
        if limit:
            config[".bps_defined.memory_limit"] = limit
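            # The stored limit is used later by _translate_job_cmds() when a
            # job enables memory autoscaling via memory_multiplier.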

        # Create all DAG jobs
        for job_name in generic_workflow:
            gwjob = generic_workflow.get_job(job_name)
            htc_job = HTCondorWorkflow._create_job(config, generic_workflow, gwjob, out_prefix)
            htc_workflow.dag.add_job(htc_job)

        # Add job dependencies to the DAG
        for job_name in generic_workflow:
            htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))

        # If final job exists in generic workflow, create DAG final job
        final = generic_workflow.get_final()
        if final and isinstance(final, GenericWorkflowJob):
            final_htjob = HTCondorWorkflow._create_job(config, generic_workflow, final, out_prefix)
            if "post" not in final_htjob.dagcmds:
                final_htjob.dagcmds["post"] = f"{os.path.dirname(__file__)}/final_post.sh" \
                                              f" {final.name} $DAG_STATUS $RETURN"
            htc_workflow.dag.add_final_job(final_htjob)
        elif final and isinstance(final, GenericWorkflow):
            raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
        elif final:
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

        return htc_workflow

    @staticmethod
    def _create_job(config, generic_workflow, gwjob, out_prefix):
        """Convert GenericWorkflow job nodes to DAG jobs.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow that is being converted.
        gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
            The generic job to convert to an HTCondor job.
        out_prefix : `str`
            Directory prefix for HTCondor files.

        Returns
        -------
        htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
            The HTCondor job equivalent to the given generic job.
        """
        htc_job = HTCJob(gwjob.name, label=gwjob.label)

        curvals = dataclasses.asdict(gwjob)
        if gwjob.tags:
            curvals.update(gwjob.tags)
        found, subdir = config.search("subDirTemplate", opt={'curvals': curvals})
        if not found:
            subdir = "jobs"
        htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"

        htc_job_cmds = {
            "universe": "vanilla",
            "should_transfer_files": "YES",
            "when_to_transfer_output": "ON_EXIT_OR_EVICT",
            "transfer_output_files": '""',  # Set to empty string to disable
            "transfer_executable": "False",
            "getenv": "True",

            # Exceeding memory sometimes triggers a SIGBUS error. Tell HTCondor
            # to put SIGBUS jobs on hold.
            "on_exit_hold": "(ExitBySignal == true) && (ExitSignal == 7)",
            "on_exit_hold_reason": '"Job raised a signal 7. Usually means job has gone over memory limit."',
            "on_exit_hold_subcode": "34"
        }

        htc_job_cmds.update(_translate_job_cmds(config, generic_workflow, gwjob))

        # job stdout, stderr, htcondor user log.
        for key in ("output", "error", "log"):
            htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
            _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])

        _, use_shared = config.search("bpsUseShared", opt={"default": False})
        htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwjob.name, use_shared, out_prefix))

        # Add the job cmds dict to the job object.
        htc_job.add_job_cmds(htc_job_cmds)

        htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))

        # Add job attributes to job.
        _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
        htc_job.add_job_attrs(gwjob.attrs)
        htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
        htc_job.add_job_attrs({"bps_job_name": gwjob.name,
                               "bps_job_label": gwjob.label})

        return htc_job

    def write(self, out_prefix):
        """Output HTCondor DAGMan files needed for workflow submission.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for HTCondor files.
        """
        self.submit_path = out_prefix
        os.makedirs(out_prefix, exist_ok=True)

        # Write down the workflow in HTCondor format.
        self.dag.write(out_prefix, "jobs/{self.label}")


def _translate_job_cmds(config, generic_workflow, gwjob):
    """Translate the job data that have a one-to-one mapping to HTCondor
    submit commands.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains the job being converted.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Generic workflow job to be converted.

    Returns
    -------
    htc_job_commands : `dict` [`str`, `Any`]
        Contains commands which can appear in the HTCondor submit description
        file.
    """
    # Values in the job script that are just name mappings.
    job_translation = {"mail_to": "notify_user",
                       "when_to_mail": "notification",
                       "request_cpus": "request_cpus",
                       "priority": "priority",
                       "category": "category"}

    jobcmds = {}
    for gwkey, htckey in job_translation.items():
        jobcmds[htckey] = getattr(gwjob, gwkey, None)

    # Job commands that need modification.
    if gwjob.number_of_retries:
        jobcmds["max_retries"] = f"{gwjob.number_of_retries}"

    if gwjob.retry_unless_exit:
        jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"

    if gwjob.request_disk:
        jobcmds["request_disk"] = f"{gwjob.request_disk}MB"

    if gwjob.request_memory:
        jobcmds["request_memory"] = f"{gwjob.request_memory}"

    if gwjob.memory_multiplier:
        # Do not use try-except! At the moment, BpsConfig returns an empty
        # string if it does not contain the key.
        memory_limit = config[".bps_defined.memory_limit"]
        if not memory_limit:
            raise RuntimeError("Memory autoscaling enabled, but automatic detection of the memory limit "
                               "failed; setting it explicitly with 'memoryLimit' or changing worker node "
                               "search pattern 'executeMachinesPattern' might help.")
        jobcmds["request_memory"] = _create_request_memory_expr(gwjob.request_memory, gwjob.memory_multiplier)

        # Periodically release jobs which are being held due to exceeding
        # memory. Stop doing that (by removing the job from the HTCondor queue)
        # after the maximal number of retries has been reached or the memory
        # requirements cannot be satisfied.
        jobcmds["periodic_release"] = \
            "NumJobStarts <= JobMaxRetries && (HoldReasonCode == 34 || HoldReasonSubCode == 34)"
        jobcmds["periodic_remove"] = \
            f"JobStatus == 1 && RequestMemory > {memory_limit} || " \
            f"JobStatus == 5 && NumJobStarts > JobMaxRetries"

    # Assume concurrency_limit implemented using HTCondor concurrency limits.
    # May need to move to special site-specific implementation if sites use
    # other mechanisms.
    if gwjob.concurrency_limit:
        jobcmds["concurrency_limit"] = gwjob.concurrency_limit

    # Handle command line
    if gwjob.executable.transfer_executable:
        jobcmds["transfer_executable"] = "True"
        jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
    else:
        jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)

    if gwjob.arguments:
        arguments = gwjob.arguments
        arguments = _replace_cmd_vars(arguments, gwjob)
        arguments = _replace_file_vars(config, arguments, generic_workflow, gwjob)
        arguments = _fix_env_var_syntax(arguments)
        jobcmds["arguments"] = arguments

    # Add extra "pass-thru" job commands
    if gwjob.profile:
        for key, val in gwjob.profile.items():
            jobcmds[key] = htc_escape(val)

    return jobcmds


def _translate_dag_cmds(gwjob):
    """Translate job values into DAGMan commands.

    Parameters
    ----------
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job containing values to be translated.

    Returns
    -------
    dagcmds : `dict` [`str`, `Any`]
        DAGMan commands for the job.
    """
    # Values in the dag script that are just name mappings.
    dag_translation = {"abort_on_value": "abort_dag_on",
                       "abort_return_value": "abort_exit"}

    dagcmds = {}
    for gwkey, htckey in dag_translation.items():
        dagcmds[htckey] = getattr(gwjob, gwkey, None)

    # Still to be coded: vars "pre_cmdline", "post_cmdline"
    return dagcmds


def _fix_env_var_syntax(oldstr):
    """Change ENV placeholders to HTCondor environment variable syntax.

    Parameters
    ----------
    oldstr : `str`
        String in which environment variable syntax is to be fixed.

    Returns
    -------
    newstr : `str`
        Given string with environment variable syntax fixed.
    """
    newstr = oldstr
    for key in re.findall(r"<ENV:([^>]+)>", oldstr):
        newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
    return newstr


def _replace_file_vars(config, arguments, workflow, gwjob):
    """Replace file placeholders in command line arguments with correct
    physical file names.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information.
    arguments : `str`
        Arguments string in which to replace file placeholders.
    workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains file information.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        The job corresponding to the arguments.

    Returns
    -------
    arguments : `str`
        Given arguments string with file placeholders replaced.
    """
    _, use_shared = config.search("bpsUseShared", opt={"default": False})

    # Replace input file placeholders with paths.
    for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
        if not gwfile.wms_transfer:
            # Must assume full URI if in command line and told WMS is not
            # responsible for transferring file.
            uri = gwfile.src_uri
        elif use_shared:
            if gwfile.job_shared:
                # Have shared filesystems and jobs can share file.
                uri = gwfile.src_uri
            else:
                # Taking advantage of inside knowledge. Not future-proof.
                # Temporary fix until have job wrapper that pulls files
                # within job.
                if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml":
                    uri = "butler.yaml"
                else:
                    uri = os.path.basename(gwfile.src_uri)
        else:  # Using push transfer
            uri = os.path.basename(gwfile.src_uri)
        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)

    # Replace output file placeholders with paths.
    for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
        if not gwfile.wms_transfer:
            # Must assume full URI if in command line and told WMS is not
            # responsible for transferring file.
            uri = gwfile.src_uri
        elif use_shared:
            if gwfile.job_shared:
                # Have shared filesystems and jobs can share file.
                uri = gwfile.src_uri
            else:
                uri = os.path.basename(gwfile.src_uri)
        else:  # Using push transfer
            uri = os.path.basename(gwfile.src_uri)
        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
    return arguments


def _replace_cmd_vars(arguments, gwjob):
    """Replace format-style placeholders in arguments.

    Parameters
    ----------
    arguments : `str`
        Arguments string in which to replace placeholders.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job containing values to be used to replace placeholders
        (in particular gwjob.cmdvals).

    Returns
    -------
    arguments : `str`
        Given arguments string with placeholders replaced.
    """
    try:
        arguments = arguments.format(**gwjob.cmdvals)
    except (KeyError, TypeError):  # TypeError in case None instead of {}
        _LOG.error("Could not replace command variables:\n"
                   "arguments: %s\n"
                   "cmdvals: %s", arguments, gwjob.cmdvals)
        raise
    return arguments


def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str):
    """Add job input files from generic workflow to job.

    Parameters
    ----------
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        The generic workflow (e.g., has executable name and arguments).
    job_name : `str`
        Unique name for the job.
    use_shared : `bool`
        Whether job has access to files via shared filesystem.
    out_prefix : `str`
        The root directory into which all WMS-specific files are written.

    Returns
    -------
    htc_commands : `dict` [`str`, `str`]
        HTCondor commands for the job submission script.
    """
    htc_commands = {}
    inputs = []
    for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
        _LOG.debug("src_uri=%s", gwf_file.src_uri)

        uri = Path(gwf_file.src_uri)

        # Note if use_shared and job_shared, don't need to transfer file.

        if not use_shared:  # Copy file using push to job
            inputs.append(str(uri.relative_to(out_prefix)))
        elif not gwf_file.job_shared:  # Jobs require own copy

            # If using shared filesystem, but still need copy in job. Use
            # HTCondor's curl plugin for a local copy.

            # Execution butler is represented as a directory which the
            # curl plugin does not handle. Taking advantage of inside
            # knowledge for temporary fix until have job wrapper that pulls
            # files within job.
            if gwf_file.name == "butlerConfig":
                # The execution butler directory doesn't normally exist until
                # the submit phase so checking for suffix instead of using
                # is_dir(). If other non-YAML files exist they would have a
                # different gwf_file.name.
                if uri.suffix == ".yaml":  # Single file, so just copy.
                    inputs.append(f"file://{uri}")
                else:
                    inputs.append(f"file://{uri / 'butler.yaml'}")
                    inputs.append(f"file://{uri / 'gen3.sqlite3'}")
            elif uri.is_dir():
                raise RuntimeError(f"HTCondor plugin cannot transfer directories locally within job "
                                   f"({gwf_file.src_uri})")
            else:
                inputs.append(f"file://{uri}")

    if inputs:
        htc_commands["transfer_input_files"] = ",".join(inputs)
        _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
    return htc_commands


def _report_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        The directory containing the submit side files (e.g., HTCondor files).

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
    if wms_workflow_id == MISSING_ID:
        run_reports = {}
    else:
        run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
    return run_reports, message


def _report_from_id(wms_workflow_id, hist):
    """Gather run information for a given run id.

    Parameters
    ----------
    wms_workflow_id : `int` or `str`
        Limit to specific run based on id.
    hist : `float`
        Limit history search to this many days.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    constraint = f"(DAGManJobId == {int(float(wms_workflow_id))} || ClusterId == " \
                 f"{int(float(wms_workflow_id))})"
    jobs = condor_q(constraint)
    if hist:
        epoch = (datetime.now() - timedelta(days=hist)).timestamp()
        constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
        hist_jobs = condor_history(constraint)
        _update_jobs(jobs, hist_jobs)

    # Keys in dictionary will be strings of format "ClusterId.ProcId".
    wms_workflow_id = str(wms_workflow_id)
    if not wms_workflow_id.endswith(".0"):
        wms_workflow_id += ".0"

    if wms_workflow_id in jobs:
        _, path_jobs, message = _get_info_from_path(jobs[wms_workflow_id]["Iwd"])
        _update_jobs(jobs, path_jobs)
        run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
    else:
        run_reports = {}
        message = f"Found 0 records for run id {wms_workflow_id}"
    return run_reports, message


def _get_info_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        Directory containing HTCondor files.

    Returns
    -------
    wms_workflow_id : `str`
        The run id which is a DAGman job id.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Information about jobs read from files in the given directory.
        The key is the HTCondor id and the value is a dictionary of HTCondor
        keys and values.
    message : `str`
        Message to be printed with the summary report.
    """
    try:
        wms_workflow_id, jobs = read_dag_log(wms_path)
        _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
        _update_jobs(jobs, read_node_status(wms_path))
        _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)

        # Add more info for DAGman job
        job = jobs[wms_workflow_id]
        job.update(read_dag_status(wms_path))
        job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
        if "bps_run" not in job:
            _add_run_info(wms_path, job)

        message = htc_check_dagman_output(wms_path)
        _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id,
                   jobs[wms_workflow_id]["total_jobs"])
    except StopIteration:
        message = f"Could not find HTCondor files in {wms_path}"
        _LOG.warning(message)
        wms_workflow_id = MISSING_ID
        jobs = {}

    return wms_workflow_id, jobs, message


def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
    """Gather run information to be used in generating a detailed report.

    Parameters
    ----------
    wms_workflow_id : `str`
        HTCondor id of the DAGMan job for the run.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Mapping of HTCondor id to job information for the run.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the given HTCondor
        id and the value is a collection of report information for that run.
    """
    _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
    dag_job = jobs[wms_workflow_id]
    if "total_jobs" not in dag_job or "DAGNodeName" in dag_job:
        _LOG.error("Job ID %s is not a DAG job.", wms_workflow_id)
        return {}
    report = WmsRunReport(wms_id=wms_workflow_id,
                          path=dag_job["Iwd"],
                          label=dag_job.get("bps_job_label", "MISS"),
                          run=dag_job.get("bps_run", "MISS"),
                          project=dag_job.get("bps_project", "MISS"),
                          campaign=dag_job.get("bps_campaign", "MISS"),
                          payload=dag_job.get("bps_payload", "MISS"),
                          operator=_get_owner(dag_job),
                          run_summary=_get_run_summary(dag_job),
                          state=_htc_status_to_wms_state(dag_job),
                          jobs=[],
                          total_number_jobs=dag_job["total_jobs"],
                          job_state_counts=dag_job["state_counts"])

    try:
        for job in jobs.values():
            if job["ClusterId"] != int(float(wms_workflow_id)):
                job_report = WmsJobReport(wms_id=job["ClusterId"],
                                          name=job.get("DAGNodeName", str(job["ClusterId"])),
                                          label=job.get("bps_job_label",
                                                        pegasus_name_to_label(job["DAGNodeName"])),
                                          state=_htc_status_to_wms_state(job))
                if job_report.label == "init":
                    job_report.label = "pipetaskInit"
                report.jobs.append(job_report)
    except KeyError as ex:
        _LOG.error("Job missing key '%s': %s", str(ex), job)
        raise

    run_reports = {report.wms_id: report}
    _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
    return run_reports


def _summary_report(user, hist, pass_thru):
    """Gather run information to be used in generating summary reports.

    Parameters
    ----------
    user : `str`
        Run lookup restricted to given user.
    hist : `float`
        How many previous days to search for run information.
    pass_thru : `str`
        Advanced users can define the HTCondor constraint to be used
        when searching queue and history.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the summary report. The keys are HTCondor ids and
        the values are collections of report information for each run.
    message : `str`
        Message to be printed with the summary report.
    """
    # only doing summary report so only look for dagman jobs
    if pass_thru:
        constraint = pass_thru
    else:
        # Notes:
        # * bps_isjob == 'True' isn't getting set for DAG jobs that are
        #   manually restarted.
        # * Any job with DAGManJobID isn't a DAG job
        constraint = 'bps_isjob == "True" && JobUniverse == 7'
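        # JobUniverse == 7 is the scheduler universe, which is where DAGMan
        # jobs themselves run.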
        if user:
            constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'

    # Check runs in queue.
    jobs = condor_q(constraint)

    if hist:
        epoch = (datetime.now() - timedelta(days=hist)).timestamp()
        constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
        hist_jobs = condor_history(constraint)
        _update_jobs(jobs, hist_jobs)

    _LOG.debug("Job ids from queue and history %s", jobs.keys())

    # Have list of DAGMan jobs, need to get run_report info.
    run_reports = {}
    for job in jobs.values():
        total_jobs, state_counts = _get_state_counts_from_dag_job(job)
        # If didn't get from queue information (e.g., Kerberos bug),
        # try reading from file.
        if total_jobs == 0:
            try:
                job.update(read_dag_status(job["Iwd"]))
                total_jobs, state_counts = _get_state_counts_from_dag_job(job)
            except StopIteration:
                pass  # Don't fail the whole report if HTCondor files cannot be found.

        if "bps_run" not in job:
            _add_run_info(job["Iwd"], job)
        report = WmsRunReport(wms_id=str(job.get("ClusterId", MISSING_ID)),
                              path=job["Iwd"],
                              label=job.get("bps_job_label", "MISS"),
                              run=job.get("bps_run", "MISS"),
                              project=job.get("bps_project", "MISS"),
                              campaign=job.get("bps_campaign", "MISS"),
                              payload=job.get("bps_payload", "MISS"),
                              operator=_get_owner(job),
                              run_summary=_get_run_summary(job),
                              state=_htc_status_to_wms_state(job),
                              jobs=[],
                              total_number_jobs=total_jobs,
                              job_state_counts=state_counts)

        run_reports[report.wms_id] = report

    return run_reports, ""


def _add_run_info(wms_path, job):
    """Find BPS run information elsewhere for runs without bps attributes.

    Parameters
    ----------
    wms_path : `str`
        Path to submit files for the run.
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Raises
    ------
    StopIteration
        If the file it is looking for cannot be found. Permission errors are
        caught and the job's run is marked with an error.
    """
    path = Path(wms_path) / "jobs"
    try:
        subfile = next(path.glob("**/*.sub"))
    except (StopIteration, PermissionError):
        job["bps_run"] = "Unavailable"
    else:
        _LOG.debug("_add_run_info: subfile = %s", subfile)
        try:
            with open(subfile, "r") as fh:
                for line in fh:
                    if line.startswith("+bps_"):
                        m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
                        if m:
                            _LOG.debug("Matching line: %s", line)
                            job[m.group(1)] = m.group(2).replace('"', "")
                        else:
                            _LOG.debug("Could not parse attribute: %s", line)
        except PermissionError:
            job["bps_run"] = "PermissionError"
    _LOG.debug("After adding job = %s", job)


def _get_owner(job):
    """Get the owner of a dag job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    owner : `str`
        Owner of the dag job.
    """
    owner = job.get("bps_operator", None)
    if not owner:
        owner = job.get("Owner", None)
        if not owner:
            _LOG.warning("Could not get Owner from htcondor job: %s", job)
            owner = "MISS"
    return owner


def _get_run_summary(job):
    """Get the run summary for a job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    summary : `str`
        Number of jobs per PipelineTask label in approximate pipeline order.
        Format: <label>:<count>[;<label>:<count>]+
    """
    summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
    if not summary:
        summary, _ = summary_from_dag(job["Iwd"])
        if not summary:
            _LOG.warning("Could not get run summary for htcondor job: %s", job)
    _LOG.debug("_get_run_summary: summary=%s", summary)

    # Workaround sometimes using init vs pipetaskInit
    summary = summary.replace("init:", "pipetaskInit:")

    if "pegasus_version" in job and "pegasus" not in summary:
        summary += ";pegasus:0"

    return summary


def _get_state_counts_from_jobs(wms_workflow_id, jobs):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    wms_workflow_id : `str`
        HTCondor job id.
    jobs : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    state_counts = dict.fromkeys(WmsStates, 0)

    for jid, jinfo in jobs.items():
        if jid != wms_workflow_id:
            state_counts[_htc_status_to_wms_state(jinfo)] += 1

    total_counted = sum(state_counts.values())
    if "NodesTotal" in jobs[wms_workflow_id]:
        total_count = jobs[wms_workflow_id]["NodesTotal"]
    else:
        total_count = total_counted
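
    # Nodes not represented in the gathered job information (only possible
    # when NodesTotal reports more nodes than were counted above) are
    # treated as UNREADY.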
    state_counts[WmsStates.UNREADY] += total_count - total_counted

    return total_count, state_counts


def _get_state_counts_from_dag_job(job):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
    state_counts = dict.fromkeys(WmsStates, 0)
    if "DAG_NodesReady" in job:
        state_counts = {
            WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
            WmsStates.READY: job.get("DAG_NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
            WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
            WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)}
        total_jobs = job.get("DAG_NodesTotal")
        _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
    elif "NodesFailed" in job:
        state_counts = {
            WmsStates.UNREADY: job.get("NodesUnready", 0),
            WmsStates.READY: job.get("NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("NodesDone", 0),
            WmsStates.FAILED: job.get("NodesFailed", 0),
            WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)}
        try:
            total_jobs = job.get("NodesTotal")
        except KeyError as ex:
            _LOG.error("Job missing %s. job = %s", str(ex), job)
            raise
        _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
    else:
        # With Kerberos job auth and the Kerberos bug, a warning here would be
        # printed for every DAG, so only log at debug level.
        _LOG.debug("Can't get job state counts %s", job["Iwd"])
        total_jobs = 0

    _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
    return total_jobs, state_counts


def _htc_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `WmsStates`
        The equivalent WmsState to given job's status.
    """
    wms_state = WmsStates.MISFIT
    if "JobStatus" in job:
        wms_state = _htc_job_status_to_wms_state(job)
    elif "NodeStatus" in job:
        wms_state = _htc_node_status_to_wms_state(job)
    return wms_state


def _htc_job_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given job's status.
    """
    _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"],
               type(job["JobStatus"]))
    job_status = int(job["JobStatus"])
    wms_state = WmsStates.MISFIT

    _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
    if job_status == JobStatus.IDLE:
        wms_state = WmsStates.PENDING
    elif job_status == JobStatus.RUNNING:
        wms_state = WmsStates.RUNNING
    elif job_status == JobStatus.REMOVED:
        wms_state = WmsStates.DELETED
    elif job_status == JobStatus.COMPLETED:
        if job.get("ExitBySignal", False) or job.get("ExitCode", 0) or \
                job.get("ExitSignal", 0) or job.get("DAG_Status", 0) or \
                job.get("ReturnValue", 0):
            wms_state = WmsStates.FAILED
        else:
            wms_state = WmsStates.SUCCEEDED
    elif job_status == JobStatus.HELD:
        wms_state = WmsStates.HELD

    return wms_state


def _htc_node_status_to_wms_state(job):
    """Convert HTCondor status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given node's status.
    """
    wms_state = WmsStates.MISFIT

    status = job["NodeStatus"]
    if status == NodeStatus.NOT_READY:
        wms_state = WmsStates.UNREADY
    elif status == NodeStatus.READY:
        wms_state = WmsStates.READY
    elif status == NodeStatus.PRERUN:
        wms_state = WmsStates.MISFIT
    elif status == NodeStatus.SUBMITTED:
        if job["JobProcsHeld"]:
            wms_state = WmsStates.HELD
        elif job["StatusDetails"] == "not_idle":
            wms_state = WmsStates.RUNNING
        elif job["JobProcsQueued"]:
            wms_state = WmsStates.PENDING
    elif status == NodeStatus.POSTRUN:
        wms_state = WmsStates.MISFIT
    elif status == NodeStatus.DONE:
        wms_state = WmsStates.SUCCEEDED
    elif status == NodeStatus.ERROR:
        # Use job exit status instead of post script exit status.
        if "DAGMAN error 0" in job["StatusDetails"]:
            wms_state = WmsStates.SUCCEEDED
        else:
            wms_state = WmsStates.FAILED

    return wms_state


def _update_jobs(jobs1, jobs2):
    """Update jobs1 with info in jobs2.

    (Basically an update for nested dictionaries.)

    Parameters
    ----------
    jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
        HTCondor job information to be updated.
    jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
        Additional HTCondor job information.
    """
    for jid, jinfo in jobs2.items():
        if jid in jobs1:
            jobs1[jid].update(jinfo)
        else:
            jobs1[jid] = jinfo


def _wms_id_to_cluster(wms_id):
    """Convert WMS ID to cluster ID.

    Parameters
    ----------
    wms_id : `int` or `float` or `str`
        HTCondor job id or path.

    Returns
    -------
    cluster_id : `int`
        HTCondor cluster id.
    """
    # If wms_id represents path, get numeric id.
    try:
        cluster_id = int(float(wms_id))
    except ValueError:
        wms_path = Path(wms_id)
        if wms_path.exists():
            try:
                cluster_id, _ = read_dag_log(wms_id)
                cluster_id = int(float(cluster_id))
            except StopIteration:
                cluster_id = 0
        else:
            cluster_id = 0
    return cluster_id


def _create_request_memory_expr(memory, multiplier):
    """Construct an HTCondor ClassAd expression for safe memory scaling.

    Parameters
    ----------
    memory : `int`
        Requested memory in MB.
    multiplier : `float`
        Memory growth rate between retries.

    Returns
    -------
    ad : `str`
        A string representing an HTCondor ClassAd expression enabling safe
        memory scaling between job retries.
    """
    # ClassAds 'Last*' are UNDEFINED when a job is put in the job queue.
    # The special comparison operators ensure that all comparisons below will
    # evaluate to FALSE in this case.
    was_mem_exceeded = "LastJobStatus =?= 5 " \
                       "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " \
                       "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"

    # If job runs the first time or was held for reasons other than exceeding
    # the memory, set the required memory to the requested value or use
    # the memory value measured by HTCondor (MemoryUsage) depending on
    # whichever is greater.
    ad = f"({was_mem_exceeded}) " \
         f"? int({memory} * pow({multiplier}, NumJobStarts)) " \
         f": max({{{memory}, MemoryUsage ?: 0}})"
    return ad