1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
22"""Interface between generic workflow to HTCondor workflow system.
23"""
25__all__ = ["HTCondorService", "HTCondorWorkflow"]
28import dataclasses
29import os
30import re
31import logging
32from enum import IntEnum, auto
33from pathlib import Path
35import htcondor
37from ... import (
38 BaseWmsWorkflow,
39 BaseWmsService,
40 GenericWorkflow,
41 GenericWorkflowJob,
42 WmsRunReport,
43 WmsJobReport,
44 WmsStates
45)
46from ...bps_utils import (
47 chdir,
48 create_count_summary
49)
50from .lssthtc import (
51 HTCDag,
52 HTCJob,
53 MISSING_ID,
54 JobStatus,
55 NodeStatus,
56 htc_check_dagman_output,
57 htc_escape,
58 htc_submit_dag,
59 read_dag_info,
60 read_dag_log,
61 read_dag_status,
62 read_node_status,
63 condor_q,
64 condor_search,
65 condor_status,
66 pegasus_name_to_label,
67 summary_from_dag,
68)
71class WmsIdType(IntEnum):
72 """Type of valid WMS ids.
73 """
75 UNKNOWN = auto()
76 """The type of id cannot be determined.
77 """
79 LOCAL = auto()
80 """The id is HTCondor job's ClusterId (with optional '.ProcId').
81 """
83 GLOBAL = auto()
84 """Id is a HTCondor's global job id.
85 """
87 PATH = auto()
88 """Id is a submission path.
89 """
92DEFAULT_HTC_EXEC_PATT = ".*worker.*"
93"""Default pattern for searching execute machines in an HTCondor pool.
94"""
96_LOG = logging.getLogger(__name__)
99class HTCondorService(BaseWmsService):
100 """HTCondor version of WMS service.
101 """
102 def prepare(self, config, generic_workflow, out_prefix=None):
103 """Convert generic workflow to an HTCondor DAG ready for submission.
105 Parameters
106 ----------
107 config : `lsst.ctrl.bps.BpsConfig`
108 BPS configuration that includes necessary submit/runtime
109 information.
110 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
111 The generic workflow (e.g., has executable name and arguments).
112 out_prefix : `str`
113 The root directory into which all WMS-specific files are written.
115 Returns
116 -------
117 workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow`
118 HTCondor workflow ready to be run.
119 """
120 _LOG.debug("out_prefix = '%s'", out_prefix)
121 workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix,
122 f"{self.__class__.__module__}."
123 f"{self.__class__.__name__}")
124 workflow.write(out_prefix)
125 return workflow
127 def submit(self, workflow):
128 """Submit a single HTCondor workflow.
130 Parameters
131 ----------
132 workflow : `lsst.ctrl.bps.BaseWorkflow`
133 A single HTCondor workflow to submit. run_id is updated after
134 successful submission to WMS.
135 """
136 # For workflow portability, internal paths are all relative. Hence
137 # the DAG needs to be submitted to HTCondor from inside the submit
138 # directory.
139 with chdir(workflow.submit_path):
140 _LOG.info("Submitting from directory: %s", os.getcwd())
141 htc_submit_dag(workflow.dag, {})
142 workflow.run_id = workflow.dag.run_id
144 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
145 """Query WMS for list of submitted WMS workflows/jobs.
147 This should be a quick lookup function to create a list of jobs for
148 other functions.
150 Parameters
151 ----------
152 wms_id : `int` or `str`, optional
153 Id or path that can be used by WMS service to look up job.
154 user : `str`, optional
155 User whose submitted jobs should be listed.
156 require_bps : `bool`, optional
157 Whether to require jobs returned in list to be bps-submitted jobs.
158 pass_thru : `str`, optional
159 Information to pass through to WMS.
160 is_global : `bool`, optional
161 If set, all job queues (and their histories) will be queried for
162 job information. Defaults to False which means that only the local
163 job queue will be queried.
165 Returns
166 -------
167 job_ids : `list` [`Any`]
168 Only job ids to be used by cancel and other functions. Typically
169 this means top-level jobs (i.e., not child jobs).
170 """
171 _LOG.debug("list_submitted_jobs params: "
172 "wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s",
173 wms_id, user, require_bps, pass_thru, is_global)
175 # Determine which Schedds will be queried for job information.
176 coll = htcondor.Collector()
178 schedd_ads = []
179 if is_global:
180 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
181 else:
182 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
184 # Construct appropriate constraint expression using provided arguments.
185 constraint = "False"
186 if wms_id is None:
187 if user is not None:
188 constraint = f'(Owner == "{user}")'
189 else:
190 schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id)
191 if cluster_id is not None:
192 constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"
194 # If provided id is either a submission path or a global id,
195 # make sure the right Schedd will be queried regardless of
196 # 'is_global' value.
197 if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}:
198 schedd_ads = [schedd_ad]
199 if require_bps:
200 constraint += ' && (bps_isjob == "True")'
201 if pass_thru:
202 if "-forcex" in pass_thru:
203 pass_thru_2 = pass_thru.replace("-forcex", "")
204 if pass_thru_2 and not pass_thru_2.isspace():
205 constraint += f" && ({pass_thru_2})"
206 else:
207 constraint += f" && ({pass_thru})"
209 # Create a list of scheduler daemons which need to be queried.
210 schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}
212 _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds))
213 results = condor_q(constraint=constraint, schedds=schedds)
215 # Prune child jobs where DAG job is in queue (i.e., aren't orphans).
216 job_ids = []
217 for schedd_name, job_info in results.items():
218 for job_id, job_ad in job_info.items():
219 _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None"))
220 if "DAGManJobId" not in job_ad:
221 job_ids.append(job_ad.get("GlobalJobId", job_id))
222 else:
223 _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0")
224 _LOG.debug("\tin jobs.keys() = %s", job_info.keys())
225 if f"{job_ad['DAGManJobId']}.0" not in job_info: # orphaned job
226 job_ids.append(job_ad.get("GlobalJobId", job_id))
228 _LOG.debug("job_ids = %s", job_ids)
229 return job_ids
231 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False):
232 """Return run information based upon given constraints.
234 Parameters
235 ----------
236 wms_workflow_id : `str`, optional
237 Limit to specific run based on id.
238 user : `str`, optional
239 Limit results to runs for this user.
240 hist : `float`, optional
241 Limit history search to this many days. Defaults to 0.
242 pass_thru : `str`, optional
243 Constraints to pass through to HTCondor.
244 is_global : `bool`, optional
245 If set, all job queues (and their histories) will be queried for
246 job information. Defaults to False which means that only the local
247 job queue will be queried.
249 Returns
250 -------
251 runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
252 Information about runs from given job information.
253 message : `str`
254 Extra message for report command to print. This could be pointers
255 to documentation or to WMS specific commands.
256 """
257 if wms_workflow_id:
258 id_type = _wms_id_type(wms_workflow_id)
259 if id_type == WmsIdType.LOCAL:
260 schedulers = _locate_schedds(locate_all=is_global)
261 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers)
262 elif id_type == WmsIdType.GLOBAL:
263 schedulers = _locate_schedds(locate_all=True)
264 run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers)
265 elif id_type == WmsIdType.PATH:
266 run_reports, message = _report_from_path(wms_workflow_id)
267 else:
268 run_reports, message = {}, 'Invalid job id'
269 else:
270 schedulers = _locate_schedds(locate_all=is_global)
271 run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers)
272 _LOG.debug("report: %s, %s", run_reports, message)
274 return list(run_reports.values()), message
276 def cancel(self, wms_id, pass_thru=None):
277 """Cancel submitted workflows/jobs.
279 Parameters
280 ----------
281 wms_id : `str`
282 Id or path of job that should be canceled.
283 pass_thru : `str`, optional
284 Information to pass through to WMS.
286 Returns
287 -------
288 deleted : `bool`
289 Whether the deletion was successful. Currently, if there is any doubt
290 or any individual job was not deleted, False is returned.
291 message : `str`
292 Any message from WMS (e.g., error details).
293 """
294 _LOG.debug("Canceling wms_id = %s", wms_id)
296 schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id)
298 if cluster_id is None:
299 deleted = False
300 message = "invalid id"
301 else:
302 _LOG.debug("Canceling job managed by schedd_name = %s with cluster_id = %s",
303 schedd_ad["Name"], cluster_id)
304 schedd = htcondor.Schedd(schedd_ad)
306 constraint = f"ClusterId == {cluster_id}"
307 if pass_thru is not None and "-forcex" in pass_thru:
308 pass_thru_2 = pass_thru.replace("-forcex", "")
309 if pass_thru_2 and not pass_thru_2.isspace():
310 constraint += f"&& ({pass_thru_2})"
311 _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
312 results = schedd.act(htcondor.JobAction.RemoveX, constraint)
313 else:
314 if pass_thru:
315 constraint += f"&& ({pass_thru})"
316 _LOG.debug("JobAction.Remove constraint = %s", constraint)
317 results = schedd.act(htcondor.JobAction.Remove, constraint)
318 _LOG.debug("Remove results: %s", results)
320 if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
321 deleted = True
322 message = ""
323 else:
324 deleted = False
325 if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
326 message = "no such bps job in batch queue"
327 else:
328 message = f"unknown problems deleting: {results}"
330 _LOG.debug("deleted: %s; message = %s", deleted, message)
331 return deleted, message
334class HTCondorWorkflow(BaseWmsWorkflow):
335 """Single HTCondor workflow.
337 Parameters
338 ----------
339 name : `str`
340 Unique name for Workflow used when naming files.
341 config : `lsst.ctrl.bps.BpsConfig`
342 BPS configuration that includes necessary submit/runtime information.
343 """
344 def __init__(self, name, config=None):
345 super().__init__(name, config)
346 self.dag = None
348 @classmethod
349 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
350 # Docstring inherited
351 htc_workflow = cls(generic_workflow.name, config)
352 htc_workflow.dag = HTCDag(name=generic_workflow.name)
354 _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
355 htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
356 htc_workflow.dag.add_attribs({"bps_wms_service": service_class,
357 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
358 "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
359 "bps_job_summary": create_count_summary(generic_workflow.job_counts)})
361 # Determine the hard limit for the memory requirement.
362 found, limit = config.search('memoryLimit')
363 if not found:
364 search_opts = {"default": DEFAULT_HTC_EXEC_PATT}
365 _, site = config.search("computeSite")
366 if site:
367 search_opts["curvals"] = {"curr_site": site}
368 _, patt = config.search("executeMachinesPattern", opt=search_opts)
370 # To reduce the amount of data, ignore dynamic slots (if any) as,
371 # by definition, they cannot have more memory than
372 # the partitionable slot they are part of.
373 constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)'
374 pool_info = condor_status(constraint=constraint)
375 try:
376 limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values())
377 except ValueError:
378 _LOG.debug("No execute machine in the pool matches %s", patt)
379 if limit:
380 config[".bps_defined.memory_limit"] = limit
382 # Create all DAG jobs
383 for job_name in generic_workflow:
384 gwjob = generic_workflow.get_job(job_name)
385 htc_job = HTCondorWorkflow._create_job(config, generic_workflow, gwjob, out_prefix)
386 htc_workflow.dag.add_job(htc_job)
388 # Add job dependencies to the DAG
389 for job_name in generic_workflow:
390 htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))
392 # If final job exists in generic workflow, create DAG final job
393 final = generic_workflow.get_final()
394 if final and isinstance(final, GenericWorkflowJob):
395 final_htjob = HTCondorWorkflow._create_job(config, generic_workflow, final, out_prefix)
396 if "post" not in final_htjob.dagcmds:
397 final_htjob.dagcmds["post"] = f"{os.path.dirname(__file__)}/final_post.sh" \
398 f" {final.name} $DAG_STATUS $RETURN"
399 htc_workflow.dag.add_final_job(final_htjob)
400 elif final and isinstance(final, GenericWorkflow):
401 raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
402 elif final:
403 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
405 return htc_workflow
407 @staticmethod
408 def _create_job(config, generic_workflow, gwjob, out_prefix):
409 """Convert GenericWorkflow job nodes to DAG jobs.
411 Parameters
412 ----------
413 config : `lsst.ctrl.bps.BpsConfig`
414 BPS configuration that includes necessary submit/runtime
415 information.
416 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
417 Generic workflow that is being converted.
418 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
419 The generic job to convert to a HTCondor job.
420 out_prefix : `str`
421 Directory prefix for HTCondor files.
423 Returns
424 -------
425 htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
426 The HTCondor job equivalent to the given generic job.
427 """
428 htc_job = HTCJob(gwjob.name, label=gwjob.label)
430 curvals = dataclasses.asdict(gwjob)
431 if gwjob.tags:
432 curvals.update(gwjob.tags)
433 found, subdir = config.search("subDirTemplate", opt={'curvals': curvals})
434 if not found:
435 subdir = "jobs"
436 htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"
438 htc_job_cmds = {
439 "universe": "vanilla",
440 "should_transfer_files": "YES",
441 "when_to_transfer_output": "ON_EXIT_OR_EVICT",
442 "transfer_output_files": '""', # Set to empty string to disable
443 "transfer_executable": "False",
444 "getenv": "True",
446 # Exceeding memory sometimes triggers a SIGBUS error. Tell HTCondor
447 # to put SIGBUS jobs on hold.
448 "on_exit_hold": "(ExitBySignal == true) && (ExitSignal == 7)",
449 "on_exit_hold_reason": '"Job raised a signal 7. Usually means job has gone over memory limit."',
450 "on_exit_hold_subcode": "34"
451 }
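# Signal 7 is SIGBUS on Linux. The hold subcode 34 set above is what the
# periodic_release/periodic_remove expressions built further down in this
# module look for (HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34), in
# addition to HTCondor's own memory-exceeded hold (HoldReasonCode =?= 34).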
453 htc_job_cmds.update(_translate_job_cmds(config, generic_workflow, gwjob))
455 # job stdout, stderr, htcondor user log.
456 for key in ("output", "error", "log"):
457 htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
458 _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
460 _, use_shared = config.search("bpsUseShared", opt={"default": False})
461 htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwjob.name, use_shared, out_prefix))
463 # Add the job cmds dict to the job object.
464 htc_job.add_job_cmds(htc_job_cmds)
466 htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))
468 # Add job attributes to job.
469 _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
470 htc_job.add_job_attrs(gwjob.attrs)
471 htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
472 htc_job.add_job_attrs({"bps_job_name": gwjob.name,
473 "bps_job_label": gwjob.label})
475 return htc_job
477 def write(self, out_prefix):
478 """Output HTCondor DAGMan files needed for workflow submission.
480 Parameters
481 ----------
482 out_prefix : `str`
483 Directory prefix for HTCondor files.
484 """
485 self.submit_path = out_prefix
486 os.makedirs(out_prefix, exist_ok=True)
488 # Write out the workflow in HTCondor format.
489 self.dag.write(out_prefix, "jobs/{self.label}")
492def _translate_job_cmds(config, generic_workflow, gwjob):
493 """Translate the job data that are one to one mapping
495 Parameters
496 ----------
497 config : `lsst.ctrl.bps.BpsConfig`
498 BPS configuration that includes necessary submit/runtime
499 information.
500 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
501 Generic workflow that contains the job being converted.
502 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
503 Generic workflow job to be converted.
505 Returns
506 -------
507 htc_job_commands : `dict` [`str`, `Any`]
508 Contains commands which can appear in the HTCondor submit description
509 file.
510 """
511 # Values in the job script that are just name mappings.
512 job_translation = {"mail_to": "notify_user",
513 "when_to_mail": "notification",
514 "request_cpus": "request_cpus",
515 "priority": "priority",
516 "category": "category"}
518 jobcmds = {}
519 for gwkey, htckey in job_translation.items():
520 jobcmds[htckey] = getattr(gwjob, gwkey, None)
522 # job commands that need modification
523 if gwjob.number_of_retries:
524 jobcmds["max_retries"] = f"{gwjob.number_of_retries}"
526 if gwjob.retry_unless_exit:
527 jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"
529 if gwjob.request_disk:
530 jobcmds["request_disk"] = f"{gwjob.request_disk}MB"
532 if gwjob.request_memory:
533 jobcmds["request_memory"] = f"{gwjob.request_memory}"
535 if gwjob.memory_multiplier:
536 # Do not use try-except! At the moment, BpsConfig returns an empty
537 # string if it does not contain the key.
538 memory_limit = config[".bps_defined.memory_limit"]
539 if not memory_limit:
540 raise RuntimeError("Memory autoscaling enabled, but automatic detection of the memory limit "
541 "failed; setting it explicitly with 'memoryLimit' or changing worker node "
542 "search pattern 'executeMachinesPattern' might help.")
544 # Set maximal amount of memory job can ask for.
545 #
546 # The check below assumes that 'memory_limit' was set to a value which
547 # realistically reflects actual physical limitations of a given compute
548 # resource.
549 memory_max = memory_limit
550 if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit:
551 memory_max = gwjob.request_memory_max
553 # Make job ask for more memory each time it failed due to insufficient
554 # memory requirements.
555 jobcmds["request_memory"] = \
556 _create_request_memory_expr(gwjob.request_memory, gwjob.memory_multiplier, memory_max)
558 # Periodically release jobs which are being held due to exceeding
559 # memory. Stop doing that (by removing the job from the HTCondor queue)
560 # after the maximal number of retries has been reached or the job was
561 # already run at maximal allowed memory.
562 jobcmds["periodic_release"] = \
563 _create_periodic_release_expr(gwjob.request_memory, gwjob.memory_multiplier, memory_max)
564 jobcmds["periodic_remove"] = \
565 _create_periodic_remove_expr(gwjob.request_memory, gwjob.memory_multiplier, memory_max)
567 # Assume concurrency_limit implemented using HTCondor concurrency limits.
568 # May need to move to special site-specific implementation if sites use
569 # other mechanisms.
570 if gwjob.concurrency_limit:
571 jobcmds["concurrency_limit"] = gwjob.concurrency_limit
573 # Handle command line
574 if gwjob.executable.transfer_executable:
575 jobcmds["transfer_executable"] = "True"
576 jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
577 else:
578 jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)
580 if gwjob.arguments:
581 arguments = gwjob.arguments
582 arguments = _replace_cmd_vars(arguments, gwjob)
583 arguments = _replace_file_vars(config, arguments, generic_workflow, gwjob)
584 arguments = _fix_env_var_syntax(arguments)
585 jobcmds["arguments"] = arguments
587 # Add extra "pass-thru" job commands
588 if gwjob.profile:
589 for key, val in gwjob.profile.items():
590 jobcmds[key] = htc_escape(val)
592 return jobcmds
595def _translate_dag_cmds(gwjob):
596 """Translate job values into DAGMan commands.
598 Parameters
599 ----------
600 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
601 Job containing values to be translated.
603 Returns
604 -------
605 dagcmds : `dict` [`str`, `Any`]
606 DAGMan commands for the job.
607 """
608 # Values in the dag script that are just name mappings.
609 dag_translation = {"abort_on_value": "abort_dag_on",
610 "abort_return_value": "abort_exit"}
612 dagcmds = {}
613 for gwkey, htckey in dag_translation.items():
614 dagcmds[htckey] = getattr(gwjob, gwkey, None)
616 # Still to be coded: vars "pre_cmdline", "post_cmdline"
617 return dagcmds
620def _fix_env_var_syntax(oldstr):
621 """Change ENV place holders to HTCondor Env var syntax.
623 Parameters
624 ----------
625 oldstr : `str`
626 String in which environment variable syntax is to be fixed.
628 Returns
629 -------
630 newstr : `str`
631 Given string with environment variable syntax fixed.
632 """
633 newstr = oldstr
634 for key in re.findall(r"<ENV:([^>]+)>", oldstr):
635 newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
636 return newstr
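# For example (the environment variable name is hypothetical):
#     _fix_env_var_syntax("<ENV:LSST_RUN_DIR>/butler.yaml")
# returns "$ENV(LSST_RUN_DIR)/butler.yaml", matching the <ENV:...> pattern
# handled above.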
639def _replace_file_vars(config, arguments, workflow, gwjob):
640 """Replace file placeholders in command line arguments with correct
641 physical file names.
643 Parameters
644 ----------
645 config : `lsst.ctrl.bps.BpsConfig`
646 BPS configuration that includes necessary submit/runtime
647 information.
648 arguments : `str`
649 Arguments string in which to replace file placeholders.
650 workflow : `lsst.ctrl.bps.GenericWorkflow`
651 Generic workflow that contains file information.
652 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
653 The job corresponding to the arguments.
655 Returns
656 -------
657 arguments : `str`
658 Given arguments string with file placeholders replaced.
659 """
660 _, use_shared = config.search("bpsUseShared", opt={"default": False})
662 # Replace input file placeholders with paths.
663 for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
664 if not gwfile.wms_transfer:
665 # Must assume full URI if in command line and told WMS is not
666 # responsible for transferring file.
667 uri = gwfile.src_uri
668 elif use_shared:
669 if gwfile.job_shared:
670 # Have shared filesystems and jobs can share file.
671 uri = gwfile.src_uri
672 else:
673 # Taking advantage of inside knowledge. Not future-proof.
674 # Temporary fix until have job wrapper that pulls files
675 # within job.
676 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml":
677 uri = "butler.yaml"
678 else:
679 uri = os.path.basename(gwfile.src_uri)
680 else: # Using push transfer
681 uri = os.path.basename(gwfile.src_uri)
682 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
684 # Replace output file placeholders with paths.
685 for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
686 if not gwfile.wms_transfer:
687 # Must assume full URI if in command line and told WMS is not
688 # responsible for transferring file.
689 uri = gwfile.src_uri
690 elif use_shared:
691 if gwfile.job_shared:
692 # Have shared filesystems and jobs can share file.
693 uri = gwfile.src_uri
694 else:
695 uri = os.path.basename(gwfile.src_uri)
696 else: # Using push transfer
697 uri = os.path.basename(gwfile.src_uri)
698 arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
699 return arguments
702def _replace_cmd_vars(arguments, gwjob):
703 """Replace format-style placeholders in arguments.
705 Parameters
706 ----------
707 arguments : `str`
708 Arguments string in which to replace placeholders.
709 gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
710 Job containing values to be used to replace placeholders
711 (in particular gwjob.cmdvals).
713 Returns
714 -------
715 arguments : `str`
716 Given arguments string with placeholders replaced.
717 """
718 try:
719 arguments = arguments.format(**gwjob.cmdvals)
720 except (KeyError, TypeError): # TypeError in case None instead of {}
721 _LOG.error("Could not replace command variables:\n"
722 "arguments: %s\n"
723 "cmdvals: %s", arguments, gwjob.cmdvals)
724 raise
725 return arguments
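# A sketch of the substitution above (placeholder and value names are
# hypothetical): an arguments string like "--qgraph {qgraphFile} --node {node}"
# combined with gwjob.cmdvals == {"qgraphFile": "job.qgraph", "node": "42"}
# yields "--qgraph job.qgraph --node 42".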
728def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str):
729 """Add job input files from generic workflow to job.
731 Parameters
732 ----------
733 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
734 The generic workflow (e.g., has executable name and arguments).
735 job_name : `str`
736 Unique name for the job.
737 use_shared : `bool`
738 Whether job has access to files via shared filesystem.
739 out_prefix : `str`
740 The root directory into which all WMS-specific files are written.
742 Returns
743 -------
744 htc_commands : `dict` [`str`, `str`]
745 HTCondor commands for the job submission script.
746 """
747 htc_commands = {}
748 inputs = []
749 for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
750 _LOG.debug("src_uri=%s", gwf_file.src_uri)
752 uri = Path(gwf_file.src_uri)
754 # Note if use_shared and job_shared, don't need to transfer file.
756 if not use_shared: # Copy file using push to job
757 inputs.append(str(uri.relative_to(out_prefix)))
758 elif not gwf_file.job_shared: # Jobs require own copy
760 # if using shared filesystem, but still need copy in job. Use
761 # HTCondor's curl plugin for a local copy.
763 # Execution butler is represented as a directory which the
764 # curl plugin does not handle. Taking advantage of inside
765 # knowledge for temporary fix until have job wrapper that pulls
766 # files within job.
767 if gwf_file.name == "butlerConfig":
768 # The execution butler directory doesn't normally exist until
769 # the submit phase so checking for suffix instead of using
770 # is_dir(). If other non-yaml files exist, they would have a
771 # different gwf_file.name.
772 if uri.suffix == ".yaml": # Single file, so just copy.
773 inputs.append(f"file://{uri}")
774 else:
775 inputs.append(f"file://{uri / 'butler.yaml'}")
776 inputs.append(f"file://{uri / 'gen3.sqlite3'}")
777 elif uri.is_dir():
778 raise RuntimeError("HTCondor plugin cannot transfer directories locally within job "
779 f"({gwf_file.src_uri})")
780 else:
781 inputs.append(f"file://{uri}")
783 if inputs:
784 htc_commands["transfer_input_files"] = ",".join(inputs)
785 _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
786 return htc_commands
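# A sketch of the result (file names are hypothetical): with push transfer
# (use_shared=False) two inputs could produce
#     transfer_input_files = jobs/label1/job1.sub,inputs/job1.qgraph
# whereas the shared-filesystem curl-plugin path produces file:// URIs instead.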
789def _report_from_path(wms_path):
790 """Gather run information from a given run directory.
792 Parameters
793 ----------
794 wms_path : `str`
795 The directory containing the submit side files (e.g., HTCondor files).
797 Returns
798 -------
799 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
800 Run information for the detailed report. The key is the HTCondor id
801 and the value is a collection of report information for that run.
802 message : `str`
803 Message to be printed with the summary report.
804 """
805 wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
806 if wms_workflow_id == MISSING_ID:
807 run_reports = {}
808 else:
809 run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
810 return run_reports, message
813def _report_from_id(wms_workflow_id, hist, schedds=None):
814 """Gather run information using workflow id.
816 Parameters
817 ----------
818 wms_workflow_id : `str`
819 Limit to specific run based on id.
820 hist : `float`
821 Limit history search to this many days.
822 schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
823 HTCondor schedulers which to query for job information. If None
824 (default), all queries will be run against the local scheduler only.
826 Returns
827 -------
828 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
829 Run information for the detailed report. The key is the HTCondor id
830 and the value is a collection of report information for that run.
831 message : `str`
832 Message to be printed with the summary report.
833 """
834 dag_constraint = 'regexp("dagman$", Cmd)'
835 try:
836 cluster_id = int(float(wms_workflow_id))
837 except ValueError:
838 dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"'
839 else:
840 dag_constraint += f" && ClusterId == {cluster_id}"
842 # With the current implementation of the condor_* functions the query will
843 # always return only one match per Scheduler.
844 #
845 # Even in the highly unlikely situation where HTCondor history (which
846 # condor_search queries too) is long enough to have jobs from before the
847 # cluster ids were rolled over (and as a result there is more then one job
848 # with the same cluster id) they will not show up in the results.
849 schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds)
850 if len(schedd_dag_info) == 0:
851 run_reports = {}
852 message = ""
853 elif len(schedd_dag_info) == 1:
854 _, dag_info = schedd_dag_info.popitem()
855 dag_id, dag_ad = dag_info.popitem()
857 # Create a mapping between jobs and their classads. The keys will be
858 # of format 'ClusterId.ProcId'.
859 job_info = {dag_id: dag_ad}
861 # Find jobs (nodes) belonging to that DAGMan job.
862 job_constraint = f"DAGManJobId == {int(float(dag_id))}"
863 schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds)
864 _, node_info = schedd_job_info.popitem()
865 job_info.update(node_info)
867 # Collect additional pieces of information about jobs using HTCondor
868 # files in the submission directory.
869 _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"])
870 _update_jobs(job_info, path_jobs)
872 run_reports = _create_detailed_report_from_jobs(dag_id, job_info)
873 message = ""
874 else:
875 ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()]
876 run_reports = {}
877 message = f"More than one job matches id '{wms_workflow_id}', " \
878 f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids"
879 return run_reports, message
882def _get_info_from_path(wms_path):
883 """Gather run information from a given run directory.
885 Parameters
886 ----------
887 wms_path : `str`
888 Directory containing HTCondor files.
890 Returns
891 -------
892 wms_workflow_id : `str`
893 The run id which is a DAGman job id.
894 jobs : `dict` [`str`, `dict` [`str`, `Any`]]
895 Information about jobs read from files in the given directory.
896 The key is the HTCondor id and the value is a dictionary of HTCondor
897 keys and values.
898 message : `str`
899 Message to be printed with the summary report.
900 """
901 messages = []
902 try:
903 wms_workflow_id, jobs = read_dag_log(wms_path)
904 _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
905 _update_jobs(jobs, read_node_status(wms_path))
906 _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)
908 # Add more info for DAGman job
909 job = jobs[wms_workflow_id]
910 job.update(read_dag_status(wms_path))
912 job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
913 if "bps_run" not in job:
914 _add_run_info(wms_path, job)
916 message = htc_check_dagman_output(wms_path)
917 if message:
918 messages.append(message)
919 _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id,
920 jobs[wms_workflow_id]["total_jobs"])
922 # Add extra pieces of information which cannot be found in HTCondor
923 # generated files like 'GlobalJobId'.
924 #
925 # Do not treat absence of this file as a serious error. Neither runs
926 # submitted with earlier versions of the plugin nor the runs submitted
927 # with the Pegasus plugin will have it at the moment. However, once enough
928 # time passes and the Pegasus plugin has its own report() method
929 # (instead of sneakily using HTCondor's one), the lack of that file
930 # should be treated as seriously as lack of any other file.
931 try:
932 job_info = read_dag_info(wms_path)
933 except FileNotFoundError as exc:
934 message = f"Warn: Some information may not be available: {exc}"
935 messages.append(message)
936 else:
937 schedd_name = next(iter(job_info))
938 job_ad = next(iter(job_info[schedd_name].values()))
939 job.update(job_ad)
940 except FileNotFoundError:
941 message = f"Could not find HTCondor files in '{wms_path}'"
942 _LOG.warning(message)
943 messages.append(message)
944 wms_workflow_id = MISSING_ID
945 jobs = {}
947 message = '\n'.join([msg for msg in messages if msg])
948 return wms_workflow_id, jobs, message
951def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
952 """Gather run information to be used in generating summary reports.
954 Parameters
955 ----------
956 wms_workflow_id : `str`
957 The run id to create the report for.
958 jobs : `dict` [`str`, `dict` [`str`, Any]]
959 Mapping HTCondor job id to job information.
961 Returns
962 -------
963 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
964 Run information for the detailed report. The key is the given HTCondor
965 id and the value is a collection of report information for that run.
966 """
967 _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
968 dag_job = jobs[wms_workflow_id]
969 report = WmsRunReport(wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}",
970 global_wms_id=dag_job.get("GlobalJobId", "MISS"),
971 path=dag_job["Iwd"],
972 label=dag_job.get("bps_job_label", "MISS"),
973 run=dag_job.get("bps_run", "MISS"),
974 project=dag_job.get("bps_project", "MISS"),
975 campaign=dag_job.get("bps_campaign", "MISS"),
976 payload=dag_job.get("bps_payload", "MISS"),
977 operator=_get_owner(dag_job),
978 run_summary=_get_run_summary(dag_job),
979 state=_htc_status_to_wms_state(dag_job),
980 jobs=[],
981 total_number_jobs=dag_job["total_jobs"],
982 job_state_counts=dag_job["state_counts"])
984 for job_id, job_info in jobs.items():
985 try:
986 if job_info["ClusterId"] != int(float(wms_workflow_id)):
987 job_report = WmsJobReport(wms_id=job_id,
988 name=job_info.get("DAGNodeName", job_id),
989 label=job_info.get("bps_job_label",
990 pegasus_name_to_label(job_info["DAGNodeName"])),
991 state=_htc_status_to_wms_state(job_info))
992 if job_report.label == "init":
993 job_report.label = "pipetaskInit"
994 report.jobs.append(job_report)
995 except KeyError as ex:
996 _LOG.error("Job missing key '%s': %s", str(ex), job_info)
997 raise
999 run_reports = {report.wms_id: report}
1000 _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
1001 return run_reports
1004def _summary_report(user, hist, pass_thru, schedds=None):
1005 """Gather run information to be used in generating summary reports.
1007 Parameters
1008 ----------
1009 user : `str`
1010 Run lookup restricted to given user.
1011 hist : `float`
1012 How many previous days to search for run information.
1013 pass_thru : `str`
1014 Advanced users can define the HTCondor constraint to be used
1015 when searching queue and history.
schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
HTCondor schedulers which to query for job information. If None
(default), all queries will be run against the local scheduler only.
1017 Returns
1018 -------
1019 run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
1020 Run information for the summary report. The keys are HTCondor ids and
1021 the values are collections of report information for each run.
1022 message : `str`
1023 Message to be printed with the summary report.
1024 """
1025 # Only doing a summary report, so only look for DAGMan jobs.
1026 if pass_thru:
1027 constraint = pass_thru
1028 else:
1029 # Notes:
1030 # * bps_isjob == 'True' isn't getting set for DAG jobs that are
1031 # manually restarted.
1032 # * Any job with DAGManJobID isn't a DAG job
1033 constraint = 'bps_isjob == "True" && JobUniverse == 7'
1034 if user:
1035 constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'
1037 job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds)
1039 # Have list of DAGMan jobs, need to get run_report info.
1040 run_reports = {}
1041 for jobs in job_info.values():
1042 for job_id, job in jobs.items():
1043 total_jobs, state_counts = _get_state_counts_from_dag_job(job)
1044 # If it wasn't available from the queue information (e.g., Kerberos bug),
1045 # try reading from file.
1046 if total_jobs == 0:
1047 try:
1048 job.update(read_dag_status(job["Iwd"]))
1049 total_jobs, state_counts = _get_state_counts_from_dag_job(job)
1050 except StopIteration:
1051 pass  # Don't kill the report just because HTCondor files can't be found.
1053 if "bps_run" not in job:
1054 _add_run_info(job["Iwd"], job)
1055 report = WmsRunReport(wms_id=job_id,
1056 global_wms_id=job["GlobalJobId"],
1057 path=job["Iwd"],
1058 label=job.get("bps_job_label", "MISS"),
1059 run=job.get("bps_run", "MISS"),
1060 project=job.get("bps_project", "MISS"),
1061 campaign=job.get("bps_campaign", "MISS"),
1062 payload=job.get("bps_payload", "MISS"),
1063 operator=_get_owner(job),
1064 run_summary=_get_run_summary(job),
1065 state=_htc_status_to_wms_state(job),
1066 jobs=[],
1067 total_number_jobs=total_jobs,
1068 job_state_counts=state_counts)
1069 run_reports[report.global_wms_id] = report
1071 return run_reports, ""
1074def _add_run_info(wms_path, job):
1075 """Find BPS run information elsewhere for runs without bps attributes.
1077 Parameters
1078 ----------
1079 wms_path : `str`
1080 Path to submit files for the run.
1081 job : `dict` [`str`, `Any`]
1082 HTCondor dag job information.
1084 Raises
1085 ------
1086 StopIteration
1087 If the file being looked for cannot be found. Permission errors are
1088 caught and the job's run is marked with an error.
1089 """
1090 path = Path(wms_path) / "jobs"
1091 try:
1092 subfile = next(path.glob("**/*.sub"))
1093 except (StopIteration, PermissionError):
1094 job["bps_run"] = "Unavailable"
1095 else:
1096 _LOG.debug("_add_run_info: subfile = %s", subfile)
1097 try:
1098 with open(subfile, "r") as fh:
1099 for line in fh:
1100 if line.startswith("+bps_"):
1101 m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
1102 if m:
1103 _LOG.debug("Matching line: %s", line)
1104 job[m.group(1)] = m.group(2).replace('"', "")
1105 else:
1106 _LOG.debug("Could not parse attribute: %s", line)
1107 except PermissionError:
1108 job["bps_run"] = "PermissionError"
1109 _LOG.debug("After adding job = %s", job)
1112def _get_owner(job):
1113 """Get the owner of a dag job.
1115 Parameters
1116 ----------
1117 job : `dict` [`str`, `Any`]
1118 HTCondor dag job information.
1120 Returns
1121 -------
1122 owner : `str`
1123 Owner of the dag job.
1124 """
1125 owner = job.get("bps_operator", None)
1126 if not owner:
1127 owner = job.get("Owner", None)
1128 if not owner:
1129 _LOG.warning("Could not get Owner from htcondor job: %s", job)
1130 owner = "MISS"
1131 return owner
1134def _get_run_summary(job):
1135 """Get the run summary for a job.
1137 Parameters
1138 ----------
1139 job : `dict` [`str`, `Any`]
1140 HTCondor dag job information.
1142 Returns
1143 -------
1144 summary : `str`
1145 Number of jobs per PipelineTask label in approximate pipeline order.
1146 Format: <label>:<count>[;<label>:<count>]+
1147 """
1148 summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
1149 if not summary:
1150 summary, _ = summary_from_dag(job["Iwd"])
1151 if not summary:
1152 _LOG.warning("Could not get run summary for htcondor job: %s", job)
1153 _LOG.debug("_get_run_summary: summary=%s", summary)
1155 # Work around init sometimes being used instead of pipetaskInit.
1156 summary = summary.replace("init:", "pipetaskInit:")
1158 if "pegasus_version" in job and "pegasus" not in summary:
1159 summary += ";pegasus:0"
1161 return summary
1164def _get_state_counts_from_jobs(wms_workflow_id, jobs):
1165 """Count number of jobs per WMS state.
1167 Parameters
1168 ----------
1169 wms_workflow_id : `str`
1170 HTCondor job id.
1171 jobs : `dict` [`str`, `Any`]
1172 HTCondor dag job information.
1174 Returns
1175 -------
1176 total_count : `int`
1177 Total number of dag nodes.
1178 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
1179 Keys are the different WMS states and values are counts of jobs
1180 that are in that WMS state.
1181 """
1182 state_counts = dict.fromkeys(WmsStates, 0)
1184 for jid, jinfo in jobs.items():
1185 if jid != wms_workflow_id:
1186 state_counts[_htc_status_to_wms_state(jinfo)] += 1
1188 total_counted = sum(state_counts.values())
1189 if "NodesTotal" in jobs[wms_workflow_id]:
1190 total_count = jobs[wms_workflow_id]["NodesTotal"]
1191 else:
1192 total_count = total_counted
1194 state_counts[WmsStates.UNREADY] += total_count - total_counted
1196 return total_count, state_counts
1199def _get_state_counts_from_dag_job(job):
1200 """Count number of jobs per WMS state.
1202 Parameters
1203 ----------
1204 job : `dict` [`str`, `Any`]
1205 HTCondor dag job information.
1207 Returns
1208 -------
1209 total_count : `int`
1210 Total number of dag nodes.
1211 state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
1212 Keys are the different WMS states and values are counts of jobs
1213 that are in that WMS state.
1214 """
1215 _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
1216 state_counts = dict.fromkeys(WmsStates, 0)
1217 if "DAG_NodesReady" in job:
1218 state_counts = {
1219 WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
1220 WmsStates.READY: job.get("DAG_NodesReady", 0),
1221 WmsStates.HELD: job.get("JobProcsHeld", 0),
1222 WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
1223 WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
1224 WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)}
1225 total_jobs = job.get("DAG_NodesTotal")
1226 _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
1227 elif "NodesFailed" in job:
1228 state_counts = {
1229 WmsStates.UNREADY: job.get("NodesUnready", 0),
1230 WmsStates.READY: job.get("NodesReady", 0),
1231 WmsStates.HELD: job.get("JobProcsHeld", 0),
1232 WmsStates.SUCCEEDED: job.get("NodesDone", 0),
1233 WmsStates.FAILED: job.get("NodesFailed", 0),
1234 WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)}
1235 try:
1236 total_jobs = job.get("NodesTotal")
1237 except KeyError as ex:
1238 _LOG.error("Job missing %s. job = %s", str(ex), job)
1239 raise
1240 _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
1241 else:
1242 # With Kerberos job auth and the Kerberos bug, a warning would be printed
1243 # for every DAG.
1244 _LOG.debug("Can't get job state counts %s", job["Iwd"])
1245 total_jobs = 0
1247 _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
1248 return total_jobs, state_counts
1251def _htc_status_to_wms_state(job):
1252 """Convert HTCondor job status to generic wms state.
1254 Parameters
1255 ----------
1256 job : `dict` [`str`, `Any`]
1257 HTCondor job information.
1259 Returns
1260 -------
1261 wms_state : `WmsStates`
1262 The equivalent WmsState to given job's status.
1263 """
1264 wms_state = WmsStates.MISFIT
1265 if "JobStatus" in job:
1266 wms_state = _htc_job_status_to_wms_state(job)
1267 elif "NodeStatus" in job:
1268 wms_state = _htc_node_status_to_wms_state(job)
1269 return wms_state
1272def _htc_job_status_to_wms_state(job):
1273 """Convert HTCondor job status to generic wms state.
1275 Parameters
1276 ----------
1277 job : `dict` [`str`, `Any`]
1278 HTCondor job information.
1280 Returns
1281 -------
1282 wms_state : `lsst.ctrl.bps.WmsStates`
1283 The equivalent WmsState to given job's status.
1284 """
1285 _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"],
1286 type(job["JobStatus"]))
1287 job_status = int(job["JobStatus"])
1288 wms_state = WmsStates.MISFIT
1290 _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
1291 if job_status == JobStatus.IDLE:
1292 wms_state = WmsStates.PENDING
1293 elif job_status == JobStatus.RUNNING:
1294 wms_state = WmsStates.RUNNING
1295 elif job_status == JobStatus.REMOVED:
1296 wms_state = WmsStates.DELETED
1297 elif job_status == JobStatus.COMPLETED:
1298 if job.get("ExitBySignal", False) or job.get("ExitCode", 0) or \
1299 job.get("ExitSignal", 0) or job.get("DAG_Status", 0) or \
1300 job.get("ReturnValue", 0):
1301 wms_state = WmsStates.FAILED
1302 else:
1303 wms_state = WmsStates.SUCCEEDED
1304 elif job_status == JobStatus.HELD:
1305 wms_state = WmsStates.HELD
1307 return wms_state
1310def _htc_node_status_to_wms_state(job):
1311 """Convert HTCondor status to generic wms state.
1313 Parameters
1314 ----------
1315 job : `dict` [`str`, `Any`]
1316 HTCondor job information.
1318 Returns
1319 -------
1320 wms_state : `lsst.ctrl.bps.WmsStates`
1321 The equivalent WmsState to given node's status.
1322 """
1323 wms_state = WmsStates.MISFIT
1325 status = job["NodeStatus"]
1326 if status == NodeStatus.NOT_READY:
1327 wms_state = WmsStates.UNREADY
1328 elif status == NodeStatus.READY:
1329 wms_state = WmsStates.READY
1330 elif status == NodeStatus.PRERUN:
1331 wms_state = WmsStates.MISFIT
1332 elif status == NodeStatus.SUBMITTED:
1333 if job["JobProcsHeld"]:
1334 wms_state = WmsStates.HELD
1335 elif job["StatusDetails"] == "not_idle":
1336 wms_state = WmsStates.RUNNING
1337 elif job["JobProcsQueued"]:
1338 wms_state = WmsStates.PENDING
1339 elif status == NodeStatus.POSTRUN:
1340 wms_state = WmsStates.MISFIT
1341 elif status == NodeStatus.DONE:
1342 wms_state = WmsStates.SUCCEEDED
1343 elif status == NodeStatus.ERROR:
1344 # Use the job exit status instead of the post script exit status.
1345 if "DAGMAN error 0" in job["StatusDetails"]:
1346 wms_state = WmsStates.SUCCEEDED
1347 else:
1348 wms_state = WmsStates.FAILED
1350 return wms_state
1353def _update_jobs(jobs1, jobs2):
1354 """Update jobs1 with info in jobs2.
1356 (Basically an update for nested dictionaries.)
1358 Parameters
1359 ----------
1360 jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
1361 HTCondor job information to be updated.
1362 jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
1363 Additional HTCondor job information.
1364 """
1365 for jid, jinfo in jobs2.items():
1366 if jid in jobs1:
1367 jobs1[jid].update(jinfo)
1368 else:
1369 jobs1[jid] = jinfo
1372def _wms_id_type(wms_id):
1373 """Determine the type of the WMS id.
1375 Parameters
1376 ----------
1377 wms_id : `str`
1378 WMS id identifying a job.
1380 Returns
1381 -------
1382 id_type : `lsst.ctrl.bps.wms.htcondor.WmsIdType`
1383 Type of WMS id.
1384 """
1385 try:
1386 int(float(wms_id))
1387 except ValueError:
1388 wms_path = Path(wms_id)
1389 if wms_path.exists():
1390 id_type = WmsIdType.PATH
1391 else:
1392 id_type = WmsIdType.GLOBAL
1393 except TypeError:
1394 id_type = WmsIdType.UNKNOWN
1395 else:
1396 id_type = WmsIdType.LOCAL
1397 return id_type
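# Rough illustration (example values are hypothetical): "1234" or "1234.0"
# parse as numbers and are treated as LOCAL ids; an existing submit directory
# such as "/path/to/submit/run1" is PATH; a string that is neither numeric nor
# an existing path (e.g. an HTCondor global job id) is GLOBAL; a value that is
# neither a number nor a string-like path is UNKNOWN.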
1400def _wms_id_to_cluster(wms_id):
1401 """Convert WMS id to cluster id.
1403 Parameters
1404 ----------
1405 wms_id : `int` or `float` or `str`
1406 HTCondor job id or path.
1408 Returns
1409 -------
1410 schedd_ad : `classad.ClassAd`
1411 ClassAd describing the scheduler managing the job with the given id.
1412 cluster_id : `int`
1413 HTCondor cluster id.
1414 id_type : `lsst.ctrl.bps.wms.htcondor.WmsIdType`
1415 The type of the provided id.
1416 """
1417 coll = htcondor.Collector()
1419 schedd_ad = None
1420 cluster_id = None
1421 id_type = _wms_id_type(wms_id)
1422 if id_type == WmsIdType.LOCAL:
1423 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
1424 cluster_id = int(float(wms_id))
1425 elif id_type == WmsIdType.GLOBAL:
1426 constraint = f'GlobalJobId == "{wms_id}"'
1427 schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)}
1428 schedds = [htcondor.Schedd(ad) for ad in schedd_ads.values()]
1429 queries = [schedd.xquery(requirements=constraint, projection=["ClusterId"]) for schedd in schedds]
1430 results = {query.tag(): dict(ads[0]) for query in htcondor.poll(queries)
1431 if (ads := query.nextAdsNonBlocking())}
1432 if results:
1433 schedd_name = next(iter(results))
1434 schedd_ad = schedd_ads[schedd_name]
1435 cluster_id = results[schedd_name]["ClusterId"]
1436 elif id_type == WmsIdType.PATH:
1437 try:
1438 job_info = read_dag_info(wms_id)
1439 except (FileNotFoundError, PermissionError, IOError):
1440 pass
1441 else:
1442 schedd_name = next(iter(job_info))
1443 job_id = next(iter(job_info[schedd_name]))
1444 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name)
1445 cluster_id = int(float(job_id))
1446 else:
1447 pass
1448 return schedd_ad, cluster_id, id_type
1451def _create_periodic_release_expr(memory, multiplier, limit):
1452 """Construct an HTCondorAd expression for releasing held jobs.
1454 The expression instructs HTCondor to release any job which was put on hold
1455 due to exceeding memory requirements back to the job queue provided it
1456 satisfies all of the conditions below:
1458 * number of run attempts did not reach allowable number of retries,
1459 * the memory requirements in the last failed run attempt did not reach
1460 the specified memory limit.
1462 Parameters
1463 ----------
1464 memory : `int`
1465 Requested memory in MB.
1466 multiplier : `float`
1467 Memory growth rate between retries.
1468 limit : `int`
1469 Memory limit.
1471 Returns
1472 -------
1473 expr : `str`
1474 A string representing an HTCondor ClassAd expression for releasing jobs
1475 which have been held due to exceeding the memory requirements.
1476 """
1477 is_retry_allowed = "NumJobStarts <= JobMaxRetries"
1478 was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
1480 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
1481 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
1482 # The special comparison operators ensure that all comparisons below will
1483 # evaluate to FALSE in this case.
1484 #
1485 # Note:
1486 # May not be strictly necessary. Operators '&&' and '||' are not strict so
1487 # the entire expression should evaluate to FALSE when the job is not HELD.
1488 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
1489 # but better safe than sorry.
1490 was_mem_exceeded = "JobStatus == 5 " \
1491 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " \
1492 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
1494 expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}"
1495 return expr
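# For illustration, _create_periodic_release_expr(2048, 2.0, 8192) (example
# values) produces the expression (shown wrapped here; the generated string is
# a single line):
#     JobStatus == 5 && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#     || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#     && NumJobStarts <= JobMaxRetries
#     && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) < 8192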
1498def _create_periodic_remove_expr(memory, multiplier, limit):
1499 """Construct an HTCondorAd expression for removing jobs from the queue.
1501 The expression instructs HTCondor to remove any job which was put on hold
1502 due to exceeding memory requirements from the job queue provided it
1503 satisfies any of the conditions below:
1505 * allowable number of retries was reached,
1506 * the memory requirements during the last failed run attempt reached
1507 the specified memory limit.
1509 Parameters
1510 ----------
1511 memory : `int`
1512 Requested memory in MB.
1513 multiplier : `float`
1514 Memory growth rate between retries.
1515 limit : `int`
1516 Memory limit.
1518 Returns
1519 -------
1520 expr : `str`
1521 A string representing an HTCondor ClassAd expression for removing jobs
1522 which were run at the maximal allowable memory and still exceeded
1523 the memory requirements.
1524 """
1525 is_retry_disallowed = "NumJobStarts > JobMaxRetries"
1526 was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
1528 # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
1529 # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
1530 # The special comparison operators ensure that all comparisons below will
1531 # evaluate to FALSE in this case.
1532 #
1533 # Note:
1534 # May not be strictly necessary. Operators '&&' and '||' are not strict so
1535 # the entire expression should evaluate to FALSE when the job is not HELD.
1536 # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
1537 # but better safe than sorry.
1538 was_mem_exceeded = "JobStatus == 5 " \
1539 "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 " \
1540 "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
1542 expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})"
1543 return expr
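# With the same example values (2048 MB request, multiplier 2.0, limit 8192 MB)
# the remove expression shares the hold check and differs only in its tail:
#     JobStatus == 5 && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#     || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#     && (NumJobStarts > JobMaxRetries
#     || min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) == 8192)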
1546def _create_request_memory_expr(memory, multiplier, limit):
1547 """Construct an HTCondor ClassAd expression for safe memory scaling.
1549 Parameters
1550 ----------
1551 memory : `int`
1552 Requested memory in MB.
1553 multiplier : `float`
1554 Memory growth rate between retries.
1555 limit : `int`
1556 Memory limit.
1558 Returns
1559 -------
1560 expr : `str`
1561 A string representing an HTCondor ClassAd expression enabling safe
1562 memory scaling between job retries.
1563 """
1564 # The check if the job was held due to exceeding memory requirements
1565 # will be made *after* job was released back to the job queue (is in
1566 # the IDLE state), hence the need to use `Last*` job ClassAds instead of
1567 # the ones describing job's current state.
1568 #
1569 # Also, 'Last*' job ClassAds attributes are UNDEFINED when a job is
1570 # initially put in the job queue. The special comparison operators ensure
1571 # that all comparisons below will evaluate to FALSE in this case.
1572 was_mem_exceeded = "LastJobStatus =?= 5 " \
1573 "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " \
1574 "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"
1576 # If job runs the first time or was held for reasons other than exceeding
1577 # the memory, set the required memory to the requested value or use
1578 # the memory value measured by HTCondor (MemoryUsage) depending on
1579 # whichever is greater.
1580 expr = f"({was_mem_exceeded}) " \
1581 f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) " \
1582 f": max({{{memory}, MemoryUsage ?: 0}})"
1583 return expr
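# For illustration, _create_request_memory_expr(2048, 2.0, 8192) (example
# values) produces the single-line expression (wrapped here):
#     (LastJobStatus =?= 5 && (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0
#     || LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34))
#     ? min({int(2048 * pow(2.0, NumJobStarts)), 8192})
#     : max({2048, MemoryUsage ?: 0})
# so a job never held for memory requests max(2048, measured MemoryUsage), and
# each memory-related retry doubles the request up to the 8192 MB cap.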
1586def _locate_schedds(locate_all=False):
1587 """Find out Scheduler daemons in an HTCondor pool.
1589 Parameters
1590 ----------
1591 locate_all : `bool`, optional
1592 If True, all available schedulers in the HTCondor pool will be located.
1593 False by default, which means that the search will be limited to looking
1594 for the Scheduler running on the local host.
1596 Returns
1597 -------
1598 schedds : `dict` [`str`, `htcondor.Schedd`]
1599 A mapping between Scheduler names and Python objects allowing for
1600 interacting with them.
1601 """
1602 coll = htcondor.Collector()
1604 schedd_ads = []
1605 if locate_all:
1606 schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
1607 else:
1608 schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
1609 return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}