Coverage for python/lsst/ctrl/bps/wms/htcondor/htcondor_service.py : 1%

# This file is part of ctrl_bps.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Interface between a generic workflow and the HTCondor workflow system.
"""

__all__ = ["HTCondorService", "HTCondorWorkflow"]
import dataclasses
import os
import re
import logging
from enum import IntEnum, auto
from pathlib import Path

import htcondor

from ... import (
    BaseWmsWorkflow,
    BaseWmsService,
    GenericWorkflow,
    GenericWorkflowJob,
    WmsRunReport,
    WmsJobReport,
    WmsStates
)
from ...bps_utils import (
    chdir,
    create_count_summary
)
from .lssthtc import (
    HTCDag,
    HTCJob,
    MISSING_ID,
    JobStatus,
    NodeStatus,
    htc_check_dagman_output,
    htc_escape,
    htc_submit_dag,
    read_dag_info,
    read_dag_log,
    read_dag_status,
    read_node_status,
    condor_q,
    condor_search,
    condor_status,
    pegasus_name_to_label,
    summary_from_dag,
)


class WmsIdType(IntEnum):
    """Type of valid WMS ids.
    """

    UNKNOWN = auto()
    """The type of id cannot be determined.
    """

    LOCAL = auto()
    """The id is an HTCondor job's ClusterId (with optional '.ProcId').
    """

    GLOBAL = auto()
    """The id is an HTCondor global job id.
    """

    PATH = auto()
    """The id is a submission path.
    """


DEFAULT_HTC_EXEC_PATT = ".*worker.*"
"""Default pattern for searching execute machines in an HTCondor pool.
"""

_LOG = logging.getLogger(__name__)


class HTCondorService(BaseWmsService):
    """HTCondor version of WMS service.
    """
    def prepare(self, config, generic_workflow, out_prefix=None):
        """Convert generic workflow to an HTCondor DAG ready for submission.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The generic workflow (e.g., has executable name and arguments).
        out_prefix : `str`
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        workflow : `lsst.ctrl.bps.wms.htcondor.HTCondorWorkflow`
            HTCondor workflow ready to be run.
        """
        _LOG.debug("out_prefix = '%s'", out_prefix)
        workflow = HTCondorWorkflow.from_generic_workflow(config, generic_workflow, out_prefix,
                                                          f"{self.__class__.__module__}."
                                                          f"{self.__class__.__name__}")
        workflow.write(out_prefix)
        return workflow

    def submit(self, workflow):
        """Submit a single HTCondor workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWorkflow`
            A single HTCondor workflow to submit. run_id is updated after
            successful submission to WMS.
        """
        # For workflow portability, internal paths are all relative. Hence
        # the DAG needs to be submitted to HTCondor from inside the submit
        # directory.
        with chdir(workflow.submit_path):
            _LOG.info("Submitting from directory: %s", os.getcwd())
            htc_submit_dag(workflow.dag, {})
            workflow.run_id = workflow.dag.run_id

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.
        is_global : `bool`, optional
            If set, all job queues (and their histories) will be queried for
            job information. Defaults to False which means that only the local
            job queue will be queried.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions. Typically
            this means top-level jobs (i.e., not children jobs).
        """
        _LOG.debug("list_submitted_jobs params: "
                   "wms_id=%s, user=%s, require_bps=%s, pass_thru=%s, is_global=%s",
                   wms_id, user, require_bps, pass_thru, is_global)

        # Determine which Schedds will be queried for job information.
        coll = htcondor.Collector()

        schedd_ads = []
        if is_global:
            schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
        else:
            schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))

        # Construct appropriate constraint expression using provided arguments.
        constraint = "False"
        if wms_id is None:
            if user is not None:
                constraint = f'(Owner == "{user}")'
        else:
            schedd_ad, cluster_id, id_type = _wms_id_to_cluster(wms_id)
            if cluster_id is not None:
                constraint = f"(DAGManJobId == {cluster_id} || ClusterId == {cluster_id})"

                # If provided id is either a submission path or a global id,
                # make sure the right Schedd will be queried regardless of
                # 'is_global' value.
                if id_type in {WmsIdType.GLOBAL, WmsIdType.PATH}:
                    schedd_ads = [schedd_ad]
        if require_bps:
            constraint += ' && (bps_isjob == "True")'
        if pass_thru:
            if "-forcex" in pass_thru:
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f" && ({pass_thru_2})"
            else:
                constraint += f" && ({pass_thru})"

        # Create a list of scheduler daemons which need to be queried.
        schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}

        _LOG.debug("constraint = %s, schedds = %s", constraint, ", ".join(schedds))
        results = condor_q(constraint=constraint, schedds=schedds)

        # Prune child jobs where DAG job is in queue (i.e., aren't orphans).
        job_ids = []
        for schedd_name, job_info in results.items():
            for job_id, job_ad in job_info.items():
                _LOG.debug("job_id=%s DAGManJobId=%s", job_id, job_ad.get("DAGManJobId", "None"))
                if "DAGManJobId" not in job_ad:
                    job_ids.append(job_ad.get("GlobalJobId", job_id))
                else:
                    _LOG.debug("Looking for %s", f"{job_ad['DAGManJobId']}.0")
                    _LOG.debug("\tin jobs.keys() = %s", job_info.keys())
                    if f"{job_ad['DAGManJobId']}.0" not in job_info:  # orphaned job
                        job_ids.append(job_ad.get("GlobalJobId", job_id))

        _LOG.debug("job_ids = %s", job_ids)
        return job_ids

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False):
        """Return run information based upon given constraints.

        Parameters
        ----------
        wms_workflow_id : `str`, optional
            Limit to specific run based on id.
        user : `str`, optional
            Limit results to runs for this user.
        hist : `float`, optional
            Limit history search to this many days. Defaults to 0.
        pass_thru : `str`, optional
            Constraints to pass through to HTCondor.
        is_global : `bool`, optional
            If set, all job queues (and their histories) will be queried for
            job information. Defaults to False which means that only the local
            job queue will be queried.

        Returns
        -------
        runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
            Information about runs from given job information.
        message : `str`
            Extra message for report command to print. This could be pointers
            to documentation or to WMS specific commands.
        """
        if wms_workflow_id:
            id_type = _wms_id_type(wms_workflow_id)
            if id_type == WmsIdType.LOCAL:
                schedulers = _locate_schedds(locate_all=is_global)
                run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers)
            elif id_type == WmsIdType.GLOBAL:
                schedulers = _locate_schedds(locate_all=True)
                run_reports, message = _report_from_id(wms_workflow_id, hist, schedds=schedulers)
            elif id_type == WmsIdType.PATH:
                run_reports, message = _report_from_path(wms_workflow_id)
            else:
                run_reports, message = {}, 'Invalid job id'
        else:
            schedulers = _locate_schedds(locate_all=is_global)
            run_reports, message = _summary_report(user, hist, pass_thru, schedds=schedulers)
        _LOG.debug("report: %s, %s", run_reports, message)

        return list(run_reports.values()), message

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            Id or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether the deletion was successful. Currently returns False if
            there is any doubt or if any individual job was not deleted.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        _LOG.debug("Canceling wms_id = %s", wms_id)

        schedd_ad, cluster_id, _ = _wms_id_to_cluster(wms_id)

        if cluster_id is None:
            deleted = False
            message = "invalid id"
        else:
            _LOG.debug("Canceling job managed by schedd_name = %s with cluster_id = %s",
                       schedd_ad["Name"], cluster_id)
            schedd = htcondor.Schedd(schedd_ad)

            constraint = f"ClusterId == {cluster_id}"
            if pass_thru is not None and "-forcex" in pass_thru:
                pass_thru_2 = pass_thru.replace("-forcex", "")
                if pass_thru_2 and not pass_thru_2.isspace():
                    constraint += f"&& ({pass_thru_2})"
                _LOG.debug("JobAction.RemoveX constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.RemoveX, constraint)
            else:
                if pass_thru:
                    constraint += f"&& ({pass_thru})"
                _LOG.debug("JobAction.Remove constraint = %s", constraint)
                results = schedd.act(htcondor.JobAction.Remove, constraint)
            _LOG.debug("Remove results: %s", results)

            if results["TotalSuccess"] > 0 and results["TotalError"] == 0:
                deleted = True
                message = ""
            else:
                deleted = False
                if results["TotalSuccess"] == 0 and results["TotalError"] == 0:
                    message = "no such bps job in batch queue"
                else:
                    message = f"unknown problems deleting: {results}"

        _LOG.debug("deleted: %s; message = %s", deleted, message)
        return deleted, message
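

# A minimal usage sketch of the service class above (illustrative only; it
# assumes an already populated ``BpsConfig`` and ``GenericWorkflow``, and the
# variable names are hypothetical):
#
#     service = HTCondorService(bps_config)
#     workflow = service.prepare(bps_config, generic_workflow, out_prefix="submit/my_run")
#     service.submit(workflow)
#     reports, message = service.report(wms_workflow_id=workflow.run_id)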


class HTCondorWorkflow(BaseWmsWorkflow):
    """Single HTCondor workflow.

    Parameters
    ----------
    name : `str`
        Unique name for Workflow used when naming files.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """
    def __init__(self, name, config=None):
        super().__init__(name, config)
        self.dag = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited
        htc_workflow = cls(generic_workflow.name, config)
        htc_workflow.dag = HTCDag(name=generic_workflow.name)

        _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
        htc_workflow.dag.add_attribs({"bps_wms_service": service_class,
                                      "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
                                      "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
                                      "bps_job_summary": create_count_summary(generic_workflow.job_counts)})

        # Determine the hard limit for the memory requirement.
        found, limit = config.search('memoryLimit')
        if not found:
            search_opts = {"default": DEFAULT_HTC_EXEC_PATT}
            _, site = config.search("computeSite")
            if site:
                search_opts["curvals"] = {"curr_site": site}
            _, patt = config.search("executeMachinesPattern", opt=search_opts)

            # To reduce the amount of data, ignore dynamic slots (if any) as,
            # by definition, they cannot have more memory than
            # the partitionable slot they are part of.
            constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)'
            pool_info = condor_status(constraint=constraint)
            try:
                limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values())
            except ValueError:
                _LOG.debug("No execute machine in the pool matches %s", patt)
        if limit:
            config[".bps_defined.memory_limit"] = limit

        # Create all DAG jobs
        for job_name in generic_workflow:
            gwjob = generic_workflow.get_job(job_name)
            htc_job = HTCondorWorkflow._create_job(config, generic_workflow, gwjob, out_prefix)
            htc_workflow.dag.add_job(htc_job)

        # Add job dependencies to the DAG
        for job_name in generic_workflow:
            htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))

        # If final job exists in generic workflow, create DAG final job
        final = generic_workflow.get_final()
        if final and isinstance(final, GenericWorkflowJob):
            final_htjob = HTCondorWorkflow._create_job(config, generic_workflow, final, out_prefix)
            if "post" not in final_htjob.dagcmds:
                final_htjob.dagcmds["post"] = f"{os.path.dirname(__file__)}/final_post.sh" \
                                              f" {final.name} $DAG_STATUS $RETURN"
            htc_workflow.dag.add_final_job(final_htjob)
        elif final and isinstance(final, GenericWorkflow):
            raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
        elif final:
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

        return htc_workflow

    @staticmethod
    def _create_job(config, generic_workflow, gwjob, out_prefix):
        """Convert GenericWorkflow job nodes to DAG jobs.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow that is being converted.
        gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
            The generic job to convert to an HTCondor job.
        out_prefix : `str`
            Directory prefix for HTCondor files.

        Returns
        -------
        htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
            The HTCondor job equivalent to the given generic job.
        """
        htc_job = HTCJob(gwjob.name, label=gwjob.label)

        curvals = dataclasses.asdict(gwjob)
        if gwjob.tags:
            curvals.update(gwjob.tags)
        found, subdir = config.search("subDirTemplate", opt={'curvals': curvals})
        if not found:
            subdir = "jobs"
        htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"

        htc_job_cmds = {
            "universe": "vanilla",
            "should_transfer_files": "YES",
            "when_to_transfer_output": "ON_EXIT_OR_EVICT",
            "transfer_output_files": '""',  # Set to empty string to disable
            "transfer_executable": "False",
            "getenv": "True",

            # Exceeding memory sometimes triggers a SIGBUS error. Tell HTCondor
            # to put SIGBUS jobs on hold.
            "on_exit_hold": "(ExitBySignal == true) && (ExitSignal == 7)",
            "on_exit_hold_reason": '"Job raised a signal 7. Usually means job has gone over memory limit."',
            "on_exit_hold_subcode": "34"
        }

        htc_job_cmds.update(_translate_job_cmds(config, generic_workflow, gwjob))

        # job stdout, stderr, htcondor user log.
        for key in ("output", "error", "log"):
            htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
            _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])

        _, use_shared = config.search("bpsUseShared", opt={"default": False})
        htc_job_cmds.update(_handle_job_inputs(generic_workflow, gwjob.name, use_shared, out_prefix))

        # Add the job cmds dict to the job object.
        htc_job.add_job_cmds(htc_job_cmds)

        htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))

        # Add job attributes to job.
        _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
        htc_job.add_job_attrs(gwjob.attrs)
        htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
        htc_job.add_job_attrs({"bps_job_name": gwjob.name,
                               "bps_job_label": gwjob.label})

        return htc_job

    def write(self, out_prefix):
        """Output HTCondor DAGMan files needed for workflow submission.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for HTCondor files.
        """
        self.submit_path = out_prefix
        os.makedirs(out_prefix, exist_ok=True)

        # Write down the workflow in HTCondor format.
        self.dag.write(out_prefix, "jobs/{self.label}")


def _translate_job_cmds(config, generic_workflow, gwjob):
    """Translate the job data that have a one-to-one mapping to HTCondor
    submit commands.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains the job being converted.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Generic workflow job to be converted.

    Returns
    -------
    htc_job_commands : `dict` [`str`, `Any`]
        Contains commands which can appear in the HTCondor submit description
        file.
    """
    # Values in the job script that are just name mappings.
    job_translation = {"mail_to": "notify_user",
                       "when_to_mail": "notification",
                       "request_cpus": "request_cpus",
                       "priority": "priority",
                       "category": "category"}

    jobcmds = {}
    for gwkey, htckey in job_translation.items():
        jobcmds[htckey] = getattr(gwjob, gwkey, None)

    # job commands that need modification
    if gwjob.number_of_retries:
        jobcmds["max_retries"] = f"{gwjob.number_of_retries}"

    if gwjob.retry_unless_exit:
        jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"

    if gwjob.request_disk:
        jobcmds["request_disk"] = f"{gwjob.request_disk}MB"

    if gwjob.request_memory:
        jobcmds["request_memory"] = f"{gwjob.request_memory}"

    if gwjob.memory_multiplier:
        # Do not use try-except! At the moment, BpsConfig returns an empty
        # string if it does not contain the key.
        memory_limit = config[".bps_defined.memory_limit"]
        if not memory_limit:
            raise RuntimeError("Memory autoscaling enabled, but automatic detection of the memory limit "
                               "failed; setting it explicitly with 'memoryLimit' or changing worker node "
                               "search pattern 'executeMachinesPattern' might help.")
        jobcmds["request_memory"] = _create_request_memory_expr(gwjob.request_memory, gwjob.memory_multiplier)

        # Periodically release jobs which are being held due to exceeding
        # memory. Stop doing that (by removing the job from the HTCondor queue)
        # after the maximal number of retries has been reached or the memory
        # requirements cannot be satisfied.
        jobcmds["periodic_release"] = \
            "NumJobStarts <= JobMaxRetries && (HoldReasonCode == 34 || HoldReasonSubCode == 34)"
        jobcmds["periodic_remove"] = \
            f"JobStatus == 1 && RequestMemory > {memory_limit} || " \
            f"JobStatus == 5 && NumJobStarts > JobMaxRetries"

    # Assume concurrency_limit implemented using HTCondor concurrency limits.
    # May need to move to special site-specific implementation if sites use
    # other mechanisms.
    if gwjob.concurrency_limit:
        jobcmds["concurrency_limit"] = gwjob.concurrency_limit

    # Handle command line
    if gwjob.executable.transfer_executable:
        jobcmds["transfer_executable"] = "True"
        jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
    else:
        jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)

    if gwjob.arguments:
        arguments = gwjob.arguments
        arguments = _replace_cmd_vars(arguments, gwjob)
        arguments = _replace_file_vars(config, arguments, generic_workflow, gwjob)
        arguments = _fix_env_var_syntax(arguments)
        jobcmds["arguments"] = arguments

    # Add extra "pass-thru" job commands
    if gwjob.profile:
        for key, val in gwjob.profile.items():
            jobcmds[key] = htc_escape(val)

    return jobcmds


def _translate_dag_cmds(gwjob):
    """Translate job values into DAGMan commands.

    Parameters
    ----------
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job containing values to be translated.

    Returns
    -------
    dagcmds : `dict` [`str`, `Any`]
        DAGMan commands for the job.
    """
    # Values in the dag script that are just name mappings.
    dag_translation = {"abort_on_value": "abort_dag_on",
                       "abort_return_value": "abort_exit"}

    dagcmds = {}
    for gwkey, htckey in dag_translation.items():
        dagcmds[htckey] = getattr(gwjob, gwkey, None)

    # Still to be coded: vars "pre_cmdline", "post_cmdline"
    return dagcmds


def _fix_env_var_syntax(oldstr):
    """Change ENV placeholders to HTCondor Env var syntax.

    Parameters
    ----------
    oldstr : `str`
        String in which environment variable syntax is to be fixed.

    Returns
    -------
    newstr : `str`
        Given string with environment variable syntax fixed.
    """
    newstr = oldstr
    for key in re.findall(r"<ENV:([^>]+)>", oldstr):
        newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
    return newstr
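
# For example (illustrative only), a command line such as
# "cd <ENV:HOME>/work" becomes "cd $ENV(HOME)/work", which HTCondor expands
# at job execution time.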


def _replace_file_vars(config, arguments, workflow, gwjob):
    """Replace file placeholders in command line arguments with correct
    physical file names.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information.
    arguments : `str`
        Arguments string in which to replace file placeholders.
    workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains file information.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        The job corresponding to the arguments.

    Returns
    -------
    arguments : `str`
        Given arguments string with file placeholders replaced.
    """
    _, use_shared = config.search("bpsUseShared", opt={"default": False})

    # Replace input file placeholders with paths.
    for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
        if not gwfile.wms_transfer:
            # Must assume full URI if in command line and told WMS is not
            # responsible for transferring file.
            uri = gwfile.src_uri
        elif use_shared:
            if gwfile.job_shared:
                # Have shared filesystems and jobs can share file.
                uri = gwfile.src_uri
            else:
                # Taking advantage of inside knowledge. Not future-proof.
                # Temporary fix until have job wrapper that pulls files
                # within job.
                if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml":
                    uri = "butler.yaml"
                else:
                    uri = os.path.basename(gwfile.src_uri)
        else:  # Using push transfer
            uri = os.path.basename(gwfile.src_uri)
        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)

    # Replace output file placeholders with paths.
    for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
        if not gwfile.wms_transfer:
            # Must assume full URI if in command line and told WMS is not
            # responsible for transferring file.
            uri = gwfile.src_uri
        elif use_shared:
            if gwfile.job_shared:
                # Have shared filesystems and jobs can share file.
                uri = gwfile.src_uri
            else:
                uri = os.path.basename(gwfile.src_uri)
        else:  # Using push transfer
            uri = os.path.basename(gwfile.src_uri)
        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
    return arguments


def _replace_cmd_vars(arguments, gwjob):
    """Replace format-style placeholders in arguments.

    Parameters
    ----------
    arguments : `str`
        Arguments string in which to replace placeholders.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job containing values to be used to replace placeholders
        (in particular gwjob.cmdvals).

    Returns
    -------
    arguments : `str`
        Given arguments string with placeholders replaced.
    """
    try:
        arguments = arguments.format(**gwjob.cmdvals)
    except (KeyError, TypeError):  # TypeError in case None instead of {}
        _LOG.error("Could not replace command variables:\n"
                   "arguments: %s\n"
                   "cmdvals: %s", arguments, gwjob.cmdvals)
        raise
    return arguments
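
# For example (illustrative only), with gwjob.cmdvals = {"qgraphFile": "a.qgraph"}
# an arguments string like "pipetask run -g {qgraphFile}" becomes
# "pipetask run -g a.qgraph".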


def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str):
    """Add job input files from generic workflow to job.

    Parameters
    ----------
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        The generic workflow (e.g., has executable name and arguments).
    job_name : `str`
        Unique name for the job.
    use_shared : `bool`
        Whether job has access to files via shared filesystem.
    out_prefix : `str`
        The root directory into which all WMS-specific files are written.

    Returns
    -------
    htc_commands : `dict` [`str`, `str`]
        HTCondor commands for the job submission script.
    """
    htc_commands = {}
    inputs = []
    for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
        _LOG.debug("src_uri=%s", gwf_file.src_uri)

        uri = Path(gwf_file.src_uri)

        # Note if use_shared and job_shared, don't need to transfer file.

        if not use_shared:  # Copy file using push to job
            inputs.append(str(uri.relative_to(out_prefix)))
        elif not gwf_file.job_shared:  # Jobs require own copy

            # If using shared filesystem, but still need copy in job. Use
            # HTCondor's curl plugin for a local copy.

            # Execution butler is represented as a directory which the
            # curl plugin does not handle. Taking advantage of inside
            # knowledge for temporary fix until have job wrapper that pulls
            # files within job.
            if gwf_file.name == "butlerConfig":
                # The execution butler directory doesn't normally exist until
                # the submit phase so checking for suffix instead of using
                # is_dir(). If other non-yaml files exist they would have a
                # different gwf_file.name.
                if uri.suffix == ".yaml":  # Single file, so just copy.
                    inputs.append(f"file://{uri}")
                else:
                    inputs.append(f"file://{uri / 'butler.yaml'}")
                    inputs.append(f"file://{uri / 'gen3.sqlite3'}")
            elif uri.is_dir():
                raise RuntimeError(f"HTCondor plugin cannot transfer directories locally within job "
                                   f"({gwf_file.src_uri})")
            else:
                inputs.append(f"file://{uri}")

    if inputs:
        htc_commands["transfer_input_files"] = ",".join(inputs)
        _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
    return htc_commands
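
# For example (illustrative only), when not using a shared filesystem the
# returned dict may look like
#     {"transfer_input_files": "inputs/a.qgraph,inputs/butler.yaml"}
# i.e. a comma-separated list of paths relative to the submit directory.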


def _report_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        The directory containing the submit side files (e.g., HTCondor files).

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
    if wms_workflow_id == MISSING_ID:
        run_reports = {}
    else:
        run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
    return run_reports, message


def _report_from_id(wms_workflow_id, hist, schedds=None):
    """Gather run information using workflow id.

    Parameters
    ----------
    wms_workflow_id : `str`
        Limit to specific run based on id.
    hist : `float`
        Limit history search to this many days.
    schedds : `dict` [`str`, `htcondor.Schedd`], optional
        HTCondor schedulers to query for job information. If None
        (default), all queries will be run against the local scheduler only.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the HTCondor id
        and the value is a collection of report information for that run.
    message : `str`
        Message to be printed with the summary report.
    """
    dag_constraint = 'regexp("dagman$", Cmd)'
    try:
        cluster_id = int(float(wms_workflow_id))
    except ValueError:
        dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"'
    else:
        dag_constraint += f" && ClusterId == {cluster_id}"

    # With the current implementation of the condor_* functions the query will
    # always return only one match per Scheduler.
    #
    # Even in the highly unlikely situation where HTCondor history (which
    # condor_search queries too) is long enough to have jobs from before the
    # cluster ids were rolled over (and as a result there is more than one job
    # with the same cluster id) they will not show up in the results.
    schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds)
    if len(schedd_dag_info) == 0:
        run_reports = {}
        message = ""
    elif len(schedd_dag_info) == 1:
        _, dag_info = schedd_dag_info.popitem()
        dag_id, dag_ad = dag_info.popitem()

        # Create a mapping between jobs and their classads. The keys will be
        # of format 'ClusterId.ProcId'.
        job_info = {dag_id: dag_ad}

        # Find jobs (nodes) belonging to that DAGMan job.
        job_constraint = f"DAGManJobId == {int(float(dag_id))}"
        schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds)
        _, node_info = schedd_job_info.popitem()
        job_info.update(node_info)

        # Collect additional pieces of information about jobs using HTCondor
        # files in the submission directory.
        _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"])
        _update_jobs(job_info, path_jobs)

        run_reports = _create_detailed_report_from_jobs(dag_id, job_info)
        message = ""
    else:
        ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()]
        run_reports = {}
        message = f"More than one job matches id '{wms_workflow_id}', " \
                  f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids"
    return run_reports, message


def _get_info_from_path(wms_path):
    """Gather run information from a given run directory.

    Parameters
    ----------
    wms_path : `str`
        Directory containing HTCondor files.

    Returns
    -------
    wms_workflow_id : `str`
        The run id which is a DAGman job id.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Information about jobs read from files in the given directory.
        The key is the HTCondor id and the value is a dictionary of HTCondor
        keys and values.
    message : `str`
        Message to be printed with the summary report.
    """
    messages = []
    try:
        wms_workflow_id, jobs = read_dag_log(wms_path)
        _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
        _update_jobs(jobs, read_node_status(wms_path))
        _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)

        # Add more info for DAGman job
        job = jobs[wms_workflow_id]
        job.update(read_dag_status(wms_path))

        job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
        if "bps_run" not in job:
            _add_run_info(wms_path, job)

        message = htc_check_dagman_output(wms_path)
        if message:
            messages.append(message)
        _LOG.debug("_get_info: id = %s, total_jobs = %s", wms_workflow_id,
                   jobs[wms_workflow_id]["total_jobs"])

        # Add extra pieces of information which cannot be found in HTCondor
        # generated files like 'GlobalJobId'.
        #
        # Do not treat absence of this file as a serious error. Neither runs
        # submitted with earlier versions of the plugin nor runs submitted
        # with the Pegasus plugin will have it at the moment. However, once
        # enough time passes and the Pegasus plugin has its own report()
        # method (instead of sneakily using HTCondor's one), the lack of that
        # file should be treated as seriously as the lack of any other file.
        try:
            job_info = read_dag_info(wms_path)
        except FileNotFoundError as exc:
            message = f"Warn: Some information may not be available: {exc}"
            messages.append(message)
        else:
            schedd_name = next(iter(job_info))
            job_ad = next(iter(job_info[schedd_name].values()))
            job.update(job_ad)
    except FileNotFoundError:
        message = f"Could not find HTCondor files in '{wms_path}'"
        _LOG.warning(message)
        messages.append(message)
        wms_workflow_id = MISSING_ID
        jobs = {}

    message = '\n'.join([msg for msg in messages if msg])
    return wms_workflow_id, jobs, message


def _create_detailed_report_from_jobs(wms_workflow_id, jobs):
    """Gather run information to be used in generating summary reports.

    Parameters
    ----------
    wms_workflow_id : `str`
        The run id to create the report for.
    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
        Mapping HTCondor job id to job information.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the detailed report. The key is the given HTCondor
        id and the value is a collection of report information for that run.
    """
    _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
    dag_job = jobs[wms_workflow_id]
    report = WmsRunReport(wms_id=f"{dag_job['ClusterId']}.{dag_job['ProcId']}",
                          global_wms_id=dag_job.get("GlobalJobId", "MISS"),
                          path=dag_job["Iwd"],
                          label=dag_job.get("bps_job_label", "MISS"),
                          run=dag_job.get("bps_run", "MISS"),
                          project=dag_job.get("bps_project", "MISS"),
                          campaign=dag_job.get("bps_campaign", "MISS"),
                          payload=dag_job.get("bps_payload", "MISS"),
                          operator=_get_owner(dag_job),
                          run_summary=_get_run_summary(dag_job),
                          state=_htc_status_to_wms_state(dag_job),
                          jobs=[],
                          total_number_jobs=dag_job["total_jobs"],
                          job_state_counts=dag_job["state_counts"])

    for job_id, job_info in jobs.items():
        try:
            if job_info["ClusterId"] != int(float(wms_workflow_id)):
                job_report = WmsJobReport(wms_id=job_id,
                                          name=job_info.get("DAGNodeName", job_id),
                                          label=job_info.get("bps_job_label",
                                                             pegasus_name_to_label(job_info["DAGNodeName"])),
                                          state=_htc_status_to_wms_state(job_info))
                if job_report.label == "init":
                    job_report.label = "pipetaskInit"
                report.jobs.append(job_report)
        except KeyError as ex:
            _LOG.error("Job missing key '%s': %s", str(ex), job_info)
            raise

    run_reports = {report.wms_id: report}
    _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
    return run_reports


def _summary_report(user, hist, pass_thru, schedds=None):
    """Gather run information to be used in generating summary reports.

    Parameters
    ----------
    user : `str`
        Run lookup restricted to given user.
    hist : `float`
        How many previous days to search for run information.
    pass_thru : `str`
        Advanced users can define the HTCondor constraint to be used
        when searching queue and history.
    schedds : `dict` [`str`, `htcondor.Schedd`], optional
        HTCondor schedulers to query for job information. If None (default),
        only the local scheduler will be queried.

    Returns
    -------
    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
        Run information for the summary report. The keys are HTCondor ids and
        the values are collections of report information for each run.
    message : `str`
        Message to be printed with the summary report.
    """
    # Only doing a summary report, so only look for DAGMan jobs.
    if pass_thru:
        constraint = pass_thru
    else:
        # Notes:
        # * bps_isjob == 'True' isn't getting set for DAG jobs that are
        #   manually restarted.
        # * Any job with DAGManJobID isn't a DAG job.
        constraint = 'bps_isjob == "True" && JobUniverse == 7'
        if user:
            constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'

    job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds)

    # Have list of DAGMan jobs, need to get run_report info.
    run_reports = {}
    for jobs in job_info.values():
        for job_id, job in jobs.items():
            total_jobs, state_counts = _get_state_counts_from_dag_job(job)
            # If didn't get from queue information (e.g., Kerberos bug),
            # try reading from file.
            if total_jobs == 0:
                try:
                    job.update(read_dag_status(job["Iwd"]))
                    total_jobs, state_counts = _get_state_counts_from_dag_job(job)
                except StopIteration:
                    pass  # Don't kill the report if the HTCondor files can't be found.

            if "bps_run" not in job:
                _add_run_info(job["Iwd"], job)
            report = WmsRunReport(wms_id=job_id,
                                  global_wms_id=job["GlobalJobId"],
                                  path=job["Iwd"],
                                  label=job.get("bps_job_label", "MISS"),
                                  run=job.get("bps_run", "MISS"),
                                  project=job.get("bps_project", "MISS"),
                                  campaign=job.get("bps_campaign", "MISS"),
                                  payload=job.get("bps_payload", "MISS"),
                                  operator=_get_owner(job),
                                  run_summary=_get_run_summary(job),
                                  state=_htc_status_to_wms_state(job),
                                  jobs=[],
                                  total_number_jobs=total_jobs,
                                  job_state_counts=state_counts)
            run_reports[report.global_wms_id] = report

    return run_reports, ""


def _add_run_info(wms_path, job):
    """Find BPS run information elsewhere for runs without bps attributes.

    Parameters
    ----------
    wms_path : `str`
        Path to submit files for the run.
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Raises
    ------
    StopIteration
        If the file it is looking for cannot be found. Permission errors are
        caught and the job's run is marked with an error.
    """
    path = Path(wms_path) / "jobs"
    try:
        subfile = next(path.glob("**/*.sub"))
    except (StopIteration, PermissionError):
        job["bps_run"] = "Unavailable"
    else:
        _LOG.debug("_add_run_info: subfile = %s", subfile)
        try:
            with open(subfile, "r") as fh:
                for line in fh:
                    if line.startswith("+bps_"):
                        m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
                        if m:
                            _LOG.debug("Matching line: %s", line)
                            job[m.group(1)] = m.group(2).replace('"', "")
                        else:
                            _LOG.debug("Could not parse attribute: %s", line)
        except PermissionError:
            job["bps_run"] = "PermissionError"
    _LOG.debug("After adding job = %s", job)


def _get_owner(job):
    """Get the owner of a dag job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    owner : `str`
        Owner of the dag job.
    """
    owner = job.get("bps_operator", None)
    if not owner:
        owner = job.get("Owner", None)
        if not owner:
            _LOG.warning("Could not get Owner from htcondor job: %s", job)
            owner = "MISS"
    return owner


def _get_run_summary(job):
    """Get the run summary for a job.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    summary : `str`
        Number of jobs per PipelineTask label in approximate pipeline order.
        Format: <label>:<count>[;<label>:<count>]+
    """
    summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
    if not summary:
        summary, _ = summary_from_dag(job["Iwd"])
        if not summary:
            _LOG.warning("Could not get run summary for htcondor job: %s", job)
    _LOG.debug("_get_run_summary: summary=%s", summary)

    # Workaround for sometimes using init vs pipetaskInit.
    summary = summary.replace("init:", "pipetaskInit:")

    if "pegasus_version" in job and "pegasus" not in summary:
        summary += ";pegasus:0"

    return summary


def _get_state_counts_from_jobs(wms_workflow_id, jobs):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    wms_workflow_id : `str`
        HTCondor job id.
    jobs : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    state_counts = dict.fromkeys(WmsStates, 0)

    for jid, jinfo in jobs.items():
        if jid != wms_workflow_id:
            state_counts[_htc_status_to_wms_state(jinfo)] += 1

    total_counted = sum(state_counts.values())
    if "NodesTotal" in jobs[wms_workflow_id]:
        total_count = jobs[wms_workflow_id]["NodesTotal"]
    else:
        total_count = total_counted

    state_counts[WmsStates.UNREADY] += total_count - total_counted

    return total_count, state_counts


def _get_state_counts_from_dag_job(job):
    """Count number of jobs per WMS state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor dag job information.

    Returns
    -------
    total_count : `int`
        Total number of dag nodes.
    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
        Keys are the different WMS states and values are counts of jobs
        that are in that WMS state.
    """
    _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
    state_counts = dict.fromkeys(WmsStates, 0)
    if "DAG_NodesReady" in job:
        state_counts = {
            WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
            WmsStates.READY: job.get("DAG_NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
            WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
            WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0)}
        total_jobs = job.get("DAG_NodesTotal")
        _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
    elif "NodesFailed" in job:
        state_counts = {
            WmsStates.UNREADY: job.get("NodesUnready", 0),
            WmsStates.READY: job.get("NodesReady", 0),
            WmsStates.HELD: job.get("JobProcsHeld", 0),
            WmsStates.SUCCEEDED: job.get("NodesDone", 0),
            WmsStates.FAILED: job.get("NodesFailed", 0),
            WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0)}
        try:
            total_jobs = job.get("NodesTotal")
        except KeyError as ex:
            _LOG.error("Job missing %s. job = %s", str(ex), job)
            raise
        _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
    else:
        # With Kerberos job auth and the Kerberos bug, a warning here would be
        # printed for every DAG, so log at debug level instead.
        _LOG.debug("Can't get job state counts %s", job["Iwd"])
        total_jobs = 0

    _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
    return total_jobs, state_counts


def _htc_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given job's status.
    """
    wms_state = WmsStates.MISFIT
    if "JobStatus" in job:
        wms_state = _htc_job_status_to_wms_state(job)
    elif "NodeStatus" in job:
        wms_state = _htc_node_status_to_wms_state(job)
    return wms_state


def _htc_job_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given job's status.
    """
    _LOG.debug("htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"],
               type(job["JobStatus"]))
    job_status = int(job["JobStatus"])
    wms_state = WmsStates.MISFIT

    _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
    if job_status == JobStatus.IDLE:
        wms_state = WmsStates.PENDING
    elif job_status == JobStatus.RUNNING:
        wms_state = WmsStates.RUNNING
    elif job_status == JobStatus.REMOVED:
        wms_state = WmsStates.DELETED
    elif job_status == JobStatus.COMPLETED:
        if job.get("ExitBySignal", False) or job.get("ExitCode", 0) or \
                job.get("ExitSignal", 0) or job.get("DAG_Status", 0) or \
                job.get("ReturnValue", 0):
            wms_state = WmsStates.FAILED
        else:
            wms_state = WmsStates.SUCCEEDED
    elif job_status == JobStatus.HELD:
        wms_state = WmsStates.HELD

    return wms_state


def _htc_node_status_to_wms_state(job):
    """Convert HTCondor node status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given node's status.
    """
    wms_state = WmsStates.MISFIT

    status = job["NodeStatus"]
    if status == NodeStatus.NOT_READY:
        wms_state = WmsStates.UNREADY
    elif status == NodeStatus.READY:
        wms_state = WmsStates.READY
    elif status == NodeStatus.PRERUN:
        wms_state = WmsStates.MISFIT
    elif status == NodeStatus.SUBMITTED:
        if job["JobProcsHeld"]:
            wms_state = WmsStates.HELD
        elif job["StatusDetails"] == "not_idle":
            wms_state = WmsStates.RUNNING
        elif job["JobProcsQueued"]:
            wms_state = WmsStates.PENDING
    elif status == NodeStatus.POSTRUN:
        wms_state = WmsStates.MISFIT
    elif status == NodeStatus.DONE:
        wms_state = WmsStates.SUCCEEDED
    elif status == NodeStatus.ERROR:
        # Use the job's exit status instead of the post script's exit status.
        if "DAGMAN error 0" in job["StatusDetails"]:
            wms_state = WmsStates.SUCCEEDED
        else:
            wms_state = WmsStates.FAILED

    return wms_state


def _update_jobs(jobs1, jobs2):
    """Update jobs1 with info in jobs2.

    (Basically an update for nested dictionaries.)

    Parameters
    ----------
    jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
        HTCondor job information to be updated.
    jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
        Additional HTCondor job information.
    """
    for jid, jinfo in jobs2.items():
        if jid in jobs1:
            jobs1[jid].update(jinfo)
        else:
            jobs1[jid] = jinfo
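
# For example (illustrative only):
#     jobs1 = {"1234.0": {"JobStatus": 2}}
#     _update_jobs(jobs1, {"1234.0": {"Owner": "me"}, "1235.0": {"JobStatus": 1}})
#     # jobs1 == {"1234.0": {"JobStatus": 2, "Owner": "me"}, "1235.0": {"JobStatus": 1}}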


def _wms_id_type(wms_id):
    """Determine the type of the WMS id.

    Parameters
    ----------
    wms_id : `str`
        WMS id identifying a job.

    Returns
    -------
    id_type : `lsst.ctrl.bps.wms.htcondor.WmsIdType`
        Type of WMS id.
    """
    try:
        int(float(wms_id))
    except ValueError:
        wms_path = Path(wms_id)
        if wms_path.exists():
            id_type = WmsIdType.PATH
        else:
            id_type = WmsIdType.GLOBAL
    except TypeError:
        id_type = WmsIdType.UNKNOWN
    else:
        id_type = WmsIdType.LOCAL
    return id_type
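
# For example (illustrative only): "1234" and "1234.0" map to WmsIdType.LOCAL,
# a global id such as "sched1.example.com#1234.0#1600000000" maps to
# WmsIdType.GLOBAL, an existing submit directory path maps to WmsIdType.PATH,
# and None maps to WmsIdType.UNKNOWN.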


def _wms_id_to_cluster(wms_id):
    """Convert WMS id to cluster id.

    Parameters
    ----------
    wms_id : `int` or `float` or `str`
        HTCondor job id or path.

    Returns
    -------
    schedd_ad : `classad.ClassAd`
        ClassAd describing the scheduler managing the job with the given id.
    cluster_id : `int`
        HTCondor cluster id.
    id_type : `lsst.ctrl.bps.wms.htcondor.WmsIdType`
        The type of the provided id.
    """
    coll = htcondor.Collector()

    schedd_ad = None
    cluster_id = None
    id_type = _wms_id_type(wms_id)
    if id_type == WmsIdType.LOCAL:
        schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
        cluster_id = int(float(wms_id))
    elif id_type == WmsIdType.GLOBAL:
        constraint = f'GlobalJobId == "{wms_id}"'
        schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)}
        schedds = [htcondor.Schedd(ad) for ad in schedd_ads.values()]
        queries = [schedd.xquery(requirements=constraint, projection=["ClusterId"]) for schedd in schedds]
        results = {query.tag(): dict(ads[0]) for query in htcondor.poll(queries)
                   if (ads := query.nextAdsNonBlocking())}
        if results:
            schedd_name = next(iter(results))
            schedd_ad = schedd_ads[schedd_name]
            cluster_id = results[schedd_name]["ClusterId"]
    elif id_type == WmsIdType.PATH:
        try:
            job_info = read_dag_info(wms_id)
        except (FileNotFoundError, PermissionError, IOError):
            pass
        else:
            schedd_name = next(iter(job_info))
            job_id = next(iter(job_info[schedd_name]))
            schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name)
            cluster_id = int(float(job_id))
    else:
        pass
    return schedd_ad, cluster_id, id_type


def _create_request_memory_expr(memory, multiplier):
    """Construct an HTCondor ClassAd expression for safe memory scaling.

    Parameters
    ----------
    memory : `int`
        Requested memory in MB.
    multiplier : `float`
        Memory growth rate between retries.

    Returns
    -------
    ad : `str`
        A string representing an HTCondor ClassAd expression enabling safe
        memory scaling between job retries.
    """
    # ClassAds 'Last*' are UNDEFINED when a job is put in the job queue.
    # The special comparison operators ensure that all comparisons below will
    # evaluate to FALSE in this case.
    was_mem_exceeded = "LastJobStatus =?= 5 " \
                       "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " \
                       "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"

    # If job runs the first time or was held for reasons other than exceeding
    # the memory, set the required memory to the requested value or use
    # the memory value measured by HTCondor (MemoryUsage) depending on
    # whichever is greater.
    ad = f"({was_mem_exceeded}) " \
         f"? int({memory} * pow({multiplier}, NumJobStarts)) " \
         f": max({{{memory}, MemoryUsage ?: 0}})"
    return ad
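
# For example (illustrative only), _create_request_memory_expr(2048, 2.0)
# yields a ClassAd expression of the form
#     (<was_mem_exceeded>) ? int(2048 * pow(2.0, NumJobStarts))
#                          : max({2048, MemoryUsage ?: 0})
# so the memory request grows by the multiplier on each retry after a
# memory-related hold.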


def _locate_schedds(locate_all=False):
    """Locate Scheduler daemons in an HTCondor pool.

    Parameters
    ----------
    locate_all : `bool`, optional
        If True, all available schedulers in the HTCondor pool will be
        located. False by default, which means that the search will be
        limited to the Scheduler running on the local host.

    Returns
    -------
    schedds : `dict` [`str`, `htcondor.Schedd`]
        A mapping between Scheduler names and Python objects allowing for
        interacting with them.
    """
    coll = htcondor.Collector()

    schedd_ads = []
    if locate_all:
        schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
    else:
        schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
    return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}