1# This file is part of ctrl_bps_htcondor.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <https://www.gnu.org/licenses/>.
28"""Placeholder HTCondor DAGMan API.
30There is new work on a python DAGMan API from HTCondor. However, at this
31time, it tries to make things easier by assuming the DAG is easily broken into
32levels where there are 1-to-1 or all-to-all relationships to nodes in the next
33level. LSST workflows are more complicated.
34"""
36__all__ = [
37 "DagStatus",
38 "JobStatus",
39 "NodeStatus",
40 "RestrictedDict",
41 "HTCJob",
42 "HTCDag",
43 "htc_backup_files",
44 "htc_check_dagman_output",
45 "htc_create_submit_from_cmd",
46 "htc_create_submit_from_dag",
47 "htc_create_submit_from_file",
48 "htc_escape",
49 "htc_write_attribs",
50 "htc_write_condor_file",
51 "htc_version",
52 "htc_submit_dag",
53 "condor_history",
54 "condor_q",
55 "condor_search",
56 "condor_status",
57 "update_job_info",
58 "MISSING_ID",
59 "summary_from_dag",
60 "read_dag_info",
61 "read_dag_log",
62 "read_dag_nodes_log",
63 "read_dag_status",
64 "read_node_status",
65 "write_dag_info",
66 "pegasus_name_to_label",
67]
70import itertools
71import json
72import logging
73import os
74import pprint
75import re
76import subprocess
77from collections import defaultdict
78from collections.abc import MutableMapping
79from datetime import datetime, timedelta
80from enum import IntEnum
81from pathlib import Path
83import classad
84import htcondor
85import networkx
87_LOG = logging.getLogger(__name__)
89MISSING_ID = -99999
92class DagStatus(IntEnum):
93 """HTCondor DAGMan's statuses for a DAG."""
95 OK = 0
96 ERROR = 1 # an error condition different than those listed here
97 FAILED = 2 # one or more nodes in the DAG have failed
98 ABORTED = 3 # the DAG has been aborted by an ABORT-DAG-ON specification
99 REMOVED = 4 # the DAG has been removed by condor_rm
100 CYCLE = 5 # a cycle was found in the DAG
101 SUSPENDED = 6 # the DAG has been suspended (see section 2.10.8)
104class JobStatus(IntEnum):
105 """HTCondor's statuses for jobs."""
107 UNEXPANDED = 0 # Unexpanded
108 IDLE = 1 # Idle
109 RUNNING = 2 # Running
110 REMOVED = 3 # Removed
111 COMPLETED = 4 # Completed
112 HELD = 5 # Held
113 TRANSFERRING_OUTPUT = 6 # Transferring_Output
114 SUSPENDED = 7 # Suspended
117class NodeStatus(IntEnum):
118 """HTCondor's statuses for DAGman nodes."""
120 # (STATUS_NOT_READY): At least one parent has not yet finished or the node
121 # is a FINAL node.
122 NOT_READY = 0
124 # (STATUS_READY): All parents have finished, but the node is not yet
125 # running.
126 READY = 1
128 # (STATUS_PRERUN): The node’s PRE script is running.
129 PRERUN = 2
131 # (STATUS_SUBMITTED): The node’s HTCondor job(s) are in the queue.
132 # StatusDetails = "not_idle" -> running.
133 # JobProcsHeld = 1-> hold.
134 # JobProcsQueued = 1 -> idle.
135 SUBMITTED = 3
137 # (STATUS_POSTRUN): The node’s POST script is running.
138 POSTRUN = 4
140 # (STATUS_DONE): The node has completed successfully.
141 DONE = 5
143 # (STATUS_ERROR): The node has failed. StatusDetails has info (e.g.,
144 # ULOG_JOB_ABORTED for deleted job).
145 ERROR = 6
148HTC_QUOTE_KEYS = {"environment"}
149HTC_VALID_JOB_KEYS = {
150 "universe",
151 "executable",
152 "arguments",
153 "environment",
154 "log",
155 "error",
156 "output",
157 "should_transfer_files",
158 "when_to_transfer_output",
159 "getenv",
160 "notification",
161 "notify_user",
162 "concurrency_limit",
163 "transfer_executable",
164 "transfer_input_files",
165 "transfer_output_files",
166 "request_cpus",
167 "request_memory",
168 "request_disk",
169 "priority",
170 "category",
171 "requirements",
172 "on_exit_hold",
173 "on_exit_hold_reason",
174 "on_exit_hold_subcode",
175 "max_retries",
176 "periodic_release",
177 "periodic_remove",
178 "accounting_group",
179 "accounting_group_user",
180}
181HTC_VALID_JOB_DAG_KEYS = {"vars", "pre", "post", "retry", "retry_unless_exit", "abort_dag_on", "abort_exit"}
184class RestrictedDict(MutableMapping):
185 """A dictionary that only allows certain keys.
187 Parameters
188 ----------
189 valid_keys : `Container`
190 Strings that are valid keys.
191 init_data : `dict` or `RestrictedDict`, optional
192 Initial data.
194 Raises
195 ------
196 KeyError
197 If invalid key(s) in init_data.
198 """
200 def __init__(self, valid_keys, init_data=()):
201 self.valid_keys = valid_keys
202 self.data = {}
203 self.update(init_data)
205 def __getitem__(self, key):
206 """Return value for given key if exists.
208 Parameters
209 ----------
210 key : `str`
211 Identifier for value to return.
213 Returns
214 -------
215 value : `~collections.abc.Any`
216 Value associated with given key.
218 Raises
219 ------
220 KeyError
221 If key doesn't exist.
222 """
223 return self.data[key]
225 def __delitem__(self, key):
226 """Delete value for given key if exists.
228 Parameters
229 ----------
230 key : `str`
231 Identifier for value to delete.
233 Raises
234 ------
235 KeyError
236 If key doesn't exist.
237 """
238 del self.data[key]
240 def __setitem__(self, key, value):
241 """Store key,value in internal dict only if key is valid.
243 Parameters
244 ----------
245 key : `str`
246 Identifier to associate with given value.
247 value : `~collections.abc.Any`
248 Value to store.
250 Raises
251 ------
252 KeyError
253 If key is invalid.
254 """
255 if key not in self.valid_keys:
256 raise KeyError(f"Invalid key {key}")
257 self.data[key] = value
259 def __iter__(self):
260 return self.data.__iter__()
262 def __len__(self):
263 return len(self.data)
265 def __str__(self):
266 return str(self.data)
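
# A minimal usage sketch for RestrictedDict (illustrative only; the names and
# values below are made up and this helper is not called anywhere in the
# module): only keys present in the valid-key container are accepted, any
# other key raises KeyError.
def _example_restricted_dict():
    cmds = RestrictedDict(HTC_VALID_JOB_KEYS, {"universe": "vanilla"})
    cmds["request_memory"] = "2048"
    try:
        cmds["not_a_submit_key"] = "oops"
    except KeyError:
        pass  # Expected: the key is not in HTC_VALID_JOB_KEYS.
    return dict(cmds)
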
269def htc_backup_files(wms_path, subdir=None, limit=100):
270 """Backup select HTCondor files in the submit directory.
272 Files will be saved in separate subdirectories which will be created in
273 the submit directory where the files are located. These subdirectories
274 will be consecutive, zero-padded integers. Their values will correspond to
275 the number of HTCondor rescue DAGs in the submit directory.
277 Hence, with the default settings, copies after the initial failed run will
278 be placed in '001' subdirectory, '002' after the first restart, and so on
279 until the limit of backups is reached. If there's no rescue DAG yet, files
280 will be copied to '000' subdirectory.
282 Parameters
283 ----------
284 wms_path : `str` or `pathlib.Path`
285 Path to the submit directory either absolute or relative.
286 subdir : `str` or `pathlib.Path`, optional
287 A path, relative to the submit directory, where all subdirectories with
288 backup files will be kept. Defaults to None which means that the backup
289 subdirectories will be placed directly in the submit directory.
290 limit : `int`, optional
291 Maximal number of backups. If the number of backups reaches the limit,
292 the last backup files will be overwritten. The default value is 100
293 to match the default value of HTCondor's DAGMAN_MAX_RESCUE_NUM in
294 version 8.8+.
296 Raises
297 ------
298 FileNotFoundError
299 If the submit directory or the file that needs to be backed up does not
300 exist.
301 OSError
302 If the submit directory cannot be accessed or backing up a file failed
303 either due to permission or filesystem related issues.
305 Notes
306 -----
307 This is not a generic function for making backups. It is intended to be
308 used once, just before a restart, to make snapshots of files which will be
309 overwritten by HTCondor during the next run.
310 """
311 width = len(str(limit))
313 path = Path(wms_path).resolve()
314 if not path.is_dir():
315 raise FileNotFoundError(f"Directory {path} not found")
317 # Initialize the backup counter.
318 rescue_dags = list(Path(wms_path).glob("*.rescue*"))
319 counter = min(len(rescue_dags), limit)
321 # Create the backup directory and move select files there.
322 dest = Path(wms_path)
323 if subdir:
324 # PurePath.is_relative_to() is not available before Python 3.9. Hence
325 # we need to check if 'subdir' is in the submit directory in some other
326 # way if it is an absolute path.
327 subdir = Path(subdir)
328 if subdir.is_absolute():
329 if dest not in subdir.parents:
330 _LOG.warning(
331 "Invalid backup location: '%s' not in the submit directory, will use '%s' instead.",
332 subdir,
333 wms_path,
334 )
335 else:
336 dest /= subdir
337 else:
338 dest /= subdir
339 dest /= f"{counter:0{width}}"
340 try:
341 dest.mkdir(parents=True, exist_ok=False if counter < limit else True)
342 except FileExistsError:
343 _LOG.warning("Refusing to do backups: target directory '%s' already exists", dest)
344 else:
345 for patt in ["*.info.*", "*.dag.metrics", "*.dag.nodes.log", "*.node_status"]:
346 for source in path.glob(patt):
347 if source.is_file():
348 target = dest / source.relative_to(path)
349 try:
350 source.rename(target)
351 except OSError as exc:
352 raise type(exc)(f"Backing up '{source}' failed: {exc.strerror}") from None
353 else:
354 raise FileNotFoundError(f"Backing up '{source}' failed: not a file")
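
# Illustration of the backup subdirectory naming scheme described in the
# docstring above (a hypothetical helper, not used by htc_backup_files): the
# counter is the number of rescue DAGs found, capped at `limit`, zero-padded
# to the width of `limit`.
def _example_backup_dir_name(num_rescue_dags, limit=100):
    width = len(str(limit))
    counter = min(num_rescue_dags, limit)
    return f"{counter:0{width}}"  # e.g., 2 rescue DAGs -> '002'
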
357def htc_escape(value):
358 """Escape characters in given value based upon HTCondor syntax.
360 Parameters
361 ----------
362 value : `~collections.abc.Any`
363 Value that needs to have characters escaped if string.
365 Returns
366 -------
367 new_value : `~collections.abc.Any`
368 Given value with characters escaped appropriate for HTCondor if string.
369 """
370 if isinstance(value, str):
371 newval = value.replace('"', '""').replace("'", "''").replace("&quot;", '"')
372 else:
373 newval = value
375 return newval
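
# A few concrete examples of the escaping rules above, written as a small
# self-check that is never called by the module (illustrative only).
def _example_htc_escape():
    assert htc_escape('say "hi"') == 'say ""hi""'
    assert htc_escape("it's") == "it''s"
    assert htc_escape("a&quot;b") == 'a"b'
    assert htc_escape(42) == 42  # Non-strings pass through unchanged.
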
378def htc_write_attribs(stream, attrs):
379 """Write job attributes in HTCondor format to writeable stream.
381 Parameters
382 ----------
383 stream : `~io.TextIOBase`
384 Output text stream (typically an open file).
385 attrs : `dict`
386 HTCondor job attributes (dictionary of attribute key, value).
387 """
388 for key, value in attrs.items():
389 # Make sure strings are syntactically correct for HTCondor.
390 if isinstance(value, str):
391 pval = f'"{htc_escape(value)}"'
392 else:
393 pval = value
395 print(f"+{key} = {pval}", file=stream)
398def htc_write_condor_file(filename, job_name, job, job_attrs):
399 """Write an HTCondor submit file.
401 Parameters
402 ----------
403 filename : `str`
404 Filename for the HTCondor submit file.
405 job_name : `str`
406 Job name to use in submit file.
407 job : `RestrictedDict`
408 Submit script information.
409 job_attrs : `dict`
410 Job attributes.
411 """
412 os.makedirs(os.path.dirname(filename), exist_ok=True)
413 with open(filename, "w") as fh:
414 for key, value in job.items():
415 if value is not None:
416 if key in HTC_QUOTE_KEYS:
417 print(f'{key}="{htc_escape(value)}"', file=fh)
418 else:
419 print(f"{key}={value}", file=fh)
420 for key in ["output", "error", "log"]:
421 if key not in job:
422 filename = f"{job_name}.$(Cluster).${key[:3]}"
423 print(f"{key}={filename}", file=fh)
425 if job_attrs is not None:
426 htc_write_attribs(fh, job_attrs)
427 print("queue", file=fh)
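
# Sketch of how the two helpers above fit together when writing a submit file
# for a hypothetical job (paths, job name, and attribute values are made up;
# this helper is illustrative and not called by the module).
def _example_write_condor_file(tmpdir="/tmp/htc_example"):
    cmds = RestrictedDict(HTC_VALID_JOB_KEYS, {"universe": "vanilla", "executable": "/bin/echo"})
    attrs = {"bps_job_label": "example"}
    htc_write_condor_file(os.path.join(tmpdir, "example.sub"), "example", cmds, attrs)
    # The resulting file contains lines such as:
    #   universe=vanilla
    #   executable=/bin/echo
    #   output=example.$(Cluster).$out
    #   +bps_job_label = "example"
    #   queue
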
430def htc_version():
431 """Return the version given by the HTCondor API.
433 Returns
434 -------
435 version : `str`
436 HTCondor version as easily comparable string.
438 Raises
439 ------
440 RuntimeError
441 Raised if fail to parse htcondor API string.
442 """
443 # Example string returned by htcondor.version:
444 # $CondorVersion: 8.8.6 Nov 13 2019 BuildID: 489199 PackageID: 8.8.6-1 $
445 version_info = re.match(r"\$CondorVersion: (\d+).(\d+).(\d+)", htcondor.version())
446 if version_info is None:
447 raise RuntimeError("Problems parsing condor version")
448 return f"{int(version_info.group(1))}.{int(version_info.group(2))}.{int(version_info.group(3))}"
451def htc_submit_dag(sub):
452 """Submit job for execution.
454 Parameters
455 ----------
456 sub : `htcondor.Submit`
457 An object representing a job submit description.
459 Returns
460 -------
461 schedd_dag_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
462 Information about the just submitted DAGMan job where, for the
463 Scheduler used, the local HTCondor job id is mapped to its
464 classad.
465 """
466 coll = htcondor.Collector()
467 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
468 schedd = htcondor.Schedd(schedd_ad)
470 jobs_ads = []
471 with schedd.transaction() as txn:
472 sub.queue(txn, ad_results=jobs_ads)
474 # Submit.queue() above will raise RuntimeError if submission fails, so
475 # 'jobs_ads' should contain the ad at this point.
476 dag_ad = jobs_ads[0]
478 # Sadly, the ClassAd from Submit.queue() (see above) does not have
479 # 'GlobalJobId' so we need to run a regular query to get it anyway.
480 schedd_name = schedd_ad["Name"]
481 schedd_dag_info = condor_q(
482 constraint=f"ClusterId == {dag_ad['ClusterId']}", schedds={schedd_name: schedd}
483 )
484 return schedd_dag_info
487def htc_create_submit_from_dag(dag_filename, submit_options=None):
488 """Create a DAGMan job submit description.
490 Parameters
491 ----------
492 dag_filename : `str`
493 Name of file containing HTCondor DAG commands.
494 submit_options : `dict` [`str`, Any], optional
495 Contains extra options for the command line (a value of None means a flag).
497 Returns
498 -------
499 sub : `htcondor.Submit`
500 An object representing a job submit description.
502 Notes
503 -----
504 Use with HTCondor versions which support htcondor.Submit.from_dag(),
505 i.e., 8.9.3 or newer.
506 """
507 return htcondor.Submit.from_dag(dag_filename, submit_options)
510def htc_create_submit_from_cmd(dag_filename, submit_options=None):
511 """Create a DAGMan job submit description.
513 Create a DAGMan job submit description by calling ``condor_submit_dag``
514 on given DAG description file.
516 Parameters
517 ----------
518 dag_filename : `str`
519 Name of file containing HTCondor DAG commands.
520 submit_options : `dict` [`str`, Any], optional
521 Contains extra options for the command line (a value of None means a flag).
523 Returns
524 -------
525 sub : `htcondor.Submit`
526 An object representing a job submit description.
528 Notes
529 -----
530 Use with HTCondor versions which do not support htcondor.Submit.from_dag(),
531 i.e., older than 8.9.3.
532 """
533 # Run command line condor_submit_dag command.
534 cmd = "condor_submit_dag -f -no_submit -notification never -autorescue 1 -UseDagDir -no_recurse "
536 if submit_options is not None:
537 for opt, val in submit_options.items():
538 cmd += f" -{opt} {val or ''}"
539 cmd += f" {dag_filename}"
541 process = subprocess.Popen(
542 cmd.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding="utf-8"
543 )
544 process.wait()
546 if process.returncode != 0:
547 print(f"Exit code: {process.returncode}")
548 print(process.communicate()[0])
549 raise RuntimeError("Problems running condor_submit_dag")
551 return htc_create_submit_from_file(f"{dag_filename}.condor.sub")
554def htc_create_submit_from_file(submit_file):
555 """Parse a submission file.
557 Parameters
558 ----------
559 submit_file : `str`
560 Name of the HTCondor submit file.
562 Returns
563 -------
564 sub : `htcondor.Submit`
565 An object representing a job submit description.
566 """
567 descriptors = {}
568 with open(submit_file) as fh:
569 for line in fh:
570 line = line.strip()
571 if not line.startswith("#") and not line == "queue":
572 (key, val) = re.split(r"\s*=\s*", line, 1)
573 descriptors[key] = val
575 # Avoid UserWarning: the line 'copy_to_spool = False' was
576 # unused by Submit object. Is it a typo?
577 try:
578 del descriptors["copy_to_spool"]
579 except KeyError:
580 pass
582 return htcondor.Submit(descriptors)
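
# Sketch of how a caller might choose between the helpers above based on the
# HTCondor version, mirroring the Notes in their docstrings (from_dag()
# requires 8.9.3 or newer).  A hypothetical convenience wrapper, not part of
# the module's public API.
def _example_create_submit(dag_filename, submit_options=None):
    submit_options = submit_options or {}
    major, minor, patch = (int(part) for part in htc_version().split("."))
    if (major, minor, patch) >= (8, 9, 3):
        return htc_create_submit_from_dag(dag_filename, submit_options)
    return htc_create_submit_from_cmd(dag_filename, submit_options)
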
585def _htc_write_job_commands(stream, name, jobs):
586 """Output the DAGMan job lines for single job in DAG.
588 Parameters
589 ----------
590 stream : `~io.TextIOBase`
591 Writeable text stream (typically an opened file).
592 name : `str`
593 Job name.
594 jobs : `RestrictedDict`
595 DAG job keys and values.
596 """
597 if "pre" in jobs:
598 print(
599 f"SCRIPT {jobs['pre'].get('defer', '')} PRE {name} "
600 f"{jobs['pre']['executable']} {jobs['pre'].get('arguments', '')}",
601 file=stream,
602 )
604 if "post" in jobs:
605 print(
606 f"SCRIPT {jobs['post'].get('defer', '')} POST {name} "
607 f"{jobs['post']['executable']} {jobs['post'].get('arguments', '')}",
608 file=stream,
609 )
611 if "vars" in jobs:
612 for key, value in jobs["vars"]:
613 print(f'VARS {name} {key}="{htc_escape(value)}"', file=stream)
615 if "pre_skip" in jobs:
616 print(f"PRE_SKIP {name} {jobs['pre_skip']}", file=stream)
618 if "retry" in jobs and jobs["retry"]:
619 print(f"RETRY {name} {jobs['retry']} ", end="", file=stream)
620 if "retry_unless_exit" in jobs:
621 print(f"UNLESS-EXIT {jobs['retry_unless_exit']}", end="", file=stream)
622 print("\n", file=stream)
624 if "abort_dag_on" in jobs and jobs["abort_dag_on"]:
625 print(
626 f"ABORT-DAG-ON {name} {jobs['abort_dag_on']['node_exit']}"
627 f" RETURN {jobs['abort_dag_on']['abort_exit']}",
628 file=stream,
629 )
632class HTCJob:
633 """HTCondor job for use in building DAG.
635 Parameters
636 ----------
637 name : `str`
638 Name of the job
639 label : `str`
640 Label that can used for grouping or lookup.
641 initcmds : `RestrictedDict`
642 Initial job commands for submit file.
643 initdagcmds : `RestrictedDict`
644 Initial commands for job inside DAG.
645 initattrs : `dict`
646 Initial dictionary of job attributes.
647 """
649 def __init__(self, name, label=None, initcmds=(), initdagcmds=(), initattrs=None):
650 self.name = name
651 self.label = label
652 self.cmds = RestrictedDict(HTC_VALID_JOB_KEYS, initcmds)
653 self.dagcmds = RestrictedDict(HTC_VALID_JOB_DAG_KEYS, initdagcmds)
654 self.attrs = initattrs
655 self.subfile = None
657 def __str__(self):
658 return self.name
660 def add_job_cmds(self, new_commands):
661 """Add commands to Job (overwrite existing).
663 Parameters
664 ----------
665 new_commands : `dict`
666 Submit file commands to be added to Job.
667 """
668 self.cmds.update(new_commands)
670 def add_dag_cmds(self, new_commands):
671 """Add DAG commands to Job (overwrite existing).
673 Parameters
674 ----------
675 new_commands : `dict`
676 DAG file commands to be added to Job
677 """
678 self.dagcmds.update(new_commands)
680 def add_job_attrs(self, new_attrs):
681 """Add attributes to Job (overwrite existing).
683 Parameters
684 ----------
685 new_attrs : `dict`
686 Attributes to be added to Job
687 """
688 if self.attrs is None:
689 self.attrs = {}
690 if new_attrs:
691 self.attrs.update(new_attrs)
693 def write_submit_file(self, submit_path, job_subdir=""):
694 """Write job description to submit file.
696 Parameters
697 ----------
698 submit_path : `str`
699 Prefix path for the submit file.
700 job_subdir : `str`, optional
701 Template for job subdir.
702 """
703 if not self.subfile:
704 self.subfile = f"{self.name}.sub"
705 job_subdir = job_subdir.format(self=self)
706 if job_subdir:
707 self.subfile = os.path.join(job_subdir, self.subfile)
708 htc_write_condor_file(os.path.join(submit_path, self.subfile), self.name, self.cmds, self.attrs)
710 def write_dag_commands(self, stream):
711 """Write DAG commands for single job to output stream.
713 Parameters
714 ----------
715 stream : `IO` or `str`
716 Output Stream
717 """
718 print(f"JOB {self.name} {self.subfile}", file=stream)
719 _htc_write_job_commands(stream, self.name, self.dagcmds)
721 def dump(self, fh):
722 """Dump job information to output stream.
724 Parameters
725 ----------
726 fh : `~io.TextIOBase`
727 Output stream
728 """
729 printer = pprint.PrettyPrinter(indent=4, stream=fh)
730 printer.pprint(self.name)
731 printer.pprint(self.cmds)
732 printer.pprint(self.attrs)
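
# Minimal sketch of building a single HTCJob: submit-file commands go in
# `cmds`, DAG-level commands in `dagcmds`, and ClassAd attributes in `attrs`.
# All names and values are made up for illustration; this helper is not used
# by the module.
def _example_htc_job():
    job = HTCJob("calib_0001", label="calib")
    job.add_job_cmds({"universe": "vanilla", "executable": "/usr/bin/true", "request_memory": "2048"})
    job.add_dag_cmds({"retry": 3})
    job.add_job_attrs({"bps_job_label": "calib"})
    return job
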
735class HTCDag(networkx.DiGraph):
736 """HTCondor DAG.
738 Parameters
739 ----------
740 data : networkx.DiGraph.data
741 Initial graph.
742 name : `str`
743 Name for DAG.
744 """
746 def __init__(self, data=None, name=""):
747 super().__init__(data=data, name=name)
749 self.graph["attr"] = {}
750 self.graph["run_id"] = None
751 self.graph["submit_path"] = None
752 self.graph["final_job"] = None
754 def __str__(self):
755 """Represent basic DAG info as string.
757 Returns
758 -------
759 info : `str`
760 String containing basic DAG info.
761 """
762 return f"{self.graph['name']} {len(self)}"
764 def add_attribs(self, attribs=None):
765 """Add attributes to the DAG.
767 Parameters
768 ----------
769 attribs : `dict`
770 DAG attributes
771 """
772 if attribs is not None:
773 self.graph["attr"].update(attribs)
775 def add_job(self, job, parent_names=None, child_names=None):
776 """Add an HTCJob to the HTCDag.
778 Parameters
779 ----------
780 job : `HTCJob`
781 HTCJob to add to the HTCDag
782 parent_names : `~collections.abc.Iterable` [`str`], optional
783 Names of parent jobs
784 child_names : `~collections.abc.Iterable` [`str`], optional
785 Names of child jobs
786 """
787 assert isinstance(job, HTCJob)
789 # Add dag level attributes to each job
790 job.add_job_attrs(self.graph["attr"])
792 self.add_node(job.name, data=job)
794 if parent_names is not None:
795 self.add_job_relationships(parent_names, [job.name])
797 if child_names is not None:
798 self.add_job_relationships([job.name], child_names)
800 def add_job_relationships(self, parents, children):
801 """Add DAG edge between parents and children jobs.
803 Parameters
804 ----------
805 parents : `list` [`str`]
806 Contains parent job name(s).
807 children : `list` [`str`]
808 Contains children job name(s).
809 """
810 self.add_edges_from(itertools.product(parents, children))
812 def add_final_job(self, job):
813 """Add an HTCJob for the FINAL job in HTCDag.
815 Parameters
816 ----------
817 job : `HTCJob`
818 HTCJob to add to the HTCDag as a FINAL job.
819 """
820 # Add dag level attributes to each job
821 job.add_job_attrs(self.graph["attr"])
823 self.graph["final_job"] = job
825 def del_job(self, job_name):
826 """Delete the job from the DAG.
828 Parameters
829 ----------
830 job_name : `str`
831 Name of job in DAG to delete
832 """
833 # Reconnect edges around node to delete
834 parents = self.predecessors(job_name)
835 children = self.successors(job_name)
836 self.add_edges_from(itertools.product(parents, children))
838 # Delete job node (which deletes its edges).
839 self.remove_node(job_name)
841 def write(self, submit_path, job_subdir=""):
842 """Write DAG to a file.
844 Parameters
845 ----------
846 submit_path : `str`
847 Prefix path for dag filename to be combined with DAG name.
848 job_subdir : `str`, optional
849 Template for job subdir.
850 """
851 self.graph["submit_path"] = submit_path
852 self.graph["dag_filename"] = os.path.join(submit_path, f"{self.graph['name']}.dag")
853 os.makedirs(submit_path, exist_ok=True)
854 with open(self.graph["dag_filename"], "w") as fh:
855 for _, nodeval in self.nodes().items():
856 job = nodeval["data"]
857 job.write_submit_file(submit_path, job_subdir)
858 job.write_dag_commands(fh)
859 for edge in self.edges():
860 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh)
861 print(f"DOT {self.name}.dot", file=fh)
862 print(f"NODE_STATUS_FILE {self.name}.node_status", file=fh)
864 # Add bps attributes to dag submission
865 for key, value in self.graph["attr"].items():
866 print(f'SET_JOB_ATTR {key}= "{htc_escape(value)}"', file=fh)
868 if self.graph["final_job"]:
869 job = self.graph["final_job"]
870 job.write_submit_file(submit_path, job_subdir)
871 print(f"FINAL {job.name} {job.subfile}", file=fh)
872 if "pre" in job.dagcmds:
873 print(f"SCRIPT PRE {job.name} {job.dagcmds['pre']}", file=fh)
874 if "post" in job.dagcmds:
875 print(f"SCRIPT POST {job.name} {job.dagcmds['post']}", file=fh)
877 def dump(self, fh):
878 """Dump DAG info to output stream.
880 Parameters
881 ----------
882 fh : `io.IO` or `str`
883 Where to dump DAG info as text.
884 """
885 for key, value in self.graph.items():
886 print(f"{key}={value}", file=fh)
887 for name, data in self.nodes().items():
888 print(f"{name}:", file=fh)
889 data["data"].dump(fh)
890 for edge in self.edges():
891 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh)
892 if self.graph["final_job"]:
893 print(f'FINAL {self.graph["final_job"].name}:', file=fh)
894 self.graph["final_job"].dump(fh)
896 def write_dot(self, filename):
897 """Write a dot version of the DAG.
899 Parameters
900 ----------
901 filename : `str`
902 dot filename
903 """
904 pos = networkx.nx_agraph.graphviz_layout(self)
905 networkx.draw(self, pos=pos)
906 networkx.drawing.nx_pydot.write_dot(self, filename)
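
# Sketch of assembling a tiny two-job DAG with the classes above and writing
# the DAG file plus per-job submit files under a submit directory.  The path,
# job names, and attribute values are made up for illustration; the real DAG
# construction lives in the BPS plugin code.
def _example_htc_dag(submit_path="/tmp/htc_example_dag"):
    dag = HTCDag(name="example")
    dag.add_attribs({"bps_run": "example_run"})
    parent = HTCJob("jobA", label="stepA", initcmds={"universe": "vanilla", "executable": "/bin/true"})
    child = HTCJob("jobB", label="stepB", initcmds={"universe": "vanilla", "executable": "/bin/true"})
    dag.add_job(parent)
    dag.add_job(child)
    dag.add_job_relationships([parent.name], [child.name])
    dag.write(submit_path)
    return dag
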
909def condor_q(constraint=None, schedds=None):
910 """Query HTCondor for current jobs.
912 Parameters
913 ----------
914 constraint : `str`, optional
915 Constraints to be passed to job query.
916 schedds : `dict` [`str`, `htcondor.Schedd`], optional
917 HTCondor schedulers which to query for job information. If None
918 (default), the query will be run against local scheduler only.
920 Returns
921 -------
922 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
923 Information about jobs satisfying the search criteria where for each
924 Scheduler, local HTCondor job ids are mapped to their respective
925 classads.
926 """
927 if not schedds:
928 coll = htcondor.Collector()
929 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
930 schedds = {schedd_ad["Name"]: htcondor.Schedd(schedd_ad)}
932 queries = [schedd.xquery(requirements=constraint) for schedd in schedds.values()]
934 job_info = {}
935 for query in htcondor.poll(queries):
936 schedd_name = query.tag()
937 job_info.setdefault(schedd_name, {})
938 for job_ad in query.nextAdsNonBlocking():
939 del job_ad["Environment"]
940 del job_ad["Env"]
941 id_ = f"{int(job_ad['ClusterId'])}.{int(job_ad['ProcId'])}"
942 job_info[schedd_name][id_] = dict(job_ad)
943 _LOG.debug("condor_q returned %d jobs", sum(len(val) for val in job_info.values()))
945 # When returning the results filter out entries for schedulers with no jobs
946 # matching the search criteria.
947 return {key: val for key, val in job_info.items() if val}
950def condor_history(constraint=None, schedds=None):
951 """Get information about completed jobs from HTCondor history.
953 Parameters
954 ----------
955 constraint : `str`, optional
956 Constraints to be passed to job query.
957 schedds : `dict` [`str`, `htcondor.Schedd`], optional
958 HTCondor schedulers which to query for job information. If None
959 (default), the query will be run against the history file of
960 the local scheduler only.
962 Returns
963 -------
964 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
965 Information about jobs satisfying the search criteria where for each
966 Scheduler, local HTCondor job ids are mapped to their respective
967 classads.
968 """
969 if not schedds:
970 coll = htcondor.Collector()
971 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
972 schedds = {schedd_ad["Name"]: htcondor.Schedd(schedd_ad)}
974 job_info = {}
975 for schedd_name, schedd in schedds.items():
976 job_info[schedd_name] = {}
977 for job_ad in schedd.history(requirements=constraint, projection=[]):
978 del job_ad["Environment"]
979 del job_ad["Env"]
980 id_ = f"{int(job_ad['ClusterId'])}.{int(job_ad['ProcId'])}"
981 job_info[schedd_name][id_] = dict(job_ad)
982 _LOG.debug("condor_history returned %d jobs", sum(len(val) for val in job_info.values()))
984 # When returning the results filter out entries for schedulers with no jobs
985 # matching the search criteria.
986 return {key: val for key, val in job_info.items() if val}
989def condor_search(constraint=None, hist=None, schedds=None):
990 """Search for running and finished jobs satisfying given criteria.
992 Parameters
993 ----------
994 constraint : `str`, optional
995 Constraints to be passed to job query.
996 hist : `float`, optional
997 Limit history search to this many days.
998 schedds : `dict` [`str`, `htcondor.Schedd`], optional
999 The list of the HTCondor schedulers which to query for job information.
1000 If None (default), only the local scheduler will be queried.
1002 Returns
1003 -------
1004 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
1005 Information about jobs satisfying the search criteria where for each
1006 Scheduler, local HTCondor job ids are mapped to their respective
1007 classads.
1008 """
1009 if not schedds:
1010 coll = htcondor.Collector()
1011 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
1012 schedds = {schedd_ad["Name"]: htcondor.Schedd(schedd_ad)}
1014 job_info = condor_q(constraint=constraint, schedds=schedds)
1015 if hist is not None:
1016 epoch = (datetime.now() - timedelta(days=hist)).timestamp()
1017 constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
1018 hist_info = condor_history(constraint, schedds=schedds)
1019 update_job_info(job_info, hist_info)
1020 return job_info
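
# Sketch of searching for the jobs of a specific run with the helpers above.
# The 'bps_run' attribute and the constraint are illustrative, and a live
# HTCondor pool (collector plus schedd) is needed for this to actually run.
def _example_condor_search(run_name="example_run"):
    constraint = f'bps_run == "{run_name}"'
    job_info = condor_search(constraint=constraint, hist=7)
    for schedd_name, jobs in job_info.items():
        for job_id, job_ad in jobs.items():
            _LOG.info("%s %s %s", schedd_name, job_id, job_ad.get("JobStatus"))
    return job_info
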
1023def condor_status(constraint=None, coll=None):
1024 """Get information about HTCondor pool.
1026 Parameters
1027 ----------
1028 constraint : `str`, optional
1029 Constraints to be passed to the query.
1030 coll : `htcondor.Collector`, optional
1031 Object representing HTCondor collector daemon.
1033 Returns
1034 -------
1035 pool_info : `dict` [`str`, `dict` [`str`, Any]]
1036 Mapping between HTCondor slot names and slot information (classAds).
1037 """
1038 if coll is None:
1039 coll = htcondor.Collector()
1040 try:
1041 pool_ads = coll.query(constraint=constraint)
1042 except OSError as ex:
1043 raise RuntimeError(f"Problem querying the Collector. (Constraint='{constraint}')") from ex
1045 pool_info = {}
1046 for slot in pool_ads:
1047 pool_info[slot["name"]] = dict(slot)
1048 _LOG.debug("condor_status returned %d ads", len(pool_info))
1049 return pool_info
1052def update_job_info(job_info, other_info):
1053 """Update results of a job query with results from another query.
1055 Parameters
1056 ----------
1057 job_info : `dict` [`str`, `dict` [`str`, Any]]
1058 Results of the job query that needs to be updated.
1059 other_info : `dict` [`str`, `dict` [`str`, Any]]
1060 Results of the other job query.
1062 Returns
1063 -------
1064 job_info : `dict` [`str`, `dict` [`str`, Any]]
1065 The updated results.
1066 """
1067 for schedd_name, others in other_info.items():
1068 try:
1069 jobs = job_info[schedd_name]
1070 except KeyError:
1071 job_info[schedd_name] = others
1072 else:
1073 for id_, ad in others.items():
1074 jobs.setdefault(id_, {}).update(ad)
1075 return job_info
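
# Small, self-contained illustration of the merge semantics above: per
# scheduler and per job id the classads are updated in place, with values
# from the second query winning on key collisions (made-up ids and ads).
def _example_update_job_info():
    queue_info = {"sched1": {"100.0": {"JobStatus": JobStatus.RUNNING}}}
    history_info = {
        "sched1": {"100.0": {"JobStatus": JobStatus.COMPLETED, "ExitCode": 0}},
        "sched2": {"200.0": {"JobStatus": JobStatus.COMPLETED}},
    }
    merged = update_job_info(queue_info, history_info)
    assert merged["sched1"]["100.0"]["JobStatus"] == JobStatus.COMPLETED
    assert "sched2" in merged
    return merged
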
1078def summary_from_dag(dir_name):
1079 """Build bps_run_summary string from dag file.
1081 Parameters
1082 ----------
1083 dir_name : `str`
1084 Path that includes dag file for a run.
1086 Returns
1087 -------
1088 summary : `str`
1089 Semi-colon separated list of job labels and counts.
1090 (Same format as saved in dag classad.)
1091 job_name_to_pipetask : `dict` [`str`, `str`]
1092 Mapping of job names to job labels
1093 """
1094 dag = next(Path(dir_name).glob("*.dag"))
1096 # Later code depends upon insertion order
1097 counts = defaultdict(int)
1098 job_name_to_pipetask = {}
1099 try:
1100 with open(dag) as fh:
1101 for line in fh:
1102 if line.startswith("JOB"):
1103 m = re.match(r"JOB ([^\s]+) jobs/([^/]+)/", line)
1104 if m:
1105 label = m.group(2)
1106 if label == "init":
1107 label = "pipetaskInit"
1108 job_name_to_pipetask[m.group(1)] = label
1109 counts[label] += 1
1110 else: # Check if Pegasus submission
1111 m = re.match(r"JOB ([^\s]+) ([^\s]+)", line)
1112 if m:
1113 label = pegasus_name_to_label(m.group(1))
1114 job_name_to_pipetask[m.group(1)] = label
1115 counts[label] += 1
1116 else:
1117 _LOG.warning("Parse DAG: unmatched job line: %s", line)
1118 elif line.startswith("FINAL"):
1119 m = re.match(r"FINAL ([^\s]+) jobs/([^/]+)/", line)
1120 if m:
1121 label = m.group(2)
1122 job_name_to_pipetask[m.group(1)] = label
1123 counts[label] += 1
1125 except (OSError, PermissionError, StopIteration):
1126 pass
1128 summary = ";".join([f"{name}:{counts[name]}" for name in counts])
1129 _LOG.debug("summary_from_dag: %s %s", summary, job_name_to_pipetask)
1130 return summary, job_name_to_pipetask
1133def pegasus_name_to_label(name):
1134 """Convert pegasus job name to a label for the report.
1136 Parameters
1137 ----------
1138 name : `str`
1139 Name of job.
1141 Returns
1142 -------
1143 label : `str`
1144 Label for job.
1145 """
1146 label = "UNK"
1147 if name.startswith("create_dir") or name.startswith("stage_in") or name.startswith("stage_out"):
1148 label = "pegasus"
1149 else:
1150 m = re.match(r"pipetask_(\d+_)?([^_]+)", name)
1151 if m:
1152 label = m.group(2)
1153 if label == "init":
1154 label = "pipetaskInit"
1156 return label
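
# A few concrete examples of the name-to-label mapping above, written as a
# small self-check with made-up job names (illustrative only, never called).
def _example_pegasus_name_to_label():
    assert pegasus_name_to_label("stage_in_remote_local_0_1") == "pegasus"
    assert pegasus_name_to_label("pipetask_3_calibrate") == "calibrate"
    assert pegasus_name_to_label("pipetask_init") == "pipetaskInit"
    assert pegasus_name_to_label("merge_outputs") == "UNK"
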
1159def read_dag_status(wms_path):
1160 """Read the node status file for DAG summary information.
1162 Parameters
1163 ----------
1164 wms_path : `str`
1165 Path that includes node status file for a run.
1167 Returns
1168 -------
1169 dag_ad : `dict` [`str`, Any]
1170 DAG summary information.
1171 """
1172 dag_ad = {}
1174 # While this is probably more up to date than the dag classad, only read
1175 # from the file if needed.
1176 try:
1177 try:
1178 node_stat_file = next(Path(wms_path).glob("*.node_status"))
1179 _LOG.debug("Reading Node Status File %s", node_stat_file)
1180 with open(node_stat_file) as infh:
1181 dag_ad = classad.parseNext(infh) # pylint: disable=E1101
1182 except StopIteration:
1183 pass
1185 if not dag_ad:
1186 # Pegasus check here
1187 try:
1188 metrics_file = next(Path(wms_path).glob("*.dag.metrics"))
1189 with open(metrics_file) as infh:
1190 metrics = json.load(infh)
1191 dag_ad["NodesTotal"] = metrics.get("jobs", 0)
1192 dag_ad["NodesFailed"] = metrics.get("jobs_failed", 0)
1193 dag_ad["NodesDone"] = metrics.get("jobs_succeeded", 0)
1194 dag_ad["pegasus_version"] = metrics.get("planner_version", "")
1195 except StopIteration:
1196 try:
1197 metrics_file = next(Path(wms_path).glob("*.metrics"))
1198 with open(metrics_file) as infh:
1199 metrics = json.load(infh)
1200 dag_ad["NodesTotal"] = metrics["wf_metrics"]["total_jobs"]
1201 dag_ad["pegasus_version"] = metrics.get("version", "")
1202 except StopIteration:
1203 pass
1204 except (OSError, PermissionError):
1205 pass
1207 _LOG.debug("read_dag_status: %s", dag_ad)
1208 return dict(dag_ad)
1211def read_node_status(wms_path):
1212 """Read entire node status file.
1214 Parameters
1215 ----------
1216 wms_path : `str`
1217 Path that includes node status file for a run.
1219 Returns
1220 -------
1221 jobs : `dict` [`str`, Any]
1222 DAG summary information.
1223 """
1224 # Get jobid info from other places to fill in gaps in info from node_status
1225 _, job_name_to_pipetask = summary_from_dag(wms_path)
1226 wms_workflow_id, loginfo = read_dag_log(wms_path)
1227 loginfo = read_dag_nodes_log(wms_path)
1228 _LOG.debug("loginfo = %s", loginfo)
1229 job_name_to_id = {}
1230 for jid, jinfo in loginfo.items():
1231 if "LogNotes" in jinfo:
1232 m = re.match(r"DAG Node: ([^\s]+)", jinfo["LogNotes"])
1233 if m:
1234 job_name_to_id[m.group(1)] = jid
1235 jinfo["DAGNodeName"] = m.group(1)
1237 try:
1238 node_status = next(Path(wms_path).glob("*.node_status"))
1239 except StopIteration:
1240 return loginfo
1242 jobs = {}
1243 fake_id = -1.0 # For nodes that do not yet have a job id, give fake one
1244 try:
1245 with open(node_status) as fh:
1246 ads = classad.parseAds(fh)
1248 for jclassad in ads:
1249 if jclassad["Type"] == "DagStatus":
1250 # skip DAG summary
1251 pass
1252 elif "Node" not in jclassad:
1253 if jclassad["Type"] != "StatusEnd":
1254 _LOG.debug("Key 'Node' not in classad: %s", jclassad)
1255 break
1256 else:
1257 if jclassad["Node"] in job_name_to_pipetask:
1258 try:
1259 label = job_name_to_pipetask[jclassad["Node"]]
1260 except KeyError:
1261 _LOG.error("%s not in %s", jclassad["Node"], job_name_to_pipetask.keys())
1262 raise
1263 elif "_" in jclassad["Node"]:
1264 label = jclassad["Node"].split("_")[1]
1265 else:
1266 label = jclassad["Node"]
1268 # Make the job info look as if it came from condor_q
1269 if jclassad["Node"] in job_name_to_id:
1270 job_id = job_name_to_id[jclassad["Node"]]
1271 else:
1272 job_id = str(fake_id)
1273 fake_id -= 1
1275 job = dict(jclassad)
1276 job["ClusterId"] = int(float(job_id))
1277 job["DAGManJobID"] = wms_workflow_id
1278 job["DAGNodeName"] = jclassad["Node"]
1279 job["bps_job_label"] = label
1281 jobs[str(job_id)] = job
1282 except (OSError, PermissionError):
1283 pass
1285 return jobs
1288def read_dag_log(wms_path):
1289 """Read job information from the DAGMan log file.
1291 Parameters
1292 ----------
1293 wms_path : `str`
1294 Path containing the DAGMan log file.
1296 Returns
1297 -------
1298 wms_workflow_id : `str`
1299 HTCondor job id (i.e., <ClusterId>.<ProcId>) of the DAGMan job.
1300 dag_info : `dict` [`str`, `~collections.abc.Any`]
1301 HTCondor job information read from the log file mapped to HTCondor
1302 job id.
1304 Raises
1305 ------
1306 FileNotFoundError
1307 If cannot find DAGMan log in given wms_path.
1308 """
1309 wms_workflow_id = 0
1310 dag_info = {}
1312 path = Path(wms_path)
1313 if path.exists():
1314 try:
1315 filename = next(path.glob("*.dag.dagman.log"))
1316 except StopIteration as exc:
1317 raise FileNotFoundError(f"DAGMan log not found in {wms_path}") from exc
1318 _LOG.debug("dag node log filename: %s", filename)
1320 info = {}
1321 job_event_log = htcondor.JobEventLog(str(filename))
1322 for event in job_event_log.events(stop_after=0):
1323 id_ = f"{event['Cluster']}.{event['Proc']}"
1324 if id_ not in info:
1325 info[id_] = {}
1326 wms_workflow_id = id_ # taking last job id in case of restarts
1327 info[id_].update(event)
1328 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"]
1330 # only save latest DAG job
1331 dag_info = {wms_workflow_id: info[wms_workflow_id]}
1332 for job in dag_info.values():
1333 _tweak_log_info(filename, job)
1335 return wms_workflow_id, dag_info
1338def read_dag_nodes_log(wms_path):
1339 """Read job information from the DAGMan nodes log file.
1341 Parameters
1342 ----------
1343 wms_path : `str`
1344 Path containing the DAGMan nodes log file.
1346 Returns
1347 -------
1348 info : `dict` [`str`, Any]
1349 HTCondor job information read from the log file mapped to HTCondor
1350 job id.
1352 Raises
1353 ------
1354 FileNotFoundError
1355 If cannot find DAGMan node log in given wms_path.
1356 """
1357 try:
1358 filename = next(Path(wms_path).glob("*.dag.nodes.log"))
1359 except StopIteration as exc:
1360 raise FileNotFoundError(f"DAGMan node log not found in {wms_path}") from exc
1361 _LOG.debug("dag node log filename: %s", filename)
1363 info = {}
1364 job_event_log = htcondor.JobEventLog(str(filename))
1365 for event in job_event_log.events(stop_after=0):
1366 id_ = f"{event['Cluster']}.{event['Proc']}"
1367 if id_ not in info:
1368 info[id_] = {}
1369 info[id_].update(event)
1370 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"]
1372 # Add more condor_q-like info to info parsed from log file.
1373 for job in info.values():
1374 _tweak_log_info(filename, job)
1376 return info
1379def read_dag_info(wms_path):
1380 """Read custom DAGMan job information from the file.
1382 Parameters
1383 ----------
1384 wms_path : `str`
1385 Path containing the file with the DAGMan job info.
1387 Returns
1388 -------
1389 dag_info : `dict` [`str`, `dict` [`str`, Any]]
1390 HTCondor job information.
1392 Raises
1393 ------
1394 FileNotFoundError
1395 If cannot find DAGMan job info file in the given location.
1396 """
1397 try:
1398 filename = next(Path(wms_path).glob("*.info.json"))
1399 except StopIteration as exc:
1400 raise FileNotFoundError(f"File with DAGMan job information not found in {wms_path}") from exc
1401 _LOG.debug("DAGMan job information filename: %s", filename)
1402 try:
1403 with open(filename) as fh:
1404 dag_info = json.load(fh)
1405 except (OSError, PermissionError) as exc:
1406 _LOG.debug("Retrieving DAGMan job information failed: %s", exc)
1407 dag_info = {}
1408 return dag_info
1411def write_dag_info(filename, dag_info):
1412 """Write custom job information about DAGMan job.
1414 Parameters
1415 ----------
1416 filename : `str`
1417 Name of the file where the information will be stored.
1418 dag_info : `dict` [`str` `dict` [`str`, Any]]
1419 Information about the DAGMan job.
1420 """
1421 schedd_name = next(iter(dag_info))
1422 dag_id = next(iter(dag_info[schedd_name]))
1423 dag_ad = dag_info[schedd_name][dag_id]
1424 try:
1425 with open(filename, "w") as fh:
1426 info = {
1427 schedd_name: {
1428 dag_id: {"ClusterId": dag_ad["ClusterId"], "GlobalJobId": dag_ad["GlobalJobId"]}
1429 }
1430 }
1431 json.dump(info, fh)
1432 except (KeyError, OSError, PermissionError) as exc:
1433 _LOG.debug("Persisting DAGMan job information failed: %s", exc)
1436def _tweak_log_info(filename, job):
1437 """Massage the given job info so it has the same structure as if it came from condor_q.
1439 Parameters
1440 ----------
1441 filename : `pathlib.Path`
1442 Name of the DAGMan log.
1443 job : `dict` [ `str`, Any ]
1444 Job information read from the log for a single HTCondor job,
1445 modified in place.
1446 """
1447 _LOG.debug("_tweak_log_info: %s %s", filename, job)
1448 try:
1449 job["ClusterId"] = job["Cluster"]
1450 job["ProcId"] = job["Proc"]
1451 job["Iwd"] = str(filename.parent)
1452 job["Owner"] = filename.owner()
1453 if job["MyType"] == "ExecuteEvent":
1454 job["JobStatus"] = JobStatus.RUNNING
1455 elif job["MyType"] == "JobTerminatedEvent" or job["MyType"] == "PostScriptTerminatedEvent":
1456 job["JobStatus"] = JobStatus.COMPLETED
1457 try:
1458 if not job["TerminatedNormally"]:
1459 if "ReturnValue" in job:
1460 job["ExitCode"] = job["ReturnValue"]
1461 job["ExitBySignal"] = False
1462 elif "TerminatedBySignal" in job:
1463 job["ExitBySignal"] = True
1464 job["ExitSignal"] = job["TerminatedBySignal"]
1465 else:
1466 _LOG.warning("Could not determine exit status for completed job: %s", job)
1467 except KeyError as ex:
1468 _LOG.error("Could not determine exit status for job (missing %s): %s", str(ex), job)
1469 elif job["MyType"] == "SubmitEvent":
1470 job["JobStatus"] = JobStatus.IDLE
1471 elif job["MyType"] == "JobAbortedEvent":
1472 job["JobStatus"] = JobStatus.REMOVED
1473 else:
1474 _LOG.debug("Unknown log event type: %s", job["MyType"])
1475 except KeyError:
1476 _LOG.error("Missing key in job: %s", job)
1477 raise
1480def htc_check_dagman_output(wms_path):
1481 """Check the DAGMan output for error messages.
1483 Parameters
1484 ----------
1485 wms_path : `str`
1486 Directory containing the DAGman output file.
1488 Returns
1489 -------
1490 message : `str`
1491 Message containing error messages from the DAGMan output. Empty
1492 string if no messages.
1494 Raises
1495 ------
1496 FileNotFoundError
1497 If cannot find DAGMan standard output file in given wms_path.
1498 """
1499 try:
1500 filename = next(Path(wms_path).glob("*.dag.dagman.out"))
1501 except StopIteration as exc:
1502 raise FileNotFoundError(f"DAGMan standard output file not found in {wms_path}") from exc
1503 _LOG.debug("dag output filename: %s", filename)
1505 message = ""
1506 try:
1507 with open(filename) as fh:
1508 last_submit_failed = ""
1509 for line in fh:
1510 m = re.match(r"(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) Job submit try \d+/\d+ failed", line)
1511 if m:
1512 last_submit_failed = m.group(1)
1513 if last_submit_failed:
1514 message = f"Warn: Job submission issues (last: {last_submit_failed})"
1515 except (OSError, PermissionError):
1516 message = f"Warn: Could not read dagman output file from {wms_path}."
1517 return message