Coverage for python/lsst/ctrl/bps/htcondor/lssthtc.py: 13%
600 statements
1# This file is part of ctrl_bps_htcondor.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <https://www.gnu.org/licenses/>.
28"""Placeholder HTCondor DAGMan API.
30There is new work on a Python DAGMan API from HTCondor. However, at this
31time it simplifies things by assuming the DAG can easily be broken into
32levels with 1-to-1 or all-to-all relationships to nodes in the next level.
33LSST workflows are more complicated than that.
34"""
36__all__ = [
37 "DagStatus",
38 "JobStatus",
39 "NodeStatus",
40 "RestrictedDict",
41 "HTCJob",
42 "HTCDag",
43 "htc_backup_files",
44 "htc_check_dagman_output",
45 "htc_create_submit_from_cmd",
46 "htc_create_submit_from_dag",
47 "htc_create_submit_from_file",
48 "htc_escape",
49 "htc_write_attribs",
50 "htc_write_condor_file",
51 "htc_query_history",
52 "htc_query_present",
53 "htc_version",
54 "htc_submit_dag",
55 "condor_history",
56 "condor_q",
57 "condor_search",
58 "condor_status",
59 "update_job_info",
60 "MISSING_ID",
61 "summary_from_dag",
62 "read_dag_info",
63 "read_dag_log",
64 "read_dag_nodes_log",
65 "read_dag_status",
66 "read_node_status",
67 "write_dag_info",
68 "pegasus_name_to_label",
69]
72import itertools
73import json
74import logging
75import os
76import pprint
77import re
78import subprocess
79from collections import defaultdict
80from collections.abc import MutableMapping
81from datetime import datetime, timedelta
82from enum import IntEnum
83from pathlib import Path
85import classad
86import htcondor
87import networkx
88from packaging import version
90_LOG = logging.getLogger(__name__)
92MISSING_ID = -99999
95class DagStatus(IntEnum):
96 """HTCondor DAGMan's statuses for a DAG."""
98 OK = 0
99 ERROR = 1 # an error condition different than those listed here
100 FAILED = 2 # one or more nodes in the DAG have failed
101 ABORTED = 3 # the DAG has been aborted by an ABORT-DAG-ON specification
102 REMOVED = 4 # the DAG has been removed by condor_rm
103 CYCLE = 5 # a cycle was found in the DAG
104 SUSPENDED = 6 # the DAG has been suspended (see section 2.10.8)
107class JobStatus(IntEnum):
108 """HTCondor's statuses for jobs."""
110 UNEXPANDED = 0 # Unexpanded
111 IDLE = 1 # Idle
112 RUNNING = 2 # Running
113 REMOVED = 3 # Removed
114 COMPLETED = 4 # Completed
115 HELD = 5 # Held
116 TRANSFERRING_OUTPUT = 6 # Transferring_Output
117 SUSPENDED = 7 # Suspended
120class NodeStatus(IntEnum):
121 """HTCondor's statuses for DAGman nodes."""
123 # (STATUS_NOT_READY): At least one parent has not yet finished or the node
124 # is a FINAL node.
125 NOT_READY = 0
127 # (STATUS_READY): All parents have finished, but the node is not yet
128 # running.
129 READY = 1
131 # (STATUS_PRERUN): The node’s PRE script is running.
132 PRERUN = 2
134 # (STATUS_SUBMITTED): The node’s HTCondor job(s) are in the queue.
135 # StatusDetails = "not_idle" -> running.
136 # JobProcsHeld = 1-> hold.
137 # JobProcsQueued = 1 -> idle.
138 SUBMITTED = 3
140 # (STATUS_POSTRUN): The node’s POST script is running.
141 POSTRUN = 4
143 # (STATUS_DONE): The node has completed successfully.
144 DONE = 5
146 # (STATUS_ERROR): The node has failed. StatusDetails has info (e.g.,
147 # ULOG_JOB_ABORTED for deleted job).
148 ERROR = 6
151HTC_QUOTE_KEYS = {"environment"}
152HTC_VALID_JOB_KEYS = {
153 "universe",
154 "executable",
155 "arguments",
156 "environment",
157 "log",
158 "error",
159 "output",
160 "should_transfer_files",
161 "when_to_transfer_output",
162 "getenv",
163 "notification",
164 "notify_user",
165 "concurrency_limit",
166 "transfer_executable",
167 "transfer_input_files",
168 "transfer_output_files",
169 "request_cpus",
170 "request_memory",
171 "request_disk",
172 "priority",
173 "category",
174 "requirements",
175 "on_exit_hold",
176 "on_exit_hold_reason",
177 "on_exit_hold_subcode",
178 "max_retries",
179 "periodic_release",
180 "periodic_remove",
181 "accounting_group",
182 "accounting_group_user",
183}
184HTC_VALID_JOB_DAG_KEYS = {"vars", "pre", "post", "retry", "retry_unless_exit", "abort_dag_on", "abort_exit"}
185HTC_VERSION = version.parse(htcondor.__version__)
188class RestrictedDict(MutableMapping):
189 """A dictionary that only allows certain keys.
191 Parameters
192 ----------
193 valid_keys : `Container`
194 Strings that are valid keys.
195 init_data : `dict` or `RestrictedDict`, optional
196 Initial data.
198 Raises
199 ------
200 KeyError
201 If invalid key(s) in init_data.
202 """
204 def __init__(self, valid_keys, init_data=()):
205 self.valid_keys = valid_keys
206 self.data = {}
207 self.update(init_data)
209 def __getitem__(self, key):
210 """Return value for given key if exists.
212 Parameters
213 ----------
214 key : `str`
215 Identifier for value to return.
217 Returns
218 -------
219 value : `~collections.abc.Any`
220 Value associated with given key.
222 Raises
223 ------
224 KeyError
225 If key doesn't exist.
226 """
227 return self.data[key]
229 def __delitem__(self, key):
230 """Delete value for given key if exists.
232 Parameters
233 ----------
234 key : `str`
235 Identifier for value to delete.
237 Raises
238 ------
239 KeyError
240 If key doesn't exist.
241 """
242 del self.data[key]
244 def __setitem__(self, key, value):
245 """Store key,value in internal dict only if key is valid.
247 Parameters
248 ----------
249 key : `str`
250 Identifier to associate with given value.
251 value : `~collections.abc.Any`
252 Value to store.
254 Raises
255 ------
256 KeyError
257 If key is invalid.
258 """
259 if key not in self.valid_keys:
260 raise KeyError(f"Invalid key {key}")
261 self.data[key] = value
263 def __iter__(self):
264 return self.data.__iter__()
266 def __len__(self):
267 return len(self.data)
269 def __str__(self):
270 return str(self.data)
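# A minimal usage sketch (illustrative only, not part of the original module):
# RestrictedDict behaves like a normal mapping but rejects keys outside the
# supplied set of valid keys.
def _example_restricted_dict():
    cmds = RestrictedDict(HTC_VALID_JOB_KEYS, {"universe": "vanilla"})
    cmds["request_cpus"] = 1          # accepted, key is in HTC_VALID_JOB_KEYS
    try:
        cmds["not_a_submit_key"] = 1  # rejected with KeyError
    except KeyError:
        pass
    return dict(cmds)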
273def htc_backup_files(wms_path, subdir=None, limit=100):
274 """Backup select HTCondor files in the submit directory.
276 Files will be saved in separate subdirectories which will be created in
277 the submit directory where the files are located. These subdirectories
278 will be consecutive, zero-padded integers. Their values will correspond to
279 the number of HTCondor rescue DAGs in the submit directory.
281 Hence, with the default settings, copies after the initial failed run will
282 be placed in '001' subdirectory, '002' after the first restart, and so on
283 until the limit of backups is reached. If there's no rescue DAG yet, files
284 will be copied to '000' subdirectory.
286 Parameters
287 ----------
288 wms_path : `str` or `pathlib.Path`
289 Path to the submit directory either absolute or relative.
290 subdir : `str` or `pathlib.Path`, optional
291 A path, relative to the submit directory, where all subdirectories with
292 backup files will be kept. Defaults to None which means that the backup
293 subdirectories will be placed directly in the submit directory.
294 limit : `int`, optional
295 Maximum number of backups. If the number of backups reaches the limit,
296 the last backup files will be overwritten. The default value is 100
297 to match the default value of HTCondor's DAGMAN_MAX_RESCUE_NUM in
298 version 8.8+.
300 Raises
301 ------
302 FileNotFoundError
303 If the submit directory or the file that needs to be backed up does not
304 exist.
305 OSError
306 If the submit directory cannot be accessed or backing up a file failed
307 either due to permission or filesystem related issues.
309 Notes
310 -----
311 This is not a generic function for making backups. It is intended to be
312 used once, just before a restart, to make snapshots of files which will be
313 overwritten by HTCondor during the next run.
314 """
315 width = len(str(limit))
317 path = Path(wms_path).resolve()
318 if not path.is_dir():
319 raise FileNotFoundError(f"Directory {path} not found")
321 # Initialize the backup counter.
322 rescue_dags = list(Path(wms_path).glob("*.rescue*"))
323 counter = min(len(rescue_dags), limit)
325 # Create the backup directory and move select files there.
326 dest = Path(wms_path)
327 if subdir:
328 # PurePath.is_relative_to() is not available before Python 3.9. Hence
329 # we need to check if 'subdir' is in the submit directory in some other
330 # way if it is an absolute path.
331 subdir = Path(subdir)
332 if subdir.is_absolute():
333 if dest not in subdir.parents:
334 _LOG.warning(
335 "Invalid backup location: '%s' not in the submit directory, will use '%s' instead.",
336 subdir,
337 wms_path,
338 )
339 else:
340 dest /= subdir
341 else:
342 dest /= subdir
343 dest /= f"{counter:0{width}}"
344 try:
345 dest.mkdir(parents=True, exist_ok=False if counter < limit else True)
346 except FileExistsError:
347 _LOG.warning("Refusing to do backups: target directory '%s' already exists", dest)
348 else:
349 for patt in ["*.info.*", "*.dag.metrics", "*.dag.nodes.log", "*.node_status"]:
350 for source in path.glob(patt):
351 if source.is_file():
352 target = dest / source.relative_to(path)
353 try:
354 source.rename(target)
355 except OSError as exc:
356 raise type(exc)(f"Backing up '{source}' failed: {exc.strerror}") from None
357 else:
358 raise FileNotFoundError(f"Backing up '{source}' failed: not a file")
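# Illustrative sketch of typical use (the submit directory path below is a
# placeholder): snapshot the DAGMan bookkeeping files of a previous run just
# before restarting it.
def _example_backup_before_restart(wms_path="/path/to/submit/u/someuser/run1"):
    try:
        htc_backup_files(wms_path, subdir="backups")
    except OSError as exc:
        _LOG.warning("Backing up files in %s failed: %s", wms_path, exc)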
361def htc_escape(value):
362 """Escape characters in given value based upon HTCondor syntax.
364 Parameters
365 ----------
366 value : `~collections.abc.Any`
367 Value that needs to have characters escaped if string.
369 Returns
370 -------
371 new_value : `~collections.abc.Any`
372 Given value with characters escaped appropriately for HTCondor if it is a string.
373 """
374 if isinstance(value, str):
375 newval = value.replace('"', '""').replace("'", "''").replace("&quot;", '"')
376 else:
377 newval = value
379 return newval
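# Illustrative examples of the escaping rules (not part of the original
# module): double and single quotes are doubled, matching what HTCondor
# expects inside quoted submit-file values; non-strings pass through.
def _example_htc_escape():
    assert htc_escape('say "hi"') == 'say ""hi""'
    assert htc_escape("it's") == "it''s"
    assert htc_escape(42) == 42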
382def htc_write_attribs(stream, attrs):
383 """Write job attributes in HTCondor format to writeable stream.
385 Parameters
386 ----------
387 stream : `~io.TextIOBase`
388 Output text stream (typically an open file).
389 attrs : `dict`
390 HTCondor job attributes (dictionary of attribute key, value).
391 """
392 for key, value in attrs.items():
393 # Make sure strings are syntactically correct for HTCondor.
394 if isinstance(value, str):
395 pval = f'"{htc_escape(value)}"'
396 else:
397 pval = value
399 print(f"+{key} = {pval}", file=stream)
402def htc_write_condor_file(filename, job_name, job, job_attrs):
403 """Write an HTCondor submit file.
405 Parameters
406 ----------
407 filename : `str`
408 Filename for the HTCondor submit file.
409 job_name : `str`
410 Job name to use in submit file.
411 job : `RestrictedDict`
412 Submit script information.
413 job_attrs : `dict`
414 Job attributes.
415 """
416 os.makedirs(os.path.dirname(filename), exist_ok=True)
417 with open(filename, "w") as fh:
418 for key, value in job.items():
419 if value is not None:
420 if key in HTC_QUOTE_KEYS:
421 print(f'{key}="{htc_escape(value)}"', file=fh)
422 else:
423 print(f"{key}={value}", file=fh)
424 for key in ["output", "error", "log"]:
425 if key not in job:
426 filename = f"{job_name}.$(Cluster).${key[:3]}"
427 print(f"{key}={filename}", file=fh)
429 if job_attrs is not None:
430 htc_write_attribs(fh, job_attrs)
431 print("queue", file=fh)
434# To avoid doing the version check during every function call select
435# appropriate conversion function at the import time.
436#
437# Make sure that *each* version specific variant of the conversion function(s)
438# has the same signature after applying any changes!
439if HTC_VERSION < version.parse("8.9.8"):
441 def htc_tune_schedd_args(**kwargs):
442 """Ensure that arguments for Schedd are version appropriate.
444 The old arguments: 'requirements' and 'attr_list' of
445 'Schedd.history()', 'Schedd.query()', and 'Schedd.xquery()' were
446 deprecated in favor of 'constraint' and 'projection', respectively,
447 starting from version 8.9.8. The function will convert "new" keyword
448 arguments to "old" ones.
450 Parameters
451 ----------
452 **kwargs
453 Any keyword arguments that Schedd.history(), Schedd.query(), and
454 Schedd.xquery() accepts.
456 Returns
457 -------
458 kwargs : `dict` [`str`, Any]
459 Keywords arguments that are guaranteed to work with the Python
460 HTCondor API.
462 Notes
463 -----
464 Function doesn't validate provided keyword arguments beyond converting
465 selected arguments to their version specific form. For example,
466 it won't remove keywords that are not supported by the methods
467 mentioned earlier.
468 """
469 translation_table = {
470 "constraint": "requirements",
471 "projection": "attr_list",
472 }
473 for new, old in translation_table.items():
474 try:
475 kwargs[old] = kwargs.pop(new)
476 except KeyError:
477 pass
478 return kwargs
480else:
482 def htc_tune_schedd_args(**kwargs):
483 """Ensure that arguments for Schedd are version appropriate.
485 This is the fallback function if no version-specific alterations are
486 necessary. Effectively, a no-op.
488 Parameters
489 ----------
490 **kwargs
491 Any keyword arguments that Schedd.history(), Schedd.query(), and
492 Schedd.xquery() accepts.
494 Returns
495 -------
496 kwargs : `dict` [`str`, Any]
497 Keywords arguments that were passed to the function.
498 """
499 return kwargs
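# Illustrative example: with HTCondor older than 8.9.8 the 'constraint' and
# 'projection' keywords are renamed to 'requirements' and 'attr_list'; on
# newer versions the keyword arguments are returned unchanged.
def _example_tune_schedd_args():
    kwargs = htc_tune_schedd_args(constraint="ClusterId == 1", projection=["JobStatus"])
    return kwargs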
502def htc_query_history(schedds, **kwargs):
503 """Fetch history records from the condor_schedd daemon.
505 Parameters
506 ----------
507 schedds : `dict` [`str`, `htcondor.Schedd`]
508 HTCondor schedulers to query for job information, keyed by name.
509 **kwargs
510 Any keyword arguments that Schedd.history() accepts.
512 Yields
513 ------
514 schedd_name : `str`
515 Name of the HTCondor scheduler managing the job queue.
516 job_ad : `dict` [`str`, Any]
517 A dictionary representing an HTCondor ClassAd describing a job. It maps
518 job attribute names to values of the ClassAd expressions they
519 represent.
520 """
521 # If not set, provide defaults for positional arguments.
522 kwargs.setdefault("constraint", None)
523 kwargs.setdefault("projection", [])
524 kwargs = htc_tune_schedd_args(**kwargs)
525 for schedd_name, schedd in schedds.items():
526 for job_ad in schedd.history(**kwargs):
527 yield schedd_name, dict(job_ad)
530def htc_query_present(schedds, **kwargs):
531 """Query the condor_schedd daemon for job ads.
533 Parameters
534 ----------
535 schedds : `dict` [`str`, `htcondor.Schedd`]
536 HTCondor schedulers to query for job information, keyed by name.
537 **kwargs
538 Any keyword arguments that Schedd.xquery() accepts.
540 Yields
541 ------
542 schedd_name : `str`
543 Name of the HTCondor scheduler managing the job queue.
544 job_ad : `dict` [`str`, Any]
545 A dictionary representing an HTCondor ClassAd describing a job. It maps
546 job attribute names to values of the ClassAd expressions they
547 represent.
548 """
549 kwargs = htc_tune_schedd_args(**kwargs)
550 for schedd_name, schedd in schedds.items():
551 for job_ad in schedd.query(**kwargs):
552 yield schedd_name, dict(job_ad)
555def htc_version():
556 """Return the version given by the HTCondor API.
558 Returns
559 -------
560 version : `str`
561 HTCondor version as easily comparable string.
562 """
563 return str(HTC_VERSION)
566def htc_submit_dag(sub):
567 """Submit job for execution.
569 Parameters
570 ----------
571 sub : `htcondor.Submit`
572 An object representing a job submit description.
574 Returns
575 -------
576 schedd_job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
577 Information about jobs satisfying the search criteria where for each
578 Scheduler, local HTCondor job ids are mapped to their respective
579 classads.
580 """
581 coll = htcondor.Collector()
582 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
583 schedd = htcondor.Schedd(schedd_ad)
585 # If Schedd.submit() fails, the method will raise an exception. Usually,
586 # that implies issues with the HTCondor pool which BPS can't address.
587 # Hence, no effort is made to handle the exception.
588 submit_result = schedd.submit(sub)
590 # Sadly, the ClassAd from Schedd.submit() (see above) does not have
591 # 'GlobalJobId' so we need to run a regular query to get it anyway.
592 schedd_name = schedd_ad["Name"]
593 schedd_dag_info = condor_q(
594 constraint=f"ClusterId == {submit_result.cluster()}", schedds={schedd_name: schedd}
595 )
596 return schedd_dag_info
599def htc_create_submit_from_dag(dag_filename, submit_options=None):
600 """Create a DAGMan job submit description.
602 Parameters
603 ----------
604 dag_filename : `str`
605 Name of file containing HTCondor DAG commands.
606 submit_options : `dict` [`str`, Any], optional
607 Contains extra options for command line (Value of None means flag).
609 Returns
610 -------
611 sub : `htcondor.Submit`
612 An object representing a job submit description.
614 Notes
615 -----
616 Use with HTCondor versions which support htcondor.Submit.from_dag(),
617 i.e., 8.9.3 or newer.
618 """
619 return htcondor.Submit.from_dag(dag_filename, submit_options)
622def htc_create_submit_from_cmd(dag_filename, submit_options=None):
623 """Create a DAGMan job submit description.
625 Create a DAGMan job submit description by calling ``condor_submit_dag``
626 on given DAG description file.
628 Parameters
629 ----------
630 dag_filename : `str`
631 Name of file containing HTCondor DAG commands.
632 submit_options : `dict` [`str`, Any], optional
633 Contains extra options for command line (Value of None means flag).
635 Returns
636 -------
637 sub : `htcondor.Submit`
638 An object representing a job submit description.
640 Notes
641 -----
642 Use with HTCondor versions which do not support htcondor.Submit.from_dag(),
643 i.e., older than 8.9.3.
644 """
645 # Run command line condor_submit_dag command.
646 cmd = "condor_submit_dag -f -no_submit -notification never -autorescue 1 -UseDagDir -no_recurse "
648 if submit_options is not None:
649 for opt, val in submit_options.items():
650 cmd += f" -{opt} {val or ''}"
651 cmd += f"{dag_filename}"
653 process = subprocess.Popen(
654 cmd.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding="utf-8"
655 )
656 process.wait()
658 if process.returncode != 0:
659 print(f"Exit code: {process.returncode}")
660 print(process.communicate()[0])
661 raise RuntimeError("Problems running condor_submit_dag")
663 return htc_create_submit_from_file(f"{dag_filename}.condor.sub")
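# Illustrative sketch of how the two factories above fit together (the DAG
# filename is a placeholder): pick the submit-description factory appropriate
# for the installed HTCondor version and hand the result to htc_submit_dag().
def _example_submit_workflow(dag_filename="/path/to/submit/run1/workflow.dag"):
    if HTC_VERSION >= version.parse("8.9.3"):
        sub = htc_create_submit_from_dag(dag_filename, {})
    else:
        sub = htc_create_submit_from_cmd(dag_filename, {})
    return htc_submit_dag(sub)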
666def htc_create_submit_from_file(submit_file):
667 """Parse a submission file.
669 Parameters
670 ----------
671 submit_file : `str`
672 Name of the HTCondor submit file.
674 Returns
675 -------
676 sub : `htcondor.Submit`
677 An object representing a job submit description.
678 """
679 descriptors = {}
680 with open(submit_file) as fh:
681 for line in fh:
682 line = line.strip()
683 if not line.startswith("#") and not line == "queue":
684 (key, val) = re.split(r"\s*=\s*", line, 1)
685 descriptors[key] = val
687 # Avoid UserWarning: the line 'copy_to_spool = False' was
688 # unused by Submit object. Is it a typo?
689 try:
690 del descriptors["copy_to_spool"]
691 except KeyError:
692 pass
694 return htcondor.Submit(descriptors)
697def _htc_write_job_commands(stream, name, jobs):
698 """Output the DAGMan job lines for single job in DAG.
700 Parameters
701 ----------
702 stream : `~io.TextIOBase`
703 Writeable text stream (typically an opened file).
704 name : `str`
705 Job name.
706 jobs : `RestrictedDict`
707 DAG job keys and values.
708 """
709 if "pre" in jobs:
710 print(
711 f"SCRIPT {jobs['pre'].get('defer', '')} PRE {name}"
712 f"{jobs['pre']['executable']} {jobs['pre'].get('arguments', '')}",
713 file=stream,
714 )
716 if "post" in jobs:
717 print(
718 f"SCRIPT {jobs['post'].get('defer', '')} PRE {name}"
719 f"{jobs['post']['executable']} {jobs['post'].get('arguments', '')}",
720 file=stream,
721 )
723 if "vars" in jobs:
724 for key, value in jobs["vars"]:
725 print(f'VARS {name} {key}="{htc_escape(value)}"', file=stream)
727 if "pre_skip" in jobs:
728 print(f"PRE_SKIP {name} {jobs['pre_skip']}", file=stream)
730 if "retry" in jobs and jobs["retry"]:
731 print(f"RETRY {name} {jobs['retry']} ", end="", file=stream)
732 if "retry_unless_exit" in jobs:
733 print(f"UNLESS-EXIT {jobs['retry_unless_exit']}", end="", file=stream)
734 print("\n", file=stream)
736 if "abort_dag_on" in jobs and jobs["abort_dag_on"]:
737 print(
738 f"ABORT-DAG-ON {name} {jobs['abort_dag_on']['node_exit']}"
739 f" RETURN {jobs['abort_dag_on']['abort_exit']}",
740 file=stream,
741 )
744class HTCJob:
745 """HTCondor job for use in building DAG.
747 Parameters
748 ----------
749 name : `str`
750 Name of the job.
751 label : `str`
752 Label that can be used for grouping or lookup.
753 initcmds : `RestrictedDict`
754 Initial job commands for submit file.
755 initdagcmds : `RestrictedDict`
756 Initial commands for job inside DAG.
757 initattrs : `dict`
758 Initial dictionary of job attributes.
759 """
761 def __init__(self, name, label=None, initcmds=(), initdagcmds=(), initattrs=None):
762 self.name = name
763 self.label = label
764 self.cmds = RestrictedDict(HTC_VALID_JOB_KEYS, initcmds)
765 self.dagcmds = RestrictedDict(HTC_VALID_JOB_DAG_KEYS, initdagcmds)
766 self.attrs = initattrs
767 self.subfile = None
769 def __str__(self):
770 return self.name
772 def add_job_cmds(self, new_commands):
773 """Add commands to Job (overwrite existing).
775 Parameters
776 ----------
777 new_commands : `dict`
778 Submit file commands to be added to Job.
779 """
780 self.cmds.update(new_commands)
782 def add_dag_cmds(self, new_commands):
783 """Add DAG commands to Job (overwrite existing).
785 Parameters
786 ----------
787 new_commands : `dict`
788 DAG file commands to be added to Job.
789 """
790 self.dagcmds.update(new_commands)
792 def add_job_attrs(self, new_attrs):
793 """Add attributes to Job (overwrite existing).
795 Parameters
796 ----------
797 new_attrs : `dict`
798 Attributes to be added to Job.
799 """
800 if self.attrs is None:
801 self.attrs = {}
802 if new_attrs:
803 self.attrs.update(new_attrs)
805 def write_submit_file(self, submit_path, job_subdir=""):
806 """Write job description to submit file.
808 Parameters
809 ----------
810 submit_path : `str`
811 Prefix path for the submit file.
812 job_subdir : `str`, optional
813 Template for job subdir.
814 """
815 if not self.subfile:
816 self.subfile = f"{self.name}.sub"
817 job_subdir = job_subdir.format(self=self)
818 if job_subdir:
819 self.subfile = os.path.join(job_subdir, self.subfile)
820 htc_write_condor_file(os.path.join(submit_path, self.subfile), self.name, self.cmds, self.attrs)
822 def write_dag_commands(self, stream):
823 """Write DAG commands for single job to output stream.
825 Parameters
826 ----------
827 stream : `IO` or `str`
828 Output Stream.
829 """
830 print(f"JOB {self.name} {self.subfile}", file=stream)
831 _htc_write_job_commands(stream, self.name, self.dagcmds)
833 def dump(self, fh):
834 """Dump job information to output stream.
836 Parameters
837 ----------
838 fh : `~io.TextIOBase`
839 Output stream.
840 """
841 printer = pprint.PrettyPrinter(indent=4, stream=fh)
842 printer.pprint(self.name)
843 printer.pprint(self.cmds)
844 printer.pprint(self.attrs)
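# Illustrative sketch of assembling a job for a DAG. All names, paths, and
# attribute values below are placeholders.
def _example_htc_job():
    job = HTCJob("calibrate_visit_001", label="calibrate")
    job.add_job_cmds({"universe": "vanilla", "executable": "/path/to/run_quantum.sh", "request_cpus": 1})
    job.add_dag_cmds({"retry": 3, "retry_unless_exit": 2})
    job.add_job_attrs({"bps_job_label": "calibrate"})
    return job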
847class HTCDag(networkx.DiGraph):
848 """HTCondor DAG.
850 Parameters
851 ----------
852 data : networkx.DiGraph.data
853 Initial graph.
854 name : `str`
855 Name for DAG.
856 """
858 def __init__(self, data=None, name=""):
859 super().__init__(data=data, name=name)
861 self.graph["attr"] = {}
862 self.graph["run_id"] = None
863 self.graph["submit_path"] = None
864 self.graph["final_job"] = None
866 def __str__(self):
867 """Represent basic DAG info as string.
869 Returns
870 -------
871 info : `str`
872 String containing basic DAG info.
873 """
874 return f"{self.graph['name']} {len(self)}"
876 def add_attribs(self, attribs=None):
877 """Add attributes to the DAG.
879 Parameters
880 ----------
881 attribs : `dict`
882 DAG attributes.
883 """
884 if attribs is not None:
885 self.graph["attr"].update(attribs)
887 def add_job(self, job, parent_names=None, child_names=None):
888 """Add an HTCJob to the HTCDag.
890 Parameters
891 ----------
892 job : `HTCJob`
893 HTCJob to add to the HTCDag.
894 parent_names : `~collections.abc.Iterable` [`str`], optional
895 Names of parent jobs.
896 child_names : `~collections.abc.Iterable` [`str`], optional
897 Names of child jobs.
898 """
899 assert isinstance(job, HTCJob)
901 # Add dag level attributes to each job
902 job.add_job_attrs(self.graph["attr"])
904 self.add_node(job.name, data=job)
906 if parent_names is not None:
907 self.add_job_relationships(parent_names, [job.name])
909 if child_names is not None:
910 self.add_job_relationships([job.name], child_names)
912 def add_job_relationships(self, parents, children):
913 """Add DAG edge between parents and children jobs.
915 Parameters
916 ----------
917 parents : `list` [`str`]
918 Contains parent job name(s).
919 children : `list` [`str`]
920 Contains children job name(s).
921 """
922 self.add_edges_from(itertools.product(parents, children))
924 def add_final_job(self, job):
925 """Add an HTCJob for the FINAL job in HTCDag.
927 Parameters
928 ----------
929 job : `HTCJob`
930 HTCJob to add to the HTCDag as a FINAL job.
931 """
932 # Add dag level attributes to each job
933 job.add_job_attrs(self.graph["attr"])
935 self.graph["final_job"] = job
937 def del_job(self, job_name):
938 """Delete the job from the DAG.
940 Parameters
941 ----------
942 job_name : `str`
943 Name of job in DAG to delete.
944 """
945 # Reconnect edges around node to delete
946 parents = self.predecessors(job_name)
947 children = self.successors(job_name)
948 self.add_edges_from(itertools.product(parents, children))
950 # Delete job node (which deletes its edges).
951 self.remove_node(job_name)
953 def write(self, submit_path, job_subdir=""):
954 """Write DAG to a file.
956 Parameters
957 ----------
958 submit_path : `str`
959 Prefix path for dag filename to be combined with DAG name.
960 job_subdir : `str`, optional
961 Template for job subdir.
962 """
963 self.graph["submit_path"] = submit_path
964 self.graph["dag_filename"] = os.path.join(submit_path, f"{self.graph['name']}.dag")
965 os.makedirs(submit_path, exist_ok=True)
966 with open(self.graph["dag_filename"], "w") as fh:
967 for _, nodeval in self.nodes().items():
968 job = nodeval["data"]
969 job.write_submit_file(submit_path, job_subdir)
970 job.write_dag_commands(fh)
971 for edge in self.edges():
972 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh)
973 print(f"DOT {self.name}.dot", file=fh)
974 print(f"NODE_STATUS_FILE {self.name}.node_status", file=fh)
976 # Add bps attributes to dag submission
977 for key, value in self.graph["attr"].items():
978 print(f'SET_JOB_ATTR {key}= "{htc_escape(value)}"', file=fh)
980 if self.graph["final_job"]:
981 job = self.graph["final_job"]
982 job.write_submit_file(submit_path, job_subdir)
983 print(f"FINAL {job.name} {job.subfile}", file=fh)
984 if "pre" in job.dagcmds:
985 print(f"SCRIPT PRE {job.name} {job.dagcmds['pre']}", file=fh)
986 if "post" in job.dagcmds:
987 print(f"SCRIPT POST {job.name} {job.dagcmds['post']}", file=fh)
989 def dump(self, fh):
990 """Dump DAG info to output stream.
992 Parameters
993 ----------
994 fh : `io.IO` or `str`
995 Where to dump DAG info as text.
996 """
997 for key, value in self.graph.items():
998 print(f"{key}={value}", file=fh)
999 for name, data in self.nodes().items():
1000 print(f"{name}:", file=fh)
1001 data["data"].dump(fh)
1002 for edge in self.edges():
1003 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh)
1004 if self.graph["final_job"]:
1005 print(f'FINAL {self.graph["final_job"].name}:', file=fh)
1006 self.graph["final_job"].dump(fh)
1008 def write_dot(self, filename):
1009 """Write a dot version of the DAG.
1011 Parameters
1012 ----------
1013 filename : `str`
1014 Name of the dot file.
1015 """
1016 pos = networkx.nx_agraph.graphviz_layout(self)
1017 networkx.draw(self, pos=pos)
1018 networkx.drawing.nx_pydot.write_dot(self, filename)
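# Illustrative sketch: build a two-job DAG and write the DAG file plus the
# per-job submit files under a submit directory (all names and the path are
# placeholders).
def _example_htc_dag(submit_path="/path/to/submit/run1"):
    dag = HTCDag(name="example_run")
    dag.add_attribs({"bps_run": "example_run"})
    init_job = HTCJob("pipetaskInit", label="pipetaskInit", initcmds={"universe": "vanilla"})
    calib_job = HTCJob("calibrate_visit_001", label="calibrate", initcmds={"universe": "vanilla"})
    dag.add_job(init_job)
    dag.add_job(calib_job)
    dag.add_job_relationships([init_job.name], [calib_job.name])
    dag.write(submit_path)
    return dag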
1021def condor_q(constraint=None, schedds=None, **kwargs):
1022 """Get information about the jobs in the HTCondor job queue(s).
1024 Parameters
1025 ----------
1026 constraint : `str`, optional
1027 Constraints to be passed to job query.
1028 schedds : `dict` [`str`, `htcondor.Schedd`], optional
1029 HTCondor schedulers to query for job information. If None
1030 (default), the query will be run against the local scheduler only.
1031 **kwargs : `~typing.Any`
1032 Additional keyword arguments that need to be passed to the internal
1033 query method.
1035 Returns
1036 -------
1037 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
1038 Information about jobs satisfying the search criteria where for each
1039 Scheduler, local HTCondor job ids are mapped to their respective
1040 classads.
1041 """
1042 return condor_query(constraint, schedds, htc_query_present, **kwargs)
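# Illustrative example: ask the local scheduler for held jobs, requesting only
# a few attributes; 'ClusterId' and 'ProcId' are added to the projection
# automatically by condor_query() so that job ids can be built.
def _example_condor_q_held():
    return condor_q(
        constraint=f"JobStatus == {int(JobStatus.HELD)}",
        projection=["JobStatus", "HoldReason"],
    )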
1045def condor_history(constraint=None, schedds=None, **kwargs):
1046 """Get information about the jobs from HTCondor history records.
1048 Parameters
1049 ----------
1050 constraint : `str`, optional
1051 Constraints to be passed to job query.
1052 schedds : `dict` [`str`, `htcondor.Schedd`], optional
1053 HTCondor schedulers to query for job information. If None
1054 (default), the query will be run against the history file of
1055 the local scheduler only.
1056 **kwargs : `~typing.Any`
1057 Additional keyword arguments that need to be passed to the internal
1058 query method.
1060 Returns
1061 -------
1062 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
1063 Information about jobs satisfying the search criteria where for each
1064 Scheduler, local HTCondor job ids are mapped to their respective
1065 classads.
1066 """
1067 return condor_query(constraint, schedds, htc_query_history, **kwargs)
1070def condor_query(constraint=None, schedds=None, query_func=htc_query_present, **kwargs):
1071 """Get information about HTCondor jobs.
1073 Parameters
1074 ----------
1075 constraint : `str`, optional
1076 Constraints to be passed to job query.
1077 schedds : `dict` [`str`, `htcondor.Schedd`], optional
1078 HTCondor schedulers to query for job information. If None
1079 (default), the query will be run against the history file of
1080 the local scheduler only.
1081 query_func : callable
1082 A query function which takes the following arguments:
1084 - ``schedds``: Schedulers to query (`list` [`htcondor.Schedd`]).
1085 - ``**kwargs``: Keyword arguments that will be passed to the query
1086 function.
1087 **kwargs : `~typing.Any`
1088 Additional keyword arguments that need to be passed to the query
1089 method.
1091 Returns
1092 -------
1093 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
1094 Information about jobs satisfying the search criteria where for each
1095 Scheduler, local HTCondor job ids are mapped to their respective
1096 classads.
1097 """
1098 if not schedds:
1099 coll = htcondor.Collector()
1100 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
1101 schedds = {schedd_ad["Name"]: htcondor.Schedd(schedd_ad)}
1103 # Make sure that 'ClusterId' and 'ProcId' attributes are always included
1104 # in the job classad. They are needed to construct the job id.
1105 added_attrs = set()
1106 if "projection" in kwargs and kwargs["projection"]:
1107 requested_attrs = set(kwargs["projection"])
1108 required_attrs = {"ClusterId", "ProcId"}
1109 added_attrs = required_attrs - requested_attrs
1110 for attr in added_attrs:
1111 kwargs["projection"].append(attr)
1113 unwanted_attrs = {"Env", "Environment"} | added_attrs
1114 job_info = defaultdict(dict)
1115 for schedd_name, job_ad in query_func(schedds, constraint=constraint, **kwargs):
1116 id_ = f"{job_ad['ClusterId']}.{job_ad['ProcId']}"
1117 for attr in set(job_ad) & unwanted_attrs:
1118 del job_ad[attr]
1119 job_info[schedd_name][id_] = job_ad
1120 _LOG.debug("query returned %d jobs", sum(len(val) for val in job_info.values()))
1122 # Restore the list of the requested attributes to its original value
1123 # if needed.
1124 if added_attrs:
1125 for attr in added_attrs:
1126 kwargs["projection"].remove(attr)
1128 # When returning the results filter out entries for schedulers with no jobs
1129 # matching the search criteria.
1130 return {key: val for key, val in job_info.items() if val}
1133def condor_search(constraint=None, hist=None, schedds=None):
1134 """Search for running and finished jobs satisfying given criteria.
1136 Parameters
1137 ----------
1138 constraint : `str`, optional
1139 Constraints to be passed to job query.
1140 hist : `float`, optional
1141 Limit history search to this many days.
1142 schedds : `dict` [`str`, `htcondor.Schedd`], optional
1143 HTCondor schedulers to query for job information.
1144 If None (default), only the local scheduler will be queried.
1146 Returns
1147 -------
1148 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
1149 Information about jobs satisfying the search criteria where for each
1150 Scheduler, local HTCondor job ids are mapped to their respective
1151 classads.
1152 """
1153 if not schedds:
1154 coll = htcondor.Collector()
1155 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
1156 schedds = {schedd_ad["Name"]: htcondor.Schedd(schedd_ad)}
1158 job_info = condor_q(constraint=constraint, schedds=schedds)
1159 if hist is not None:
1160 epoch = (datetime.now() - timedelta(days=hist)).timestamp()
1161 constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
1162 hist_info = condor_history(constraint, schedds=schedds)
1163 update_job_info(job_info, hist_info)
1164 return job_info
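# Illustrative sketch: look for current jobs and jobs that finished within the
# last two days for a given owner. The owner name is a placeholder; any valid
# ClassAd expression can be used as the constraint.
def _example_condor_search(owner="someuser"):
    return condor_search(constraint=f'Owner == "{owner}"', hist=2)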
1167def condor_status(constraint=None, coll=None):
1168 """Get information about HTCondor pool.
1170 Parameters
1171 ----------
1172 constraint : `str`, optional
1173 Constraints to be passed to the query.
1174 coll : `htcondor.Collector`, optional
1175 Object representing HTCondor collector daemon.
1177 Returns
1178 -------
1179 pool_info : `dict` [`str`, `dict` [`str`, Any]]
1180 Mapping between HTCondor slot names and slot information (classAds).
1181 """
1182 if coll is None:
1183 coll = htcondor.Collector()
1184 try:
1185 pool_ads = coll.query(constraint=constraint)
1186 except OSError as ex:
1187 raise RuntimeError(f"Problem querying the Collector. (Constraint='{constraint}')") from ex
1189 pool_info = {}
1190 for slot in pool_ads:
1191 pool_info[slot["name"]] = dict(slot)
1192 _LOG.debug("condor_status returned %d ads", len(pool_info))
1193 return pool_info
1196def update_job_info(job_info, other_info):
1197 """Update results of a job query with results from another query.
1199 Parameters
1200 ----------
1201 job_info : `dict` [`str`, `dict` [`str`, Any]]
1202 Results of the job query that needs to be updated.
1203 other_info : `dict` [`str`, `dict` [`str`, Any]]
1204 Results of the other job query.
1206 Returns
1207 -------
1208 job_info : `dict` [`str`, `dict` [`str`, Any]]
1209 The updated results.
1210 """
1211 for schedd_name, others in other_info.items():
1212 try:
1213 jobs = job_info[schedd_name]
1214 except KeyError:
1215 job_info[schedd_name] = others
1216 else:
1217 for id_, ad in others.items():
1218 jobs.setdefault(id_, {}).update(ad)
1219 return job_info
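# Illustrative example of the merge semantics: ads for job ids present in both
# results are updated in place, ads for new job ids are added.
def _example_update_job_info():
    queue_info = {"sched1": {"1.0": {"JobStatus": JobStatus.RUNNING}}}
    hist_info = {"sched1": {"1.0": {"ExitCode": 0}, "2.0": {"JobStatus": JobStatus.COMPLETED}}}
    merged = update_job_info(queue_info, hist_info)
    assert merged["sched1"]["1.0"] == {"JobStatus": JobStatus.RUNNING, "ExitCode": 0}
    assert set(merged["sched1"]) == {"1.0", "2.0"}
    return merged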
1222def summary_from_dag(dir_name):
1223 """Build bps_run_summary string from dag file.
1225 Parameters
1226 ----------
1227 dir_name : `str`
1228 Path that includes dag file for a run.
1230 Returns
1231 -------
1232 summary : `str`
1233 Semi-colon separated list of job labels and counts.
1234 (Same format as saved in dag classad).
1235 job_name_to_pipetask : `dict` [`str`, `str`]
1236 Mapping of job names to job labels.
1237 """
1238 dag = next(Path(dir_name).glob("*.dag"))
1240 # Later code depends upon insertion order
1241 counts = defaultdict(int)
1242 job_name_to_pipetask = {}
1243 try:
1244 with open(dag) as fh:
1245 for line in fh:
1246 if line.startswith("JOB"):
1247 m = re.match(r"JOB ([^\s]+) jobs/([^/]+)/", line)
1248 if m:
1249 label = m.group(2)
1250 if label == "init":
1251 label = "pipetaskInit"
1252 job_name_to_pipetask[m.group(1)] = label
1253 counts[label] += 1
1254 else: # Check if Pegasus submission
1255 m = re.match(r"JOB ([^\s]+) ([^\s]+)", line)
1256 if m:
1257 label = pegasus_name_to_label(m.group(1))
1258 job_name_to_pipetask[m.group(1)] = label
1259 counts[label] += 1
1260 else:
1261 _LOG.warning("Parse DAG: unmatched job line: %s", line)
1262 elif line.startswith("FINAL"):
1263 m = re.match(r"FINAL ([^\s]+) jobs/([^/]+)/", line)
1264 if m:
1265 label = m.group(2)
1266 job_name_to_pipetask[m.group(1)] = label
1267 counts[label] += 1
1269 except (OSError, PermissionError, StopIteration):
1270 pass
1272 summary = ";".join([f"{name}:{counts[name]}" for name in counts])
1273 _LOG.debug("summary_from_dag: %s %s", summary, job_name_to_pipetask)
1274 return summary, job_name_to_pipetask
1277def pegasus_name_to_label(name):
1278 """Convert pegasus job name to a label for the report.
1280 Parameters
1281 ----------
1282 name : `str`
1283 Name of job.
1285 Returns
1286 -------
1287 label : `str`
1288 Label for job.
1289 """
1290 label = "UNK"
1291 if name.startswith("create_dir") or name.startswith("stage_in") or name.startswith("stage_out"):
1292 label = "pegasus"
1293 else:
1294 m = re.match(r"pipetask_(\d+_)?([^_]+)", name)
1295 if m:
1296 label = m.group(2)
1297 if label == "init":
1298 label = "pipetaskInit"
1300 return label
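# Illustrative examples of the name-to-label mapping (job names are made up):
def _example_pegasus_labels():
    assert pegasus_name_to_label("stage_in_local") == "pegasus"
    assert pegasus_name_to_label("pipetask_3_calibrate") == "calibrate"
    assert pegasus_name_to_label("pipetask_init") == "pipetaskInit"
    assert pegasus_name_to_label("mystery_job") == "UNK"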
1303def read_dag_status(wms_path):
1304 """Read the node status file for DAG summary information.
1306 Parameters
1307 ----------
1308 wms_path : `str`
1309 Path that includes node status file for a run.
1311 Returns
1312 -------
1313 dag_ad : `dict` [`str`, Any]
1314 DAG summary information.
1315 """
1316 dag_ad = {}
1318 # While this is probably more up to date than dag classad, only read from
1319 # the file if needed.
1320 try:
1321 try:
1322 node_stat_file = next(Path(wms_path).glob("*.node_status"))
1323 _LOG.debug("Reading Node Status File %s", node_stat_file)
1324 with open(node_stat_file) as infh:
1325 dag_ad = classad.parseNext(infh) # pylint: disable=E1101
1326 except StopIteration:
1327 pass
1329 if not dag_ad:
1330 # Pegasus check here
1331 try:
1332 metrics_file = next(Path(wms_path).glob("*.dag.metrics"))
1333 with open(metrics_file) as infh:
1334 metrics = json.load(infh)
1335 dag_ad["NodesTotal"] = metrics.get("jobs", 0)
1336 dag_ad["NodesFailed"] = metrics.get("jobs_failed", 0)
1337 dag_ad["NodesDone"] = metrics.get("jobs_succeeded", 0)
1338 dag_ad["pegasus_version"] = metrics.get("planner_version", "")
1339 except StopIteration:
1340 try:
1341 metrics_file = next(Path(wms_path).glob("*.metrics"))
1342 with open(metrics_file) as infh:
1343 metrics = json.load(infh)
1344 dag_ad["NodesTotal"] = metrics["wf_metrics"]["total_jobs"]
1345 dag_ad["pegasus_version"] = metrics.get("version", "")
1346 except StopIteration:
1347 pass
1348 except (OSError, PermissionError):
1349 pass
1351 _LOG.debug("read_dag_status: %s", dag_ad)
1352 return dict(dag_ad)
1355def read_node_status(wms_path):
1356 """Read entire node status file.
1358 Parameters
1359 ----------
1360 wms_path : `str`
1361 Path that includes node status file for a run.
1363 Returns
1364 -------
1365 jobs : `dict` [`str`, Any]
1366 DAG summary information.
1367 """
1368 # Get jobid info from other places to fill in gaps in info from node_status
1369 _, job_name_to_pipetask = summary_from_dag(wms_path)
1370 wms_workflow_id, loginfo = read_dag_log(wms_path)
1371 loginfo = read_dag_nodes_log(wms_path)
1372 _LOG.debug("loginfo = %s", loginfo)
1373 job_name_to_id = {}
1374 for jid, jinfo in loginfo.items():
1375 if "LogNotes" in jinfo:
1376 m = re.match(r"DAG Node: ([^\s]+)", jinfo["LogNotes"])
1377 if m:
1378 job_name_to_id[m.group(1)] = jid
1379 jinfo["DAGNodeName"] = m.group(1)
1381 try:
1382 node_status = next(Path(wms_path).glob("*.node_status"))
1383 except StopIteration:
1384 return loginfo
1386 jobs = {}
1387 fake_id = -1.0 # For nodes that do not yet have a job id, give fake one
1388 try:
1389 with open(node_status) as fh:
1390 ads = classad.parseAds(fh)
1392 for jclassad in ads:
1393 if jclassad["Type"] == "DagStatus":
1394 # skip DAG summary
1395 pass
1396 elif "Node" not in jclassad:
1397 if jclassad["Type"] != "StatusEnd":
1398 _LOG.debug("Key 'Node' not in classad: %s", jclassad)
1399 break
1400 else:
1401 if jclassad["Node"] in job_name_to_pipetask:
1402 try:
1403 label = job_name_to_pipetask[jclassad["Node"]]
1404 except KeyError:
1405 _LOG.error("%s not in %s", jclassad["Node"], job_name_to_pipetask.keys())
1406 raise
1407 elif "_" in jclassad["Node"]:
1408 label = jclassad["Node"].split("_")[1]
1409 else:
1410 label = jclassad["Node"]
1412 # Make job info as if came from condor_q
1413 if jclassad["Node"] in job_name_to_id:
1414 job_id = job_name_to_id[jclassad["Node"]]
1415 else:
1416 job_id = str(fake_id)
1417 fake_id -= 1
1419 job = dict(jclassad)
1420 job["ClusterId"] = int(float(job_id))
1421 job["DAGManJobID"] = wms_workflow_id
1422 job["DAGNodeName"] = jclassad["Node"]
1423 job["bps_job_label"] = label
1425 jobs[str(job_id)] = job
1426 except (OSError, PermissionError):
1427 pass
1429 return jobs
1432def read_dag_log(wms_path):
1433 """Read job information from the DAGMan log file.
1435 Parameters
1436 ----------
1437 wms_path : `str`
1438 Path containing the DAGMan log file.
1440 Returns
1441 -------
1442 wms_workflow_id : `str`
1443 HTCondor job id (i.e., <ClusterId>.<ProcId>) of the DAGMan job.
1444 dag_info : `dict` [`str`, `~collections.abc.Any`]
1445 HTCondor job information read from the log file mapped to HTCondor
1446 job id.
1448 Raises
1449 ------
1450 FileNotFoundError
1451 If cannot find DAGMan log in given wms_path.
1452 """
1453 wms_workflow_id = 0
1454 dag_info = {}
1456 path = Path(wms_path)
1457 if path.exists():
1458 try:
1459 filename = next(path.glob("*.dag.dagman.log"))
1460 except StopIteration as exc:
1461 raise FileNotFoundError(f"DAGMan log not found in {wms_path}") from exc
1462 _LOG.debug("dag node log filename: %s", filename)
1464 info = {}
1465 job_event_log = htcondor.JobEventLog(str(filename))
1466 for event in job_event_log.events(stop_after=0):
1467 id_ = f"{event['Cluster']}.{event['Proc']}"
1468 if id_ not in info:
1469 info[id_] = {}
1470 wms_workflow_id = id_ # taking last job id in case of restarts
1471 info[id_].update(event)
1472 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"]
1474 # only save latest DAG job
1475 dag_info = {wms_workflow_id: info[wms_workflow_id]}
1476 for job in dag_info.values():
1477 _tweak_log_info(filename, job)
1479 return wms_workflow_id, dag_info
1482def read_dag_nodes_log(wms_path):
1483 """Read job information from the DAGMan nodes log file.
1485 Parameters
1486 ----------
1487 wms_path : `str`
1488 Path containing the DAGMan nodes log file.
1490 Returns
1491 -------
1492 info : `dict` [`str`, Any]
1493 HTCondor job information read from the log file mapped to HTCondor
1494 job id.
1496 Raises
1497 ------
1498 FileNotFoundError
1499 If cannot find DAGMan node log in given wms_path.
1500 """
1501 try:
1502 filename = next(Path(wms_path).glob("*.dag.nodes.log"))
1503 except StopIteration as exc:
1504 raise FileNotFoundError(f"DAGMan node log not found in {wms_path}") from exc
1505 _LOG.debug("dag node log filename: %s", filename)
1507 info = {}
1508 job_event_log = htcondor.JobEventLog(str(filename))
1509 for event in job_event_log.events(stop_after=0):
1510 id_ = f"{event['Cluster']}.{event['Proc']}"
1511 if id_ not in info:
1512 info[id_] = {}
1513 info[id_].update(event)
1514 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"]
1516 # Add more condor_q-like info to info parsed from log file.
1517 for job in info.values():
1518 _tweak_log_info(filename, job)
1520 return info
1523def read_dag_info(wms_path):
1524 """Read custom DAGMan job information from the file.
1526 Parameters
1527 ----------
1528 wms_path : `str`
1529 Path containing the file with the DAGMan job info.
1531 Returns
1532 -------
1533 dag_info : `dict` [`str`, `dict` [`str`, Any]]
1534 HTCondor job information.
1536 Raises
1537 ------
1538 FileNotFoundError
1539 If cannot find DAGMan job info file in the given location.
1540 """
1541 try:
1542 filename = next(Path(wms_path).glob("*.info.json"))
1543 except StopIteration as exc:
1544 raise FileNotFoundError(f"File with DAGMan job information not found in {wms_path}") from exc
1545 _LOG.debug("DAGMan job information filename: %s", filename)
1546 try:
1547 with open(filename) as fh:
1548 dag_info = json.load(fh)
1549 except (OSError, PermissionError) as exc:
1550 _LOG.debug("Retrieving DAGMan job information failed: %s", exc)
1551 dag_info = {}
1552 return dag_info
1555def write_dag_info(filename, dag_info):
1556 """Write custom job information about DAGMan job.
1558 Parameters
1559 ----------
1560 filename : `str`
1561 Name of the file where the information will be stored.
1562 dag_info : `dict` [`str`, `dict` [`str`, Any]]
1563 Information about the DAGMan job.
1564 """
1565 schedd_name = next(iter(dag_info))
1566 dag_id = next(iter(dag_info[schedd_name]))
1567 dag_ad = dag_info[schedd_name][dag_id]
1568 try:
1569 with open(filename, "w") as fh:
1570 info = {
1571 schedd_name: {
1572 dag_id: {"ClusterId": dag_ad["ClusterId"], "GlobalJobId": dag_ad["GlobalJobId"]}
1573 }
1574 }
1575 json.dump(info, fh)
1576 except (KeyError, OSError, PermissionError) as exc:
1577 _LOG.debug("Persisting DAGMan job information failed: %s", exc)
1580def _tweak_log_info(filename, job):
1581 """Massage the given job info has same structure as if came from condor_q.
1583 Parameters
1584 ----------
1585 filename : `pathlib.Path`
1586 Name of the DAGMan log.
1587 job : `dict` [ `str`, Any ]
1588 Information for a single job read from the log; modified in place.
1590 """
1591 _LOG.debug("_tweak_log_info: %s %s", filename, job)
1592 try:
1593 job["ClusterId"] = job["Cluster"]
1594 job["ProcId"] = job["Proc"]
1595 job["Iwd"] = str(filename.parent)
1596 job["Owner"] = filename.owner()
1597 if job["MyType"] == "ExecuteEvent":
1598 job["JobStatus"] = JobStatus.RUNNING
1599 elif job["MyType"] == "JobTerminatedEvent" or job["MyType"] == "PostScriptTerminatedEvent":
1600 job["JobStatus"] = JobStatus.COMPLETED
1601 try:
1602 if not job["TerminatedNormally"]:
1603 if "ReturnValue" in job:
1604 job["ExitCode"] = job["ReturnValue"]
1605 job["ExitBySignal"] = False
1606 elif "TerminatedBySignal" in job:
1607 job["ExitBySignal"] = True
1608 job["ExitSignal"] = job["TerminatedBySignal"]
1609 else:
1610 _LOG.warning("Could not determine exit status for completed job: %s", job)
1611 except KeyError as ex:
1612 _LOG.error("Could not determine exit status for job (missing %s): %s", str(ex), job)
1613 elif job["MyType"] == "SubmitEvent":
1614 job["JobStatus"] = JobStatus.IDLE
1615 elif job["MyType"] == "JobAbortedEvent":
1616 job["JobStatus"] = JobStatus.REMOVED
1617 else:
1618 _LOG.debug("Unknown log event type: %s", job["MyType"])
1619 except KeyError:
1620 _LOG.error("Missing key in job: %s", job)
1621 raise
1624def htc_check_dagman_output(wms_path):
1625 """Check the DAGMan output for error messages.
1627 Parameters
1628 ----------
1629 wms_path : `str`
1630 Directory containing the DAGMan output file.
1632 Returns
1633 -------
1634 message : `str`
1635 Message containing error messages from the DAGMan output. Empty
1636 string if no messages.
1638 Raises
1639 ------
1640 FileNotFoundError
1641 If cannot find DAGMan standard output file in given wms_path.
1642 """
1643 try:
1644 filename = next(Path(wms_path).glob("*.dag.dagman.out"))
1645 except StopIteration as exc:
1646 raise FileNotFoundError(f"DAGMan standard output file not found in {wms_path}") from exc
1647 _LOG.debug("dag output filename: %s", filename)
1649 message = ""
1650 try:
1651 with open(filename) as fh:
1652 last_submit_failed = ""
1653 for line in fh:
1654 m = re.match(r"(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) Job submit try \d+/\d+ failed", line)
1655 if m:
1656 last_submit_failed = m.group(1)
1657 if last_submit_failed:
1658 message = f"Warn: Job submission issues (last: {last_submit_failed})"
1659 except (OSError, PermissionError):
1660 message = f"Warn: Could not read dagman output file from {wms_path}."
1661 return message