Coverage for python/lsst/ctrl/bps/htcondor/lssthtc.py: 13%
605 statements
coverage.py v7.4.0, created at 2024-01-11 18:09 +0000
1# This file is part of ctrl_bps_htcondor.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <https://www.gnu.org/licenses/>.
28"""Placeholder HTCondor DAGMan API.
30There is new work on a Python DAGMan API from HTCondor. However, at this
31time it tries to make things easier by assuming the DAG can easily be broken
32into levels with 1-to-1 or all-to-all relationships to nodes in the next
33level. LSST workflows are more complicated.
34"""
36__all__ = [
37 "DagStatus",
38 "JobStatus",
39 "NodeStatus",
40 "RestrictedDict",
41 "HTCJob",
42 "HTCDag",
43 "htc_backup_files",
44 "htc_check_dagman_output",
45 "htc_create_submit_from_cmd",
46 "htc_create_submit_from_dag",
47 "htc_create_submit_from_file",
48 "htc_escape",
49 "htc_write_attribs",
50 "htc_write_condor_file",
51 "htc_query_history",
52 "htc_query_present",
53 "htc_version",
54 "htc_submit_dag",
55 "condor_history",
56 "condor_q",
57 "condor_search",
58 "condor_status",
59 "update_job_info",
60 "MISSING_ID",
61 "summary_from_dag",
62 "read_dag_info",
63 "read_dag_log",
64 "read_dag_nodes_log",
65 "read_dag_status",
66 "read_node_status",
67 "write_dag_info",
68 "pegasus_name_to_label",
69]
72import itertools
73import json
74import logging
75import os
76import pprint
77import re
78import subprocess
79from collections import defaultdict
80from collections.abc import MutableMapping
81from datetime import datetime, timedelta
82from enum import IntEnum
83from pathlib import Path
85import classad
86import htcondor
87import networkx
88from packaging import version
90_LOG = logging.getLogger(__name__)
92MISSING_ID = -99999
95class DagStatus(IntEnum):
96 """HTCondor DAGMan's statuses for a DAG."""
98 OK = 0
99 ERROR = 1 # an error condition different than those listed here
100 FAILED = 2 # one or more nodes in the DAG have failed
101 ABORTED = 3 # the DAG has been aborted by an ABORT-DAG-ON specification
102 REMOVED = 4 # the DAG has been removed by condor_rm
103 CYCLE = 5 # a cycle was found in the DAG
104 SUSPENDED = 6 # the DAG has been suspended (see section 2.10.8)
107class JobStatus(IntEnum):
108 """HTCondor's statuses for jobs."""
110 UNEXPANDED = 0 # Unexpanded
111 IDLE = 1 # Idle
112 RUNNING = 2 # Running
113 REMOVED = 3 # Removed
114 COMPLETED = 4 # Completed
115 HELD = 5 # Held
116 TRANSFERRING_OUTPUT = 6 # Transferring_Output
117 SUSPENDED = 7 # Suspended
120class NodeStatus(IntEnum):
121 """HTCondor's statuses for DAGman nodes."""
123 # (STATUS_NOT_READY): At least one parent has not yet finished or the node
124 # is a FINAL node.
125 NOT_READY = 0
127 # (STATUS_READY): All parents have finished, but the node is not yet
128 # running.
129 READY = 1
131 # (STATUS_PRERUN): The node’s PRE script is running.
132 PRERUN = 2
134 # (STATUS_SUBMITTED): The node’s HTCondor job(s) are in the queue.
135 # StatusDetails = "not_idle" -> running.
136 # JobProcsHeld = 1-> hold.
137 # JobProcsQueued = 1 -> idle.
138 SUBMITTED = 3
140 # (STATUS_POSTRUN): The node’s POST script is running.
141 POSTRUN = 4
143 # (STATUS_DONE): The node has completed successfully.
144 DONE = 5
146 # (STATUS_ERROR): The node has failed. StatusDetails has info (e.g.,
147 # ULOG_JOB_ABORTED for deleted job).
148 ERROR = 6
151HTC_QUOTE_KEYS = {"environment"}
152HTC_VALID_JOB_KEYS = {
153 "universe",
154 "executable",
155 "arguments",
156 "environment",
157 "log",
158 "error",
159 "output",
160 "should_transfer_files",
161 "when_to_transfer_output",
162 "getenv",
163 "notification",
164 "notify_user",
165 "concurrency_limit",
166 "transfer_executable",
167 "transfer_input_files",
168 "transfer_output_files",
169 "request_cpus",
170 "request_memory",
171 "request_disk",
172 "priority",
173 "category",
174 "requirements",
175 "on_exit_hold",
176 "on_exit_hold_reason",
177 "on_exit_hold_subcode",
178 "max_retries",
179 "periodic_release",
180 "periodic_remove",
181 "accounting_group",
182 "accounting_group_user",
183}
184HTC_VALID_JOB_DAG_KEYS = {"vars", "pre", "post", "retry", "retry_unless_exit", "abort_dag_on", "abort_exit"}
185HTC_VERSION = version.parse(htcondor.__version__)
188class RestrictedDict(MutableMapping):
189 """A dictionary that only allows certain keys.
191 Parameters
192 ----------
193 valid_keys : `Container`
194 Strings that are valid keys.
195 init_data : `dict` or `RestrictedDict`, optional
196 Initial data.
198 Raises
199 ------
200 KeyError
201 If invalid key(s) in init_data.
202 """
204 def __init__(self, valid_keys, init_data=()):
205 self.valid_keys = valid_keys
206 self.data = {}
207 self.update(init_data)
209 def __getitem__(self, key):
210 """Return value for given key if exists.
212 Parameters
213 ----------
214 key : `str`
215 Identifier for value to return.
217 Returns
218 -------
219 value : `~collections.abc.Any`
220 Value associated with given key.
222 Raises
223 ------
224 KeyError
225 If key doesn't exist.
226 """
227 return self.data[key]
229 def __delitem__(self, key):
230 """Delete value for given key if exists.
232 Parameters
233 ----------
234 key : `str`
235 Identifier for value to delete.
237 Raises
238 ------
239 KeyError
240 If key doesn't exist.
241 """
242 del self.data[key]
244 def __setitem__(self, key, value):
245 """Store key,value in internal dict only if key is valid.
247 Parameters
248 ----------
249 key : `str`
250 Identifier to associate with given value.
251 value : `~collections.abc.Any`
252 Value to store.
254 Raises
255 ------
256 KeyError
257 If key is invalid.
258 """
259 if key not in self.valid_keys:
260 raise KeyError(f"Invalid key {key}")
261 self.data[key] = value
263 def __iter__(self):
264 return self.data.__iter__()
266 def __len__(self):
267 return len(self.data)
269 def __str__(self):
270 return str(self.data)
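# Illustrative sketch of RestrictedDict behaviour (the values below are
# arbitrary):
#
#     cmds = RestrictedDict(HTC_VALID_JOB_KEYS, {"universe": "vanilla"})
#     cmds["request_memory"] = "2048"   # accepted, key is valid
#     cmds["not_a_submit_key"] = 1      # raises KeyError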
273def htc_backup_files(wms_path, subdir=None, limit=100):
274 """Backup select HTCondor files in the submit directory.
276 Files will be saved in separate subdirectories which will be created in
277 the submit directory where the files are located. These subdirectories
278 will be consecutive, zero-padded integers. Their values will correspond to
279 the number of HTCondor rescue DAGs in the submit directory.
281 Hence, with the default settings, copies after the initial failed run will
282 be placed in '001' subdirectory, '002' after the first restart, and so on
283 until the limit of backups is reached. If there's no rescue DAG yet, files
284 will be copied to '000' subdirectory.
286 Parameters
287 ----------
288 wms_path : `str` or `pathlib.Path`
289 Path to the submit directory either absolute or relative.
290 subdir : `str` or `pathlib.Path`, optional
291 A path, relative to the submit directory, where all subdirectories with
292 backup files will be kept. Defaults to None which means that the backup
293 subdirectories will be placed directly in the submit directory.
294 limit : `int`, optional
295 Maximum number of backups. If the number of backups reaches the limit,
296 the last backup files will be overwritten. The default value is 100
297 to match the default value of HTCondor's DAGMAN_MAX_RESCUE_NUM in
298 version 8.8+.
300 Raises
301 ------
302 FileNotFoundError
303 If the submit directory or the file that needs to be backed up does not
304 exist.
305 OSError
306 If the submit directory cannot be accessed or backing up a file failed
307 either due to permission or filesystem related issues.
309 Notes
310 -----
311 This is not a generic function for making backups. It is intended to be
312 used once, just before a restart, to make snapshots of files which will be
313 overwritten by HTCondor during the next run.
314 """
315 width = len(str(limit))
317 path = Path(wms_path).resolve()
318 if not path.is_dir():
319 raise FileNotFoundError(f"Directory {path} not found")
321 # Initialize the backup counter.
322 rescue_dags = list(Path(wms_path).glob("*.rescue*"))
323 counter = min(len(rescue_dags), limit)
325 # Create the backup directory and move select files there.
326 dest = Path(wms_path)
327 if subdir:
328 # PurePath.is_relative_to() is not available before Python 3.9. Hence
329 # we need to check if 'subdir' is in the submit directory in some other
330 # way if it is an absolute path.
331 subdir = Path(subdir)
332 if subdir.is_absolute():
333 if dest not in subdir.parents:
334 _LOG.warning(
335 "Invalid backup location: '%s' not in the submit directory, will use '%s' instead.",
336 subdir,
337 wms_path,
338 )
339 else:
340 dest /= subdir
341 else:
342 dest /= subdir
343 dest /= f"{counter:0{width}}"
344 try:
345 dest.mkdir(parents=True, exist_ok=False if counter < limit else True)
346 except FileExistsError:
347 _LOG.warning("Refusing to do backups: target directory '%s' already exists", dest)
348 else:
349 for patt in ["*.info.*", "*.dag.metrics", "*.dag.nodes.log", "*.node_status"]:
350 for source in path.glob(patt):
351 if source.is_file():
352 target = dest / source.relative_to(path)
353 try:
354 source.rename(target)
355 except OSError as exc:
356 raise type(exc)(f"Backing up '{source}' failed: {exc.strerror}") from None
357 else:
358 raise FileNotFoundError(f"Backing up '{source}' failed: not a file")
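# Illustrative use, typically right before restarting a failed run (the submit
# path is hypothetical):
#
#     htc_backup_files("/path/to/submit/u/jdoe/my_run", subdir="backups")
#     # With one rescue DAG present, files such as *.node_status and
#     # *.dag.nodes.log are moved to .../backups/001.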
361def htc_escape(value):
362 """Escape characters in given value based upon HTCondor syntax.
364 Parameters
365 ----------
366 value : `~collections.abc.Any`
367 Value that needs to have characters escaped if string.
369 Returns
370 -------
371 new_value : `~collections.abc.Any`
372 Given value with characters escaped appropriate for HTCondor if string.
373 """
374 if isinstance(value, str):
375 newval = value.replace('"', '""').replace("'", "''").replace("&quot;", '"')
376 else:
377 newval = value
379 return newval
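# Illustrative behaviour (values are arbitrary):
#
#     htc_escape('say "hello"')   # -> 'say ""hello""'
#     htc_escape(42)              # -> 42, non-strings pass through unchanged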
382def htc_write_attribs(stream, attrs):
383 """Write job attributes in HTCondor format to writeable stream.
385 Parameters
386 ----------
387 stream : `~io.TextIOBase`
388 Output text stream (typically an open file).
389 attrs : `dict`
390 HTCondor job attributes (dictionary of attribute key, value).
391 """
392 for key, value in attrs.items():
393 # Make sure strings are syntactically correct for HTCondor.
394 if isinstance(value, str):
395 pval = f'"{htc_escape(value)}"'
396 else:
397 pval = value
399 print(f"+{key} = {pval}", file=stream)
402def htc_write_condor_file(filename, job_name, job, job_attrs):
403 """Write an HTCondor submit file.
405 Parameters
406 ----------
407 filename : `str`
408 Filename for the HTCondor submit file.
409 job_name : `str`
410 Job name to use in submit file.
411 job : `RestrictedDict`
412 Submit script information.
413 job_attrs : `dict`
414 Job attributes.
415 """
416 os.makedirs(os.path.dirname(filename), exist_ok=True)
417 with open(filename, "w") as fh:
418 for key, value in job.items():
419 if value is not None:
420 if key in HTC_QUOTE_KEYS:
421 print(f'{key}="{htc_escape(value)}"', file=fh)
422 else:
423 print(f"{key}={value}", file=fh)
424 for key in ["output", "error", "log"]:
425 if key not in job:
426 filename = f"{job_name}.$(Cluster).${key[:3]}"
427 print(f"{key}={filename}", file=fh)
429 if job_attrs is not None:
430 htc_write_attribs(fh, job_attrs)
431 print("queue", file=fh)
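# Minimal sketch of writing a submit file (paths and values are hypothetical):
#
#     cmds = RestrictedDict(
#         HTC_VALID_JOB_KEYS, {"universe": "vanilla", "executable": "/usr/bin/env"}
#     )
#     htc_write_condor_file("/tmp/demo/jobs/job1.sub", "job1", cmds, {"bps_job_label": "demo"})
#     # Any of output/error/log not supplied get default entries and the job
#     # attribute is written as '+bps_job_label = "demo"'.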
434# To avoid doing the version check during every function call select
435# appropriate conversion function at the import time.
436#
437# Make sure that *each* version specific variant of the conversion function(s)
438# has the same signature after applying any changes!
439if HTC_VERSION < version.parse("8.9.8"):
441 def htc_tune_schedd_args(**kwargs):
442 """Ensure that arguments for Schedd are version appropriate.
444 The old arguments: 'requirements' and 'attr_list' of
445 'Schedd.history()', 'Schedd.query()', and 'Schedd.xquery()' were
446 deprecated in favor of 'constraint' and 'projection', respectively,
447 starting from version 8.9.8. The function will convert "new" keyword
448 arguments to "old" ones.
450 Parameters
451 ----------
452 **kwargs
453 Any keyword arguments that Schedd.history(), Schedd.query(), and
454 Schedd.xquery() accepts.
456 Returns
457 -------
458 kwargs : `dict` [`str`, Any]
459 Keyword arguments that are guaranteed to work with the Python
460 HTCondor API.
462 Notes
463 -----
464 Function doesn't validate provided keyword arguments beyond converting
465 selected arguments to their version specific form. For example,
466 it won't remove keywords that are not supported by the methods
467 mentioned earlier.
468 """
469 translation_table = {
470 "constraint": "requirements",
471 "projection": "attr_list",
472 }
473 for new, old in translation_table.items():
474 try:
475 kwargs[old] = kwargs.pop(new)
476 except KeyError:
477 pass
478 return kwargs
480else:
482 def htc_tune_schedd_args(**kwargs):
483 """Ensure that arguments for Schedd are version appropriate.
485 This is the fallback function if no version specific alterations are
486 necessary. Effectively, a no-op.
488 Parameters
489 ----------
490 **kwargs
491 Any keyword arguments that Schedd.history(), Schedd.query(), and
492 Schedd.xquery() accepts.
494 Returns
495 -------
496 kwargs : `dict` [`str`, Any]
497 Keyword arguments that were passed to the function.
498 """
499 return kwargs
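# Illustrative translation (the keyword values are arbitrary):
#
#     kwargs = htc_tune_schedd_args(constraint="JobStatus == 2", projection=["ClusterId"])
#     # With HTCondor < 8.9.8 this returns
#     #     {"requirements": "JobStatus == 2", "attr_list": ["ClusterId"]}
#     # with newer versions the keyword arguments pass through unchanged.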
502def htc_query_history(schedds, **kwargs):
503 """Fetch history records from the condor_schedd daemon.
505 Parameters
506 ----------
507 schedds : `dict` [`str`, `htcondor.Schedd`]
508 HTCondor schedulers to query for job information, keyed by name.
509 **kwargs
510 Any keyword arguments that Schedd.history() accepts.
512 Yields
513 ------
514 schedd_name : `str`
515 Name of the HTCondor scheduler managing the job queue.
516 job_ad : `dict` [`str`, Any]
517 A dictionary representing HTCondor ClassAd describing a job. It maps
518 job attribute names to values of the ClassAd expressions they
519 represent.
520 """
521 # If not set, provide defaults for positional arguments.
522 kwargs.setdefault("constraint", None)
523 kwargs.setdefault("projection", [])
524 kwargs = htc_tune_schedd_args(**kwargs)
525 for schedd_name, schedd in schedds.items():
526 for job_ad in schedd.history(**kwargs):
527 yield schedd_name, dict(job_ad)
530def htc_query_present(schedds, **kwargs):
531 """Query the condor_schedd daemon for job ads.
533 Parameters
534 ----------
535 schedds : `dict` [`str`, `htcondor.Schedd`]
536 HTCondor schedulers to query for job information, keyed by name.
537 **kwargs
538 Any keyword arguments that Schedd.xquery() accepts.
540 Yields
541 ------
542 schedd_name : `str`
543 Name of the HTCondor scheduler managing the job queue.
544 job_ad : `dict` [`str`, Any]
545 A dictionary representing HTCondor ClassAd describing a job. It maps
546 job attribute names to values of the ClassAd expressions they
547 represent.
548 """
549 kwargs = htc_tune_schedd_args(**kwargs)
550 queries = [schedd.xquery(**kwargs) for schedd in schedds.values()]
551 for query in htcondor.poll(queries):
552 schedd_name = query.tag()
553 for job_ad in query.nextAdsNonBlocking():
554 yield schedd_name, dict(job_ad)
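# Minimal sketch of querying the local scheduler (constraint and projection are
# arbitrary):
#
#     coll = htcondor.Collector()
#     schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
#     schedds = {schedd_ad["Name"]: htcondor.Schedd(schedd_ad)}
#     for schedd_name, job_ad in htc_query_present(
#         schedds, constraint="JobUniverse == 7", projection=["ClusterId", "JobStatus"]
#     ):
#         print(schedd_name, job_ad["ClusterId"], job_ad["JobStatus"])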
557def htc_version():
558 """Return the version given by the HTCondor API.
560 Returns
561 -------
562 version : `str`
563 HTCondor version as easily comparable string.
564 """
565 return str(HTC_VERSION)
568def htc_submit_dag(sub):
569 """Submit job for execution.
571 Parameters
572 ----------
573 sub : `htcondor.Submit`
574 An object representing a job submit description.
576 Returns
577 -------
578 schedd_job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
579 Information about jobs satisfying the search criteria where for each
580 Scheduler, local HTCondor job ids are mapped to their respective
581 classads.
582 """
583 coll = htcondor.Collector()
584 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
585 schedd = htcondor.Schedd(schedd_ad)
587 jobs_ads = []
588 with schedd.transaction() as txn:
589 sub.queue(txn, ad_results=jobs_ads)
591 # Submit.queue() above will raise RuntimeError if submission fails, so
592 # 'jobs_ads' should contain the ad at this point.
593 dag_ad = jobs_ads[0]
595 # Sadly, the ClassAd from Submit.queue() (see above) does not have
596 # 'GlobalJobId' so we need to run a regular query to get it anyway.
597 schedd_name = schedd_ad["Name"]
598 schedd_dag_info = condor_q(
599 constraint=f"ClusterId == {dag_ad['ClusterId']}", schedds={schedd_name: schedd}
600 )
601 return schedd_dag_info
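# Illustrative submission flow (the DAG path and submit option are hypothetical):
#
#     sub = htc_create_submit_from_dag("/path/to/submit/my_run.dag", {"maxidle": 1000})
#     schedd_dag_info = htc_submit_dag(sub)
#     # e.g. {"schedd.example.com": {"1234.0": {...DAGMan job classad...}}}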
604def htc_create_submit_from_dag(dag_filename, submit_options=None):
605 """Create a DAGMan job submit description.
607 Parameters
608 ----------
609 dag_filename : `str`
610 Name of file containing HTCondor DAG commands.
611 submit_options : `dict` [`str`, Any], optional
612 Contains extra options for the command line (a value of None means the option is a flag).
614 Returns
615 -------
616 sub : `htcondor.Submit`
617 An object representing a job submit description.
619 Notes
620 -----
621 Use with HTCondor versions which support htcondor.Submit.from_dag(),
622 i.e., 8.9.3 or newer.
623 """
624 return htcondor.Submit.from_dag(dag_filename, submit_options)
627def htc_create_submit_from_cmd(dag_filename, submit_options=None):
628 """Create a DAGMan job submit description.
630 Create a DAGMan job submit description by calling ``condor_submit_dag``
631 on given DAG description file.
633 Parameters
634 ----------
635 dag_filename : `str`
636 Name of file containing HTCondor DAG commands.
637 submit_options : `dict` [`str`, Any], optional
638 Contains extra options for the command line (a value of None means the option is a flag).
640 Returns
641 -------
642 sub : `htcondor.Submit`
643 An object representing a job submit description.
645 Notes
646 -----
647 Use with HTCondor versions which do not support htcondor.Submit.from_dag(),
648 i.e., older than 8.9.3.
649 """
650 # Run command line condor_submit_dag command.
651 cmd = "condor_submit_dag -f -no_submit -notification never -autorescue 1 -UseDagDir -no_recurse "
653 if submit_options is not None:
654 for opt, val in submit_options.items():
655 cmd += f" -{opt} {val or ''}"
656 cmd += f" {dag_filename}"
658 process = subprocess.Popen(
659 cmd.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding="utf-8"
660 )
661 process.wait()
663 if process.returncode != 0:
664 print(f"Exit code: {process.returncode}")
665 print(process.communicate()[0])
666 raise RuntimeError("Problems running condor_submit_dag")
668 return htc_create_submit_from_file(f"{dag_filename}.condor.sub")
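# A sketch of choosing the appropriate factory based on the HTCondor version
# (the threshold follows the Notes above):
#
#     if HTC_VERSION >= version.parse("8.9.3"):
#         sub = htc_create_submit_from_dag(dag_filename, submit_options)
#     else:
#         sub = htc_create_submit_from_cmd(dag_filename, submit_options)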
671def htc_create_submit_from_file(submit_file):
672 """Parse a submission file.
674 Parameters
675 ----------
676 submit_file : `str`
677 Name of the HTCondor submit file.
679 Returns
680 -------
681 sub : `htcondor.Submit`
682 An object representing a job submit description.
683 """
684 descriptors = {}
685 with open(submit_file) as fh:
686 for line in fh:
687 line = line.strip()
688 if not line.startswith("#") and not line == "queue":
689 (key, val) = re.split(r"\s*=\s*", line, maxsplit=1)
690 descriptors[key] = val
692 # Avoid UserWarning: the line 'copy_to_spool = False' was
693 # unused by Submit object. Is it a typo?
694 try:
695 del descriptors["copy_to_spool"]
696 except KeyError:
697 pass
699 return htcondor.Submit(descriptors)
702def _htc_write_job_commands(stream, name, jobs):
703 """Output the DAGMan job lines for single job in DAG.
705 Parameters
706 ----------
707 stream : `~io.TextIOBase`
708 Writeable text stream (typically an opened file).
709 name : `str`
710 Job name.
711 jobs : `RestrictedDict`
712 DAG job keys and values.
713 """
714 if "pre" in jobs:
715 print(
716 f"SCRIPT {jobs['pre'].get('defer', '')} PRE {name}"
717 f"{jobs['pre']['executable']} {jobs['pre'].get('arguments', '')}",
718 file=stream,
719 )
721 if "post" in jobs:
722 print(
723 f"SCRIPT {jobs['post'].get('defer', '')} PRE {name}"
724 f"{jobs['post']['executable']} {jobs['post'].get('arguments', '')}",
725 file=stream,
726 )
728 if "vars" in jobs:
729 for key, value in jobs["vars"]:
730 print(f'VARS {name} {key}="{htc_escape(value)}"', file=stream)
732 if "pre_skip" in jobs:
733 print(f"PRE_SKIP {name} {jobs['pre_skip']}", file=stream)
735 if "retry" in jobs and jobs["retry"]:
736 print(f"RETRY {name} {jobs['retry']} ", end="", file=stream)
737 if "retry_unless_exit" in jobs:
738 print(f"UNLESS-EXIT {jobs['retry_unless_exit']}", end="", file=stream)
739 print("\n", file=stream)
741 if "abort_dag_on" in jobs and jobs["abort_dag_on"]:
742 print(
743 f"ABORT-DAG-ON {name} {jobs['abort_dag_on']['node_exit']}"
744 f" RETURN {jobs['abort_dag_on']['abort_exit']}",
745 file=stream,
746 )
749class HTCJob:
750 """HTCondor job for use in building DAG.
752 Parameters
753 ----------
754 name : `str`
755 Name of the job.
756 label : `str`
757 Label that can be used for grouping or lookup.
758 initcmds : `RestrictedDict`
759 Initial job commands for submit file.
760 initdagcmds : `RestrictedDict`
761 Initial commands for job inside DAG.
762 initattrs : `dict`
763 Initial dictionary of job attributes.
764 """
766 def __init__(self, name, label=None, initcmds=(), initdagcmds=(), initattrs=None):
767 self.name = name
768 self.label = label
769 self.cmds = RestrictedDict(HTC_VALID_JOB_KEYS, initcmds)
770 self.dagcmds = RestrictedDict(HTC_VALID_JOB_DAG_KEYS, initdagcmds)
771 self.attrs = initattrs
772 self.subfile = None
774 def __str__(self):
775 return self.name
777 def add_job_cmds(self, new_commands):
778 """Add commands to Job (overwrite existing).
780 Parameters
781 ----------
782 new_commands : `dict`
783 Submit file commands to be added to Job.
784 """
785 self.cmds.update(new_commands)
787 def add_dag_cmds(self, new_commands):
788 """Add DAG commands to Job (overwrite existing).
790 Parameters
791 ----------
792 new_commands : `dict`
793 DAG file commands to be added to Job.
794 """
795 self.dagcmds.update(new_commands)
797 def add_job_attrs(self, new_attrs):
798 """Add attributes to Job (overwrite existing).
800 Parameters
801 ----------
802 new_attrs : `dict`
803 Attributes to be added to Job.
804 """
805 if self.attrs is None:
806 self.attrs = {}
807 if new_attrs:
808 self.attrs.update(new_attrs)
810 def write_submit_file(self, submit_path, job_subdir=""):
811 """Write job description to submit file.
813 Parameters
814 ----------
815 submit_path : `str`
816 Prefix path for the submit file.
817 job_subdir : `str`, optional
818 Template for job subdir.
819 """
820 if not self.subfile:
821 self.subfile = f"{self.name}.sub"
822 job_subdir = job_subdir.format(self=self)
823 if job_subdir:
824 self.subfile = os.path.join(job_subdir, self.subfile)
825 htc_write_condor_file(os.path.join(submit_path, self.subfile), self.name, self.cmds, self.attrs)
827 def write_dag_commands(self, stream):
828 """Write DAG commands for single job to output stream.
830 Parameters
831 ----------
832 stream : `IO` or `str`
833 Output Stream.
834 """
835 print(f"JOB {self.name} {self.subfile}", file=stream)
836 _htc_write_job_commands(stream, self.name, self.dagcmds)
838 def dump(self, fh):
839 """Dump job information to output stream.
841 Parameters
842 ----------
843 fh : `~io.TextIOBase`
844 Output stream.
845 """
846 printer = pprint.PrettyPrinter(indent=4, stream=fh)
847 printer.pprint(self.name)
848 printer.pprint(self.cmds)
849 printer.pprint(self.attrs)
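# Illustrative construction of a job (names, paths, and values are hypothetical):
#
#     job = HTCJob(
#         "pipetask_calibrate_01",
#         label="calibrate",
#         initcmds={"universe": "vanilla", "executable": "/path/to/run_quantum.sh",
#                   "request_memory": "2048"},
#         initattrs={"bps_job_label": "calibrate"},
#     )
#     job.add_dag_cmds({"retry": 3})
#     # job.write_submit_file(...) is normally invoked for each job by HTCDag.write().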
852class HTCDag(networkx.DiGraph):
853 """HTCondor DAG.
855 Parameters
856 ----------
857 data : networkx.DiGraph.data
858 Initial graph.
859 name : `str`
860 Name for DAG.
861 """
863 def __init__(self, data=None, name=""):
864 super().__init__(data=data, name=name)
866 self.graph["attr"] = {}
867 self.graph["run_id"] = None
868 self.graph["submit_path"] = None
869 self.graph["final_job"] = None
871 def __str__(self):
872 """Represent basic DAG info as string.
874 Returns
875 -------
876 info : `str`
877 String containing basic DAG info.
878 """
879 return f"{self.graph['name']} {len(self)}"
881 def add_attribs(self, attribs=None):
882 """Add attributes to the DAG.
884 Parameters
885 ----------
886 attribs : `dict`
887 DAG attributes.
888 """
889 if attribs is not None:
890 self.graph["attr"].update(attribs)
892 def add_job(self, job, parent_names=None, child_names=None):
893 """Add an HTCJob to the HTCDag.
895 Parameters
896 ----------
897 job : `HTCJob`
898 HTCJob to add to the HTCDag.
899 parent_names : `~collections.abc.Iterable` [`str`], optional
900 Names of parent jobs.
901 child_names : `~collections.abc.Iterable` [`str`], optional
902 Names of child jobs.
903 """
904 assert isinstance(job, HTCJob)
906 # Add dag level attributes to each job
907 job.add_job_attrs(self.graph["attr"])
909 self.add_node(job.name, data=job)
911 if parent_names is not None:
912 self.add_job_relationships(parent_names, [job.name])
914 if child_names is not None:
915 self.add_job_relationships([job.name], child_names)
917 def add_job_relationships(self, parents, children):
918 """Add DAG edge between parents and children jobs.
920 Parameters
921 ----------
922 parents : `list` [`str`]
923 Contains parent job name(s).
924 children : `list` [`str`]
925 Contains children job name(s).
926 """
927 self.add_edges_from(itertools.product(parents, children))
929 def add_final_job(self, job):
930 """Add an HTCJob for the FINAL job in HTCDag.
932 Parameters
933 ----------
934 job : `HTCJob`
935 HTCJob to add to the HTCDag as a FINAL job.
936 """
937 # Add dag level attributes to each job
938 job.add_job_attrs(self.graph["attr"])
940 self.graph["final_job"] = job
942 def del_job(self, job_name):
943 """Delete the job from the DAG.
945 Parameters
946 ----------
947 job_name : `str`
948 Name of job in DAG to delete.
949 """
950 # Reconnect edges around node to delete
951 parents = self.predecessors(job_name)
952 children = self.successors(job_name)
953 self.add_edges_from(itertools.product(parents, children))
955 # Delete job node (which deletes its edges).
956 self.remove_node(job_name)
958 def write(self, submit_path, job_subdir=""):
959 """Write DAG to a file.
961 Parameters
962 ----------
963 submit_path : `str`
964 Prefix path for dag filename to be combined with DAG name.
965 job_subdir : `str`, optional
966 Template for job subdir.
967 """
968 self.graph["submit_path"] = submit_path
969 self.graph["dag_filename"] = os.path.join(submit_path, f"{self.graph['name']}.dag")
970 os.makedirs(submit_path, exist_ok=True)
971 with open(self.graph["dag_filename"], "w") as fh:
972 for _, nodeval in self.nodes().items():
973 job = nodeval["data"]
974 job.write_submit_file(submit_path, job_subdir)
975 job.write_dag_commands(fh)
976 for edge in self.edges():
977 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh)
978 print(f"DOT {self.name}.dot", file=fh)
979 print(f"NODE_STATUS_FILE {self.name}.node_status", file=fh)
981 # Add bps attributes to dag submission
982 for key, value in self.graph["attr"].items():
983 print(f'SET_JOB_ATTR {key}= "{htc_escape(value)}"', file=fh)
985 if self.graph["final_job"]:
986 job = self.graph["final_job"]
987 job.write_submit_file(submit_path, job_subdir)
988 print(f"FINAL {job.name} {job.subfile}", file=fh)
989 if "pre" in job.dagcmds:
990 print(f"SCRIPT PRE {job.name} {job.dagcmds['pre']}", file=fh)
991 if "post" in job.dagcmds:
992 print(f"SCRIPT POST {job.name} {job.dagcmds['post']}", file=fh)
994 def dump(self, fh):
995 """Dump DAG info to output stream.
997 Parameters
998 ----------
999 fh : `io.IO` or `str`
1000 Where to dump DAG info as text.
1001 """
1002 for key, value in self.graph.items():
1003 print(f"{key}={value}", file=fh)
1004 for name, data in self.nodes().items():
1005 print(f"{name}:", file=fh)
1006 data["data"].dump(fh)
1007 for edge in self.edges():
1008 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh)
1009 if self.graph["final_job"]:
1010 print(f'FINAL {self.graph["final_job"].name}:', file=fh)
1011 self.graph["final_job"].dump(fh)
1013 def write_dot(self, filename):
1014 """Write a dot version of the DAG.
1016 Parameters
1017 ----------
1018 filename : `str`
1019 Name of the dot file.
1020 """
1021 pos = networkx.nx_agraph.graphviz_layout(self)
1022 networkx.draw(self, pos=pos)
1023 networkx.drawing.nx_pydot.write_dot(self, filename)
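# Minimal sketch of assembling and writing a DAG (all names and paths are
# hypothetical; see the HTCJob sketch above):
#
#     dag = HTCDag(name="u_jdoe_my_run")
#     dag.add_attribs({"bps_run": "u_jdoe_my_run_20240101T000000Z"})
#     dag.add_job(job_init)
#     dag.add_job(job_calibrate, parent_names=[job_init.name])
#     dag.write("/path/to/submit/dir", job_subdir="jobs/{self.label}")
#     # write() produces <name>.dag plus one *.sub file per job under jobs/<label>/.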
1026def condor_q(constraint=None, schedds=None, **kwargs):
1027 """Get information about the jobs in the HTCondor job queue(s).
1029 Parameters
1030 ----------
1031 constraint : `str`, optional
1032 Constraints to be passed to job query.
1033 schedds : `dict` [`str`, `htcondor.Schedd`], optional
1034 HTCondor schedulers to query for job information. If None
1035 (default), the query will be run against the local scheduler only.
1036 **kwargs : `~typing.Any`
1037 Additional keyword arguments that need to be passed to the internal
1038 query method.
1040 Returns
1041 -------
1042 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
1043 Information about jobs satisfying the search criteria where for each
1044 Scheduler, local HTCondor job ids are mapped to their respective
1045 classads.
1046 """
1047 return condor_query(constraint, schedds, htc_query_present, **kwargs)
1050def condor_history(constraint=None, schedds=None, **kwargs):
1051 """Get information about the jobs from HTCondor history records.
1053 Parameters
1054 ----------
1055 constraint : `str`, optional
1056 Constraints to be passed to job query.
1057 schedds : `dict` [`str`, `htcondor.Schedd`], optional
1058 HTCondor schedulers to query for job information. If None
1059 (default), the query will be run against the history file of
1060 the local scheduler only.
1061 **kwargs : `~typing.Any`
1062 Additional keyword arguments that need to be passed to the internal
1063 query method.
1065 Returns
1066 -------
1067 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
1068 Information about jobs satisfying the search criteria where for each
1069 Scheduler, local HTCondor job ids are mapped to their respective
1070 classads.
1071 """
1072 return condor_query(constraint, schedds, htc_query_history, **kwargs)
1075def condor_query(constraint=None, schedds=None, query_func=htc_query_present, **kwargs):
1076 """Get information about HTCondor jobs.
1078 Parameters
1079 ----------
1080 constraint : `str`, optional
1081 Constraints to be passed to job query.
1082 schedds : `dict` [`str`, `htcondor.Schedd`], optional
1083 HTCondor schedulers to query for job information. If None
1084 (default), the query will be run against the history file of
1085 the local scheduler only.
1086 query_func : callable
1087 A query function which takes the following arguments:
1089 - ``schedds``: Schedulers to query (`list` [`htcondor.Schedd`]).
1090 - ``**kwargs``: Keyword arguments that will be passed to the query
1091 function.
1092 **kwargs : `~typing.Any`
1093 Additional keyword arguments that need to be passed to the query
1094 method.
1096 Returns
1097 -------
1098 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
1099 Information about jobs satisfying the search criteria where for each
1100 Scheduler, local HTCondor job ids are mapped to their respective
1101 classads.
1102 """
1103 if not schedds:
1104 coll = htcondor.Collector()
1105 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
1106 schedds = {schedd_ad["Name"]: htcondor.Schedd(schedd_ad)}
1108 # Make sure that 'ClusterId' and 'ProcId' attributes are always included
1109 # in the job classad. They are needed to construct the job id.
1110 added_attrs = set()
1111 if "projection" in kwargs and kwargs["projection"]:
1112 requested_attrs = set(kwargs["projection"])
1113 required_attrs = {"ClusterId", "ProcId"}
1114 added_attrs = required_attrs - requested_attrs
1115 for attr in added_attrs:
1116 kwargs["projection"].append(attr)
1118 unwanted_attrs = {"Env", "Environment"} | added_attrs
1119 job_info = defaultdict(dict)
1120 for schedd_name, job_ad in query_func(schedds, constraint=constraint, **kwargs):
1121 id_ = f"{job_ad['ClusterId']}.{job_ad['ProcId']}"
1122 for attr in set(job_ad) & unwanted_attrs:
1123 del job_ad[attr]
1124 job_info[schedd_name][id_] = job_ad
1125 _LOG.debug("query returned %d jobs", sum(len(val) for val in job_info.values()))
1127 # Restore the list of the requested attributes to its original value
1128 # if needed.
1129 if added_attrs:
1130 for attr in added_attrs:
1131 kwargs["projection"].remove(attr)
1133 # When returning the results filter out entries for schedulers with no jobs
1134 # matching the search criteria.
1135 return {key: val for key, val in job_info.items() if val}
1138def condor_search(constraint=None, hist=None, schedds=None):
1139 """Search for running and finished jobs satisfying given criteria.
1141 Parameters
1142 ----------
1143 constraint : `str`, optional
1144 Constraints to be passed to job query.
1145 hist : `float`
1146 Limit history search to this many days.
1147 schedds : `dict` [`str`, `htcondor.Schedd`], optional
1148 HTCondor schedulers to query for job information, keyed by name.
1149 If None (default), only the local scheduler will be queried.
1151 Returns
1152 -------
1153 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
1154 Information about jobs satisfying the search criteria where for each
1155 Scheduler, local HTCondor job ids are mapped to their respective
1156 classads.
1157 """
1158 if not schedds:
1159 coll = htcondor.Collector()
1160 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
1161 schedds = {schedd_ad["Name"]: htcondor.Schedd(locate_ad=schedd_ad)}
1163 job_info = condor_q(constraint=constraint, schedds=schedds)
1164 if hist is not None:
1165 epoch = (datetime.now() - timedelta(days=hist)).timestamp()
1166 constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
1167 hist_info = condor_history(constraint, schedds=schedds)
1168 update_job_info(job_info, hist_info)
1169 return job_info
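# Illustrative search across the queue plus one day of history (the constraint
# attribute is hypothetical):
#
#     job_info = condor_search(constraint='bps_run == "u_jdoe_my_run"', hist=1.0)
#     # -> {"schedd.example.com": {"1234.0": {...}, "1235.0": {...}}}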
1172def condor_status(constraint=None, coll=None):
1173 """Get information about HTCondor pool.
1175 Parameters
1176 ----------
1177 constraint : `str`, optional
1178 Constraints to be passed to the query.
1179 coll : `htcondor.Collector`, optional
1180 Object representing HTCondor collector daemon.
1182 Returns
1183 -------
1184 pool_info : `dict` [`str`, `dict` [`str`, Any]]
1185 Mapping between HTCondor slot names and slot information (classAds).
1186 """
1187 if coll is None:
1188 coll = htcondor.Collector()
1189 try:
1190 pool_ads = coll.query(constraint=constraint)
1191 except OSError as ex:
1192 raise RuntimeError(f"Problem querying the Collector. (Constraint='{constraint}')") from ex
1194 pool_info = {}
1195 for slot in pool_ads:
1196 pool_info[slot["name"]] = dict(slot)
1197 _LOG.debug("condor_status returned %d ads", len(pool_info))
1198 return pool_info
1201def update_job_info(job_info, other_info):
1202 """Update results of a job query with results from another query.
1204 Parameters
1205 ----------
1206 job_info : `dict` [`str`, `dict` [`str`, Any]]
1207 Results of the job query that needs to be updated.
1208 other_info : `dict` [`str`, `dict` [`str`, Any]]
1209 Results of the other job query.
1211 Returns
1212 -------
1213 job_info : `dict` [`str`, `dict` [`str`, Any]]
1214 The updated results.
1215 """
1216 for schedd_name, others in other_info.items():
1217 try:
1218 jobs = job_info[schedd_name]
1219 except KeyError:
1220 job_info[schedd_name] = others
1221 else:
1222 for id_, ad in others.items():
1223 jobs.setdefault(id_, {}).update(ad)
1224 return job_info
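# Illustrative merge semantics (the ads are truncated):
#
#     queue_info = {"schedd1": {"1.0": {"JobStatus": JobStatus.RUNNING}}}
#     hist_info = {"schedd1": {"1.0": {"ExitCode": 0}}, "schedd2": {"7.0": {}}}
#     update_job_info(queue_info, hist_info)
#     # queue_info["schedd1"]["1.0"] now holds both keys and "schedd2" was added.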
1227def summary_from_dag(dir_name):
1228 """Build bps_run_summary string from dag file.
1230 Parameters
1231 ----------
1232 dir_name : `str`
1233 Path that includes dag file for a run.
1235 Returns
1236 -------
1237 summary : `str`
1238 Semi-colon separated list of job labels and counts.
1239 (Same format as saved in dag classad).
1240 job_name_to_pipetask : `dict` [`str`, `str`]
1241 Mapping of job names to job labels.
1242 """
1243 dag = next(Path(dir_name).glob("*.dag"))
1245 # Later code depends upon insertion order
1246 counts = defaultdict(int)
1247 job_name_to_pipetask = {}
1248 try:
1249 with open(dag) as fh:
1250 for line in fh:
1251 if line.startswith("JOB"):
1252 m = re.match(r"JOB ([^\s]+) jobs/([^/]+)/", line)
1253 if m:
1254 label = m.group(2)
1255 if label == "init":
1256 label = "pipetaskInit"
1257 job_name_to_pipetask[m.group(1)] = label
1258 counts[label] += 1
1259 else: # Check if Pegasus submission
1260 m = re.match(r"JOB ([^\s]+) ([^\s]+)", line)
1261 if m:
1262 label = pegasus_name_to_label(m.group(1))
1263 job_name_to_pipetask[m.group(1)] = label
1264 counts[label] += 1
1265 else:
1266 _LOG.warning("Parse DAG: unmatched job line: %s", line)
1267 elif line.startswith("FINAL"):
1268 m = re.match(r"FINAL ([^\s]+) jobs/([^/]+)/", line)
1269 if m:
1270 label = m.group(2)
1271 job_name_to_pipetask[m.group(1)] = label
1272 counts[label] += 1
1274 except (OSError, PermissionError, StopIteration):
1275 pass
1277 summary = ";".join([f"{name}:{counts[name]}" for name in counts])
1278 _LOG.debug("summary_from_dag: %s %s", summary, job_name_to_pipetask)
1279 return summary, job_name_to_pipetask
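# Illustrative output for a run submit directory (labels and counts are
# hypothetical):
#
#     summary, name_to_label = summary_from_dag("/path/to/submit/dir")
#     # summary       -> "pipetaskInit:1;calibrate:10;finalJob:1"
#     # name_to_label -> {"pipetask_calibrate_01": "calibrate", ...}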
1282def pegasus_name_to_label(name):
1283 """Convert pegasus job name to a label for the report.
1285 Parameters
1286 ----------
1287 name : `str`
1288 Name of job.
1290 Returns
1291 -------
1292 label : `str`
1293 Label for job.
1294 """
1295 label = "UNK"
1296 if name.startswith("create_dir") or name.startswith("stage_in") or name.startswith("stage_out"):
1297 label = "pegasus"
1298 else:
1299 m = re.match(r"pipetask_(\d+_)?([^_]+)", name)
1300 if m:
1301 label = m.group(2)
1302 if label == "init":
1303 label = "pipetaskInit"
1305 return label
1308def read_dag_status(wms_path):
1309 """Read the node status file for DAG summary information.
1311 Parameters
1312 ----------
1313 wms_path : `str`
1314 Path that includes node status file for a run.
1316 Returns
1317 -------
1318 dag_ad : `dict` [`str`, Any]
1319 DAG summary information.
1320 """
1321 dag_ad = {}
1323 # While this is probably more up to date than dag classad, only read from
1324 # the file if needed.
1325 try:
1326 try:
1327 node_stat_file = next(Path(wms_path).glob("*.node_status"))
1328 _LOG.debug("Reading Node Status File %s", node_stat_file)
1329 with open(node_stat_file) as infh:
1330 dag_ad = classad.parseNext(infh) # pylint: disable=E1101
1331 except StopIteration:
1332 pass
1334 if not dag_ad:
1335 # Pegasus check here
1336 try:
1337 metrics_file = next(Path(wms_path).glob("*.dag.metrics"))
1338 with open(metrics_file) as infh:
1339 metrics = json.load(infh)
1340 dag_ad["NodesTotal"] = metrics.get("jobs", 0)
1341 dag_ad["NodesFailed"] = metrics.get("jobs_failed", 0)
1342 dag_ad["NodesDone"] = metrics.get("jobs_succeeded", 0)
1343 dag_ad["pegasus_version"] = metrics.get("planner_version", "")
1344 except StopIteration:
1345 try:
1346 metrics_file = next(Path(wms_path).glob("*.metrics"))
1347 with open(metrics_file) as infh:
1348 metrics = json.load(infh)
1349 dag_ad["NodesTotal"] = metrics["wf_metrics"]["total_jobs"]
1350 dag_ad["pegasus_version"] = metrics.get("version", "")
1351 except StopIteration:
1352 pass
1353 except (OSError, PermissionError):
1354 pass
1356 _LOG.debug("read_dag_status: %s", dag_ad)
1357 return dict(dag_ad)
1360def read_node_status(wms_path):
1361 """Read entire node status file.
1363 Parameters
1364 ----------
1365 wms_path : `str`
1366 Path that includes node status file for a run.
1368 Returns
1369 -------
1370 jobs : `dict` [`str`, Any]
1371 DAG summary information.
1372 """
1373 # Get jobid info from other places to fill in gaps in info from node_status
1374 _, job_name_to_pipetask = summary_from_dag(wms_path)
1375 wms_workflow_id, loginfo = read_dag_log(wms_path)
1376 loginfo = read_dag_nodes_log(wms_path)
1377 _LOG.debug("loginfo = %s", loginfo)
1378 job_name_to_id = {}
1379 for jid, jinfo in loginfo.items():
1380 if "LogNotes" in jinfo:
1381 m = re.match(r"DAG Node: ([^\s]+)", jinfo["LogNotes"])
1382 if m:
1383 job_name_to_id[m.group(1)] = jid
1384 jinfo["DAGNodeName"] = m.group(1)
1386 try:
1387 node_status = next(Path(wms_path).glob("*.node_status"))
1388 except StopIteration:
1389 return loginfo
1391 jobs = {}
1392 fake_id = -1.0 # For nodes that do not yet have a job id, give a fake one
1393 try:
1394 with open(node_status) as fh:
1395 ads = classad.parseAds(fh)
1397 for jclassad in ads:
1398 if jclassad["Type"] == "DagStatus":
1399 # skip DAG summary
1400 pass
1401 elif "Node" not in jclassad:
1402 if jclassad["Type"] != "StatusEnd":
1403 _LOG.debug("Key 'Node' not in classad: %s", jclassad)
1404 break
1405 else:
1406 if jclassad["Node"] in job_name_to_pipetask:
1407 try:
1408 label = job_name_to_pipetask[jclassad["Node"]]
1409 except KeyError:
1410 _LOG.error("%s not in %s", jclassad["Node"], job_name_to_pipetask.keys())
1411 raise
1412 elif "_" in jclassad["Node"]:
1413 label = jclassad["Node"].split("_")[1]
1414 else:
1415 label = jclassad["Node"]
1417 # Make job info as if came from condor_q
1418 if jclassad["Node"] in job_name_to_id:
1419 job_id = job_name_to_id[jclassad["Node"]]
1420 else:
1421 job_id = str(fake_id)
1422 fake_id -= 1
1424 job = dict(jclassad)
1425 job["ClusterId"] = int(float(job_id))
1426 job["DAGManJobID"] = wms_workflow_id
1427 job["DAGNodeName"] = jclassad["Node"]
1428 job["bps_job_label"] = label
1430 jobs[str(job_id)] = job
1431 except (OSError, PermissionError):
1432 pass
1434 return jobs
1437def read_dag_log(wms_path):
1438 """Read job information from the DAGMan log file.
1440 Parameters
1441 ----------
1442 wms_path : `str`
1443 Path containing the DAGMan log file.
1445 Returns
1446 -------
1447 wms_workflow_id : `str`
1448 HTCondor job id (i.e., <ClusterId>.<ProcId>) of the DAGMan job.
1449 dag_info : `dict` [`str`, `~collections.abc.Any`]
1450 HTCondor job information read from the log file mapped to HTCondor
1451 job id.
1453 Raises
1454 ------
1455 FileNotFoundError
1456 If cannot find DAGMan log in given wms_path.
1457 """
1458 wms_workflow_id = 0
1459 dag_info = {}
1461 path = Path(wms_path)
1462 if path.exists():
1463 try:
1464 filename = next(path.glob("*.dag.dagman.log"))
1465 except StopIteration as exc:
1466 raise FileNotFoundError(f"DAGMan log not found in {wms_path}") from exc
1467 _LOG.debug("dag node log filename: %s", filename)
1469 info = {}
1470 job_event_log = htcondor.JobEventLog(str(filename))
1471 for event in job_event_log.events(stop_after=0):
1472 id_ = f"{event['Cluster']}.{event['Proc']}"
1473 if id_ not in info:
1474 info[id_] = {}
1475 wms_workflow_id = id_ # taking last job id in case of restarts
1476 info[id_].update(event)
1477 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"]
1479 # only save latest DAG job
1480 dag_info = {wms_workflow_id: info[wms_workflow_id]}
1481 for job in dag_info.values():
1482 _tweak_log_info(filename, job)
1484 return wms_workflow_id, dag_info
1487def read_dag_nodes_log(wms_path):
1488 """Read job information from the DAGMan nodes log file.
1490 Parameters
1491 ----------
1492 wms_path : `str`
1493 Path containing the DAGMan nodes log file.
1495 Returns
1496 -------
1497 info : `dict` [`str`, Any]
1498 HTCondor job information read from the log file mapped to HTCondor
1499 job id.
1501 Raises
1502 ------
1503 FileNotFoundError
1504 If cannot find DAGMan node log in given wms_path.
1505 """
1506 try:
1507 filename = next(Path(wms_path).glob("*.dag.nodes.log"))
1508 except StopIteration as exc:
1509 raise FileNotFoundError(f"DAGMan node log not found in {wms_path}") from exc
1510 _LOG.debug("dag node log filename: %s", filename)
1512 info = {}
1513 job_event_log = htcondor.JobEventLog(str(filename))
1514 for event in job_event_log.events(stop_after=0):
1515 id_ = f"{event['Cluster']}.{event['Proc']}"
1516 if id_ not in info:
1517 info[id_] = {}
1518 info[id_].update(event)
1519 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"]
1521 # Add more condor_q-like info to info parsed from log file.
1522 for job in info.values():
1523 _tweak_log_info(filename, job)
1525 return info
1528def read_dag_info(wms_path):
1529 """Read custom DAGMan job information from the file.
1531 Parameters
1532 ----------
1533 wms_path : `str`
1534 Path containing the file with the DAGMan job info.
1536 Returns
1537 -------
1538 dag_info : `dict` [`str`, `dict` [`str`, Any]]
1539 HTCondor job information.
1541 Raises
1542 ------
1543 FileNotFoundError
1544 If cannot find DAGMan job info file in the given location.
1545 """
1546 try:
1547 filename = next(Path(wms_path).glob("*.info.json"))
1548 except StopIteration as exc:
1549 raise FileNotFoundError(f"File with DAGMan job information not found in {wms_path}") from exc
1550 _LOG.debug("DAGMan job information filename: %s", filename)
1551 try:
1552 with open(filename) as fh:
1553 dag_info = json.load(fh)
1554 except (OSError, PermissionError) as exc:
1555 _LOG.debug("Retrieving DAGMan job information failed: %s", exc)
1556 dag_info = {}
1557 return dag_info
1560def write_dag_info(filename, dag_info):
1561 """Write custom job information about DAGMan job.
1563 Parameters
1564 ----------
1565 filename : `str`
1566 Name of the file where the information will be stored.
1567 dag_info : `dict` [`str`, `dict` [`str`, Any]]
1568 Information about the DAGMan job.
1569 """
1570 schedd_name = next(iter(dag_info))
1571 dag_id = next(iter(dag_info[schedd_name]))
1572 dag_ad = dag_info[schedd_name][dag_id]
1573 try:
1574 with open(filename, "w") as fh:
1575 info = {
1576 schedd_name: {
1577 dag_id: {"ClusterId": dag_ad["ClusterId"], "GlobalJobId": dag_ad["GlobalJobId"]}
1578 }
1579 }
1580 json.dump(info, fh)
1581 except (KeyError, OSError, PermissionError) as exc:
1582 _LOG.debug("Persisting DAGMan job information failed: %s", exc)
1585def _tweak_log_info(filename, job):
1586 """Massage the given job info has same structure as if came from condor_q.
1588 Parameters
1589 ----------
1590 filename : `pathlib.Path`
1591 Name of the DAGMan log.
1592 job : `dict` [`str`, Any]
1593 Information about a single HTCondor job read from the log; updated
1594 in place.
1595 """
1596 _LOG.debug("_tweak_log_info: %s %s", filename, job)
1597 try:
1598 job["ClusterId"] = job["Cluster"]
1599 job["ProcId"] = job["Proc"]
1600 job["Iwd"] = str(filename.parent)
1601 job["Owner"] = filename.owner()
1602 if job["MyType"] == "ExecuteEvent":
1603 job["JobStatus"] = JobStatus.RUNNING
1604 elif job["MyType"] == "JobTerminatedEvent" or job["MyType"] == "PostScriptTerminatedEvent":
1605 job["JobStatus"] = JobStatus.COMPLETED
1606 try:
1607 if not job["TerminatedNormally"]:
1608 if "ReturnValue" in job:
1609 job["ExitCode"] = job["ReturnValue"]
1610 job["ExitBySignal"] = False
1611 elif "TerminatedBySignal" in job:
1612 job["ExitBySignal"] = True
1613 job["ExitSignal"] = job["TerminatedBySignal"]
1614 else:
1615 _LOG.warning("Could not determine exit status for completed job: %s", job)
1616 except KeyError as ex:
1617 _LOG.error("Could not determine exit status for job (missing %s): %s", str(ex), job)
1618 elif job["MyType"] == "SubmitEvent":
1619 job["JobStatus"] = JobStatus.IDLE
1620 elif job["MyType"] == "JobAbortedEvent":
1621 job["JobStatus"] = JobStatus.REMOVED
1622 else:
1623 _LOG.debug("Unknown log event type: %s", job["MyType"])
1624 except KeyError:
1625 _LOG.error("Missing key in job: %s", job)
1626 raise
1629def htc_check_dagman_output(wms_path):
1630 """Check the DAGMan output for error messages.
1632 Parameters
1633 ----------
1634 wms_path : `str`
1635 Directory containing the DAGman output file.
1637 Returns
1638 -------
1639 message : `str`
1640 Message containing error messages from the DAGMan output. Empty
1641 string if no messages.
1643 Raises
1644 ------
1645 FileNotFoundError
1646 If cannot find DAGMan standard output file in given wms_path.
1647 """
1648 try:
1649 filename = next(Path(wms_path).glob("*.dag.dagman.out"))
1650 except StopIteration as exc:
1651 raise FileNotFoundError(f"DAGMan standard output file not found in {wms_path}") from exc
1652 _LOG.debug("dag output filename: %s", filename)
1654 message = ""
1655 try:
1656 with open(filename) as fh:
1657 last_submit_failed = ""
1658 for line in fh:
1659 m = re.match(r"(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) Job submit try \d+/\d+ failed", line)
1660 if m:
1661 last_submit_failed = m.group(1)
1662 if last_submit_failed:
1663 message = f"Warn: Job submission issues (last: {last_submit_failed})"
1664 except (OSError, PermissionError):
1665 message = f"Warn: Could not read dagman output file from {wms_path}."
1666 return message