Coverage for python/lsst/ctrl/bps/htcondor/lssthtc.py: 13%
604 statements
1# This file is part of ctrl_bps_htcondor.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <https://www.gnu.org/licenses/>.
28"""Placeholder HTCondor DAGMan API.
30There is new work on a python DAGMan API from HTCondor. However, at this
31time, it tries to make things easier by assuming the DAG can easily be
32broken into levels with 1-1 or all-to-all relationships between nodes in
33the next level. LSST workflows are more complicated.
34"""
36__all__ = [
37 "DagStatus",
38 "JobStatus",
39 "NodeStatus",
40 "RestrictedDict",
41 "HTCJob",
42 "HTCDag",
43 "htc_backup_files",
44 "htc_check_dagman_output",
45 "htc_create_submit_from_cmd",
46 "htc_create_submit_from_dag",
47 "htc_create_submit_from_file",
48 "htc_escape",
49 "htc_write_attribs",
50 "htc_write_condor_file",
51 "htc_query_history",
52 "htc_query_present",
53 "htc_version",
54 "htc_submit_dag",
55 "condor_history",
56 "condor_q",
57 "condor_search",
58 "condor_status",
59 "update_job_info",
60 "MISSING_ID",
61 "summary_from_dag",
62 "read_dag_info",
63 "read_dag_log",
64 "read_dag_nodes_log",
65 "read_dag_status",
66 "read_node_status",
67 "write_dag_info",
68 "pegasus_name_to_label",
69]
72import itertools
73import json
74import logging
75import os
76import pprint
77import re
78import subprocess
79from collections import defaultdict
80from collections.abc import MutableMapping
81from datetime import datetime, timedelta
82from enum import IntEnum
83from pathlib import Path
85import classad
86import htcondor
87import networkx
88from packaging import version
90from .handlers import HTC_JOB_AD_HANDLERS
92_LOG = logging.getLogger(__name__)
94MISSING_ID = -99999
97class DagStatus(IntEnum):
98 """HTCondor DAGMan's statuses for a DAG."""
100 OK = 0
101 ERROR = 1 # an error condition different than those listed here
102 FAILED = 2 # one or more nodes in the DAG have failed
103 ABORTED = 3 # the DAG has been aborted by an ABORT-DAG-ON specification
104 REMOVED = 4 # the DAG has been removed by condor_rm
105 CYCLE = 5 # a cycle was found in the DAG
106 SUSPENDED = 6 # the DAG has been suspended (see section 2.10.8)
109class JobStatus(IntEnum):
110 """HTCondor's statuses for jobs."""
112 UNEXPANDED = 0 # Unexpanded
113 IDLE = 1 # Idle
114 RUNNING = 2 # Running
115 REMOVED = 3 # Removed
116 COMPLETED = 4 # Completed
117 HELD = 5 # Held
118 TRANSFERRING_OUTPUT = 6 # Transferring_Output
119 SUSPENDED = 7 # Suspended
122class NodeStatus(IntEnum):
123 """HTCondor's statuses for DAGman nodes."""
125 # (STATUS_NOT_READY): At least one parent has not yet finished or the node
126 # is a FINAL node.
127 NOT_READY = 0
129 # (STATUS_READY): All parents have finished, but the node is not yet
130 # running.
131 READY = 1
133 # (STATUS_PRERUN): The node’s PRE script is running.
134 PRERUN = 2
136 # (STATUS_SUBMITTED): The node’s HTCondor job(s) are in the queue.
137 # StatusDetails = "not_idle" -> running.
138 # JobProcsHeld = 1-> hold.
139 # JobProcsQueued = 1 -> idle.
140 SUBMITTED = 3
142 # (STATUS_POSTRUN): The node’s POST script is running.
143 POSTRUN = 4
145 # (STATUS_DONE): The node has completed successfully.
146 DONE = 5
148 # (STATUS_ERROR): The node has failed. StatusDetails has info (e.g.,
149 # ULOG_JOB_ABORTED for deleted job).
150 ERROR = 6
153HTC_QUOTE_KEYS = {"environment"}
154HTC_VALID_JOB_KEYS = {
155 "universe",
156 "executable",
157 "arguments",
158 "environment",
159 "log",
160 "error",
161 "output",
162 "should_transfer_files",
163 "when_to_transfer_output",
164 "getenv",
165 "notification",
166 "notify_user",
167 "concurrency_limit",
168 "transfer_executable",
169 "transfer_input_files",
170 "transfer_output_files",
171 "request_cpus",
172 "request_memory",
173 "request_disk",
174 "priority",
175 "category",
176 "requirements",
177 "on_exit_hold",
178 "on_exit_hold_reason",
179 "on_exit_hold_subcode",
180 "max_retries",
181 "periodic_release",
182 "periodic_remove",
183 "accounting_group",
184 "accounting_group_user",
185}
186HTC_VALID_JOB_DAG_KEYS = {"vars", "pre", "post", "retry", "retry_unless_exit", "abort_dag_on", "abort_exit"}
187HTC_VERSION = version.parse(htcondor.__version__)
190class RestrictedDict(MutableMapping):
191 """A dictionary that only allows certain keys.
193 Parameters
194 ----------
195 valid_keys : `Container`
196 Strings that are valid keys.
197 init_data : `dict` or `RestrictedDict`, optional
198 Initial data.
200 Raises
201 ------
202 KeyError
203 If there are invalid key(s) in init_data.
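Examples
--------
An illustrative sketch (the keys shown here are arbitrary)::

    cmds = RestrictedDict({"universe", "executable"})
    cmds["universe"] = "vanilla"   # accepted, key is valid
    cmds["not_a_key"] = 1          # raises KeyError("Invalid key not_a_key")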
204 """
206 def __init__(self, valid_keys, init_data=()):
207 self.valid_keys = valid_keys
208 self.data = {}
209 self.update(init_data)
211 def __getitem__(self, key):
212 """Return value for given key if exists.
214 Parameters
215 ----------
216 key : `str`
217 Identifier for value to return.
219 Returns
220 -------
221 value : `~collections.abc.Any`
222 Value associated with given key.
224 Raises
225 ------
226 KeyError
227 If key doesn't exist.
228 """
229 return self.data[key]
231 def __delitem__(self, key):
232 """Delete value for given key if exists.
234 Parameters
235 ----------
236 key : `str`
237 Identifier for value to delete.
239 Raises
240 ------
241 KeyError
242 If key doesn't exist.
243 """
244 del self.data[key]
246 def __setitem__(self, key, value):
247 """Store key,value in internal dict only if key is valid.
249 Parameters
250 ----------
251 key : `str`
252 Identifier to associate with given value.
253 value : `~collections.abc.Any`
254 Value to store.
256 Raises
257 ------
258 KeyError
259 If key is invalid.
260 """
261 if key not in self.valid_keys:
262 raise KeyError(f"Invalid key {key}")
263 self.data[key] = value
265 def __iter__(self):
266 return self.data.__iter__()
268 def __len__(self):
269 return len(self.data)
271 def __str__(self):
272 return str(self.data)
275def htc_backup_files(wms_path, subdir=None, limit=100):
276 """Backup select HTCondor files in the submit directory.
278 Files will be saved in separate subdirectories which will be created in
279 the submit directory where the files are located. These subdirectories
280 will be consecutive, zero-padded integers. Their values will correspond to
281 the number of HTCondor rescue DAGs in the submit directory.
283 Hence, with the default settings, copies after the initial failed run will
284 be placed in '001' subdirectory, '002' after the first restart, and so on
285 until the limit of backups is reached. If there's no rescue DAG yet, files
286 will be copied to '000' subdirectory.
288 Parameters
289 ----------
290 wms_path : `str` or `pathlib.Path`
291 Path to the submit directory either absolute or relative.
292 subdir : `str` or `pathlib.Path`, optional
293 A path, relative to the submit directory, where all subdirectories with
294 backup files will be kept. Defaults to None which means that the backup
295 subdirectories will be placed directly in the submit directory.
296 limit : `int`, optional
297 Maximal number of backups. If the number of backups reaches the limit,
298 the last backup files will be overwritten. The default value is 100
299 to match the default value of HTCondor's DAGMAN_MAX_RESCUE_NUM in
300 version 8.8+.
302 Raises
303 ------
304 FileNotFoundError
305 If the submit directory or the file that needs to be backed up does not
306 exist.
307 OSError
308 If the submit directory cannot be accessed or backing up a file failed
309 either due to permission or filesystem related issues.
311 Notes
312 -----
313 This is not a generic function for making backups. It is intended to be
314 used once, just before a restart, to make snapshots of files which will be
315 overwritten by HTCondor during the next run.
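Examples
--------
An illustrative sketch of the resulting layout (paths are hypothetical)::

    htc_backup_files("/path/to/submit", subdir="backups")
    # copies files to /path/to/submit/backups/000 if no rescue DAG exists yet,
    # to /path/to/submit/backups/001 after the first rescue DAG, and so on.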
316 """
317 width = len(str(limit))
319 path = Path(wms_path).resolve()
320 if not path.is_dir():
321 raise FileNotFoundError(f"Directory {path} not found")
323 # Initialize the backup counter.
324 rescue_dags = list(Path(wms_path).glob("*.rescue*"))
325 counter = min(len(rescue_dags), limit)
327 # Create the backup directory and move select files there.
328 dest = Path(wms_path)
329 if subdir:
330 # PurePath.is_relative_to() is not available before Python 3.9. Hence
331 # we need to check if 'subdir' is in the submit directory in some other
332 # way if it is an absolute path.
333 subdir = Path(subdir)
334 if subdir.is_absolute():
335 if dest not in subdir.parents:
336 _LOG.warning(
337 "Invalid backup location: '%s' not in the submit directory, will use '%s' instead.",
338 subdir,
339 wms_path,
340 )
341 else:
342 dest /= subdir
343 else:
344 dest /= subdir
345 dest /= f"{counter:0{width}}"
346 try:
347 dest.mkdir(parents=True, exist_ok=False if counter < limit else True)
348 except FileExistsError:
349 _LOG.warning("Refusing to do backups: target directory '%s' already exists", dest)
350 else:
351 for patt in ["*.info.*", "*.dag.metrics", "*.dag.nodes.log", "*.node_status"]:
352 for source in path.glob(patt):
353 if source.is_file():
354 target = dest / source.relative_to(path)
355 try:
356 source.rename(target)
357 except OSError as exc:
358 raise type(exc)(f"Backing up '{source}' failed: {exc.strerror}") from None
359 else:
360 raise FileNotFoundError(f"Backing up '{source}' failed: not a file")
363def htc_escape(value):
364 """Escape characters in given value based upon HTCondor syntax.
366 Parameters
367 ----------
368 value : `~collections.abc.Any`
369 Value that needs to have characters escaped if string.
371 Returns
372 -------
373 new_value : `~collections.abc.Any`
374 Given value with characters escaped appropriately for HTCondor if string.
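Examples
--------
Double quotes are doubled per HTCondor submit file escaping rules; other
types pass through unchanged::

    htc_escape('say "hi"')   # returns 'say ""hi""'
    htc_escape(42)           # returns 42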
375 """
376 if isinstance(value, str):
377 newval = value.replace('"', '""').replace("'", "''").replace("&quot;", '"')
378 else:
379 newval = value
381 return newval
384def htc_write_attribs(stream, attrs):
385 """Write job attributes in HTCondor format to writeable stream.
387 Parameters
388 ----------
389 stream : `~io.TextIOBase`
390 Output text stream (typically an open file).
391 attrs : `dict`
392 HTCondor job attributes (dictionary of attribute key, value).
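Examples
--------
Each attribute is written as a '+'-prefixed line with string values quoted
and escaped (the attribute names shown are hypothetical)::

    htc_write_attribs(stream, {"bps_run": "my_run", "bps_job_count": 5})
    # writes:
    # +bps_run = "my_run"
    # +bps_job_count = 5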
393 """
394 for key, value in attrs.items():
395 # Make sure strings are syntactically correct for HTCondor.
396 if isinstance(value, str):
397 pval = f'"{htc_escape(value)}"'
398 else:
399 pval = value
401 print(f"+{key} = {pval}", file=stream)
404def htc_write_condor_file(filename, job_name, job, job_attrs):
405 """Write an HTCondor submit file.
407 Parameters
408 ----------
409 filename : `str`
410 Filename for the HTCondor submit file.
411 job_name : `str`
412 Job name to use in submit file.
413 job : `RestrictedDict`
414 Submit script information.
415 job_attrs : `dict`
416 Job attributes.
417 """
418 os.makedirs(os.path.dirname(filename), exist_ok=True)
419 with open(filename, "w") as fh:
420 for key, value in job.items():
421 if value is not None:
422 if key in HTC_QUOTE_KEYS:
423 print(f'{key}="{htc_escape(value)}"', file=fh)
424 else:
425 print(f"{key}={value}", file=fh)
426 for key in ["output", "error", "log"]:
427 if key not in job:
428 filename = f"{job_name}.$(Cluster).${key[:3]}"
429 print(f"{key}={filename}", file=fh)
431 if job_attrs is not None:
432 htc_write_attribs(fh, job_attrs)
433 print("queue", file=fh)
436# To avoid doing the version check during every function call, select the
437# appropriate conversion function at import time.
438#
439# Make sure that *each* version specific variant of the conversion function(s)
440# has the same signature after applying any changes!
441if HTC_VERSION < version.parse("8.9.8"):
443 def htc_tune_schedd_args(**kwargs):
444 """Ensure that arguments for Schedd are version appropriate.
446 The old arguments: 'requirements' and 'attr_list' of
447 'Schedd.history()', 'Schedd.query()', and 'Schedd.xquery()' were
448 deprecated in favor of 'constraint' and 'projection', respectively,
449 starting from version 8.9.8. The function will convert "new" keyword
450 arguments to "old" ones.
452 Parameters
453 ----------
454 **kwargs
455 Any keyword arguments that Schedd.history(), Schedd.query(), and
456 Schedd.xquery() accepts.
458 Returns
459 -------
460 kwargs : `dict` [`str`, Any]
461 Keyword arguments that are guaranteed to work with the Python
462 HTCondor API.
464 Notes
465 -----
466 Function doesn't validate provided keyword arguments beyond converting
467 selected arguments to their version specific form. For example,
468 it won't remove keywords that are not supported by the methods
469 mentioned earlier.
470 """
471 translation_table = {
472 "constraint": "requirements",
473 "projection": "attr_list",
474 }
475 for new, old in translation_table.items():
476 try:
477 kwargs[old] = kwargs.pop(new)
478 except KeyError:
479 pass
480 return kwargs
482else:
484 def htc_tune_schedd_args(**kwargs):
485 """Ensure that arguments for Schedd are version appropriate.
487 This is the fallback function if no version specific alterations are
488 necessary. Effectively, a no-op.
490 Parameters
491 ----------
492 **kwargs
493 Any keyword arguments that Schedd.history(), Schedd.query(), and
494 Schedd.xquery() accepts.
496 Returns
497 -------
498 kwargs : `dict` [`str`, Any]
499 Keyword arguments that were passed to the function.
500 """
501 return kwargs
504def htc_query_history(schedds, **kwargs):
505 """Fetch history records from the condor_schedd daemon.
507 Parameters
508 ----------
509 schedds : `dict` [`str`, `htcondor.Schedd`]
510 HTCondor schedulers to query for job information.
511 **kwargs
512 Any keyword arguments that Schedd.history() accepts.
514 Yields
515 ------
516 schedd_name : `str`
517 Name of the HTCondor scheduler managing the job queue.
518 job_ad : `dict` [`str`, Any]
519 A dictionary representing HTCondor ClassAd describing a job. It maps
520 job attributes names to values of the ClassAd expressions they
521 represent.
522 """
523 # If not set, provide defaults for positional arguments.
524 kwargs.setdefault("constraint", None)
525 kwargs.setdefault("projection", [])
526 kwargs = htc_tune_schedd_args(**kwargs)
527 for schedd_name, schedd in schedds.items():
528 for job_ad in schedd.history(**kwargs):
529 yield schedd_name, dict(job_ad)
532def htc_query_present(schedds, **kwargs):
533 """Query the condor_schedd daemon for job ads.
535 Parameters
536 ----------
537 schedds : `dict` [`str`, `htcondor.Schedd`]
538 HTCondor schedulers to query for job information.
539 **kwargs
540 Any keyword arguments that Schedd.xquery() accepts.
542 Yields
543 ------
544 schedd_name : `str`
545 Name of the HTCondor scheduler managing the job queue.
546 job_ad : `dict` [`str`, Any]
547 A dictionary representing HTCondor ClassAd describing a job. It maps
548 job attributes names to values of the ClassAd expressions they
549 represent.
550 """
551 kwargs = htc_tune_schedd_args(**kwargs)
552 for schedd_name, schedd in schedds.items():
553 for job_ad in schedd.query(**kwargs):
554 yield schedd_name, dict(job_ad)
557def htc_version():
558 """Return the version given by the HTCondor API.
560 Returns
561 -------
562 version : `str`
563 HTCondor version as easily comparable string.
564 """
565 return str(HTC_VERSION)
568def htc_submit_dag(sub):
569 """Submit job for execution.
571 Parameters
572 ----------
573 sub : `htcondor.Submit`
574 An object representing a job submit description.
576 Returns
577 -------
578 schedd_job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
579 Information about jobs satisfying the search criteria where for each
580 Scheduler, local HTCondor job ids are mapped to their respective
581 classads.
582 """
583 coll = htcondor.Collector()
584 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
585 schedd = htcondor.Schedd(schedd_ad)
587 # If Schedd.submit() fails, the method will raise an exception. Usually,
588 # that implies issues with the HTCondor pool which BPS can't address.
589 # Hence, no effort is made to handle the exception.
590 submit_result = schedd.submit(sub)
592 # Sadly, the ClassAd from Schedd.submit() (see above) does not have
593 # 'GlobalJobId' so we need to run a regular query to get it anyway.
594 schedd_name = schedd_ad["Name"]
595 schedd_dag_info = condor_q(
596 constraint=f"ClusterId == {submit_result.cluster()}", schedds={schedd_name: schedd}
597 )
598 return schedd_dag_info
601def htc_create_submit_from_dag(dag_filename, submit_options=None):
602 """Create a DAGMan job submit description.
604 Parameters
605 ----------
606 dag_filename : `str`
607 Name of file containing HTCondor DAG commands.
608 submit_options : `dict` [`str`, Any], optional
609 Contains extra options for command line (Value of None means flag).
611 Returns
612 -------
613 sub : `htcondor.Submit`
614 An object representing a job submit description.
616 Notes
617 -----
618 Use with HTCondor versions which support htcondor.Submit.from_dag(),
619 i.e., 8.9.3 or newer.
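Examples
--------
An illustrative sketch (the DAG file name and submit options are
hypothetical)::

    sub = htc_create_submit_from_dag("/path/to/workflow.dag", {"maxidle": 10})
    schedd_dag_info = htc_submit_dag(sub)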
620 """
621 return htcondor.Submit.from_dag(dag_filename, submit_options)
624def htc_create_submit_from_cmd(dag_filename, submit_options=None):
625 """Create a DAGMan job submit description.
627 Create a DAGMan job submit description by calling ``condor_submit_dag``
628 on given DAG description file.
630 Parameters
631 ----------
632 dag_filename : `str`
633 Name of file containing HTCondor DAG commands.
634 submit_options : `dict` [`str`, Any], optional
635 Contains extra options for command line (Value of None means flag).
637 Returns
638 -------
639 sub : `htcondor.Submit`
640 An object representing a job submit description.
642 Notes
643 -----
644 Use with HTCondor versions which do not support htcondor.Submit.from_dag(),
645 i.e., older than 8.9.3.
646 """
647 # Run command line condor_submit_dag command.
648 cmd = "condor_submit_dag -f -no_submit -notification never -autorescue 1 -UseDagDir -no_recurse "
650 if submit_options is not None:
651 for opt, val in submit_options.items():
652 cmd += f" -{opt} {val or ''}"
653 cmd += f" {dag_filename}"
655 process = subprocess.Popen(
656 cmd.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding="utf-8"
657 )
658 process.wait()
660 if process.returncode != 0:
661 print(f"Exit code: {process.returncode}")
662 print(process.communicate()[0])
663 raise RuntimeError("Problems running condor_submit_dag")
665 return htc_create_submit_from_file(f"{dag_filename}.condor.sub")
668def htc_create_submit_from_file(submit_file):
669 """Parse a submission file.
671 Parameters
672 ----------
673 submit_file : `str`
674 Name of the HTCondor submit file.
676 Returns
677 -------
678 sub : `htcondor.Submit`
679 An object representing a job submit description.
680 """
681 descriptors = {}
682 with open(submit_file) as fh:
683 for line in fh:
684 line = line.strip()
685 if not line.startswith("#") and not line == "queue":
686 (key, val) = re.split(r"\s*=\s*", line, 1)
687 descriptors[key] = val
689 # Avoid UserWarning: the line 'copy_to_spool = False' was
690 # unused by Submit object. Is it a typo?
691 try:
692 del descriptors["copy_to_spool"]
693 except KeyError:
694 pass
696 return htcondor.Submit(descriptors)
699def _htc_write_job_commands(stream, name, jobs):
700 """Output the DAGMan job lines for single job in DAG.
702 Parameters
703 ----------
704 stream : `~io.TextIOBase`
705 Writeable text stream (typically an opened file).
706 name : `str`
707 Job name.
708 jobs : `RestrictedDict`
709 DAG job keys and values.
710 """
711 if "pre" in jobs:
712 print(
713 f"SCRIPT {jobs['pre'].get('defer', '')} PRE {name} "
714 f"{jobs['pre']['executable']} {jobs['pre'].get('arguments', '')}",
715 file=stream,
716 )
718 if "post" in jobs:
719 print(
720 f"SCRIPT {jobs['post'].get('defer', '')} POST {name} "
721 f"{jobs['post']['executable']} {jobs['post'].get('arguments', '')}",
722 file=stream,
723 )
725 if "vars" in jobs:
726 for key, value in jobs["vars"].items():
727 print(f'VARS {name} {key}="{htc_escape(value)}"', file=stream)
729 if "pre_skip" in jobs:
730 print(f"PRE_SKIP {name} {jobs['pre_skip']}", file=stream)
732 if "retry" in jobs and jobs["retry"]:
733 print(f"RETRY {name} {jobs['retry']} ", end="", file=stream)
734 if "retry_unless_exit" in jobs:
735 print(f"UNLESS-EXIT {jobs['retry_unless_exit']}", end="", file=stream)
736 print("\n", file=stream)
738 if "abort_dag_on" in jobs and jobs["abort_dag_on"]:
739 print(
740 f"ABORT-DAG-ON {name} {jobs['abort_dag_on']['node_exit']}"
741 f" RETURN {jobs['abort_dag_on']['abort_exit']}",
742 file=stream,
743 )
746class HTCJob:
747 """HTCondor job for use in building DAG.
749 Parameters
750 ----------
751 name : `str`
752 Name of the job.
753 label : `str`
754 Label that can be used for grouping or lookup.
755 initcmds : `RestrictedDict`
756 Initial job commands for submit file.
757 initdagcmds : `RestrictedDict`
758 Initial commands for job inside DAG.
759 initattrs : `dict`
760 Initial dictionary of job attributes.
761 """
763 def __init__(self, name, label=None, initcmds=(), initdagcmds=(), initattrs=None):
764 self.name = name
765 self.label = label
766 self.cmds = RestrictedDict(HTC_VALID_JOB_KEYS, initcmds)
767 self.dagcmds = RestrictedDict(HTC_VALID_JOB_DAG_KEYS, initdagcmds)
768 self.attrs = initattrs
769 self.subfile = None
771 def __str__(self):
772 return self.name
774 def add_job_cmds(self, new_commands):
775 """Add commands to Job (overwrite existing).
777 Parameters
778 ----------
779 new_commands : `dict`
780 Submit file commands to be added to Job.
781 """
782 self.cmds.update(new_commands)
784 def add_dag_cmds(self, new_commands):
785 """Add DAG commands to Job (overwrite existing).
787 Parameters
788 ----------
789 new_commands : `dict`
790 DAG file commands to be added to Job.
791 """
792 self.dagcmds.update(new_commands)
794 def add_job_attrs(self, new_attrs):
795 """Add attributes to Job (overwrite existing).
797 Parameters
798 ----------
799 new_attrs : `dict`
800 Attributes to be added to Job.
801 """
802 if self.attrs is None:
803 self.attrs = {}
804 if new_attrs:
805 self.attrs.update(new_attrs)
807 def write_submit_file(self, submit_path, job_subdir=""):
808 """Write job description to submit file.
810 Parameters
811 ----------
812 submit_path : `str`
813 Prefix path for the submit file.
814 job_subdir : `str`, optional
815 Template for job subdir.
816 """
817 if not self.subfile:
818 self.subfile = f"{self.name}.sub"
819 job_subdir = job_subdir.format(self=self)
820 if job_subdir:
821 self.subfile = os.path.join(job_subdir, self.subfile)
822 htc_write_condor_file(os.path.join(submit_path, self.subfile), self.name, self.cmds, self.attrs)
824 def write_dag_commands(self, stream):
825 """Write DAG commands for single job to output stream.
827 Parameters
828 ----------
829 stream : `IO` or `str`
830 Output Stream.
831 """
832 print(f"JOB {self.name} {self.subfile}", file=stream)
833 _htc_write_job_commands(stream, self.name, self.dagcmds)
835 def dump(self, fh):
836 """Dump job information to output stream.
838 Parameters
839 ----------
840 fh : `~io.TextIOBase`
841 Output stream.
842 """
843 printer = pprint.PrettyPrinter(indent=4, stream=fh)
844 printer.pprint(self.name)
845 printer.pprint(self.cmds)
846 printer.pprint(self.attrs)
849class HTCDag(networkx.DiGraph):
850 """HTCondor DAG.
852 Parameters
853 ----------
854 data : networkx.DiGraph.data
855 Initial graph.
856 name : `str`
857 Name for DAG.
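Examples
--------
A minimal sketch of building and writing a two-job DAG (names, commands,
and paths are illustrative only)::

    dag = HTCDag(name="mydag")
    dag.add_attribs({"bps_run": "my_run"})
    job1 = HTCJob("job1", label="label1", initcmds={"executable": "/bin/true"})
    job2 = HTCJob("job2", label="label2", initcmds={"executable": "/bin/true"})
    dag.add_job(job1)
    dag.add_job(job2)
    dag.add_job_relationships(["job1"], ["job2"])
    dag.write("/path/to/submit", "jobs/{self.label}")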
858 """
860 def __init__(self, data=None, name=""):
861 super().__init__(data=data, name=name)
863 self.graph["attr"] = {}
864 self.graph["run_id"] = None
865 self.graph["submit_path"] = None
866 self.graph["final_job"] = None
868 def __str__(self):
869 """Represent basic DAG info as string.
871 Returns
872 -------
873 info : `str`
874 String containing basic DAG info.
875 """
876 return f"{self.graph['name']} {len(self)}"
878 def add_attribs(self, attribs=None):
879 """Add attributes to the DAG.
881 Parameters
882 ----------
883 attribs : `dict`
884 DAG attributes.
885 """
886 if attribs is not None:
887 self.graph["attr"].update(attribs)
889 def add_job(self, job, parent_names=None, child_names=None):
890 """Add an HTCJob to the HTCDag.
892 Parameters
893 ----------
894 job : `HTCJob`
895 HTCJob to add to the HTCDag.
896 parent_names : `~collections.abc.Iterable` [`str`], optional
897 Names of parent jobs.
898 child_names : `~collections.abc.Iterable` [`str`], optional
899 Names of child jobs.
900 """
901 assert isinstance(job, HTCJob)
903 # Add dag level attributes to each job
904 job.add_job_attrs(self.graph["attr"])
906 self.add_node(job.name, data=job)
908 if parent_names is not None:
909 self.add_job_relationships(parent_names, [job.name])
911 if child_names is not None:
912 self.add_job_relationships([job.name], child_names)
914 def add_job_relationships(self, parents, children):
915 """Add DAG edge between parents and children jobs.
917 Parameters
918 ----------
919 parents : `list` [`str`]
920 Contains parent job name(s).
921 children : `list` [`str`]
922 Contains children job name(s).
923 """
924 self.add_edges_from(itertools.product(parents, children))
926 def add_final_job(self, job):
927 """Add an HTCJob for the FINAL job in HTCDag.
929 Parameters
930 ----------
931 job : `HTCJob`
932 HTCJob to add to the HTCDag as a FINAL job.
933 """
934 # Add dag level attributes to each job
935 job.add_job_attrs(self.graph["attr"])
937 self.graph["final_job"] = job
939 def del_job(self, job_name):
940 """Delete the job from the DAG.
942 Parameters
943 ----------
944 job_name : `str`
945 Name of job in DAG to delete.
946 """
947 # Reconnect edges around node to delete
948 parents = self.predecessors(job_name)
949 children = self.successors(job_name)
950 self.add_edges_from(itertools.product(parents, children))
952 # Delete job node (which deletes its edges).
953 self.remove_node(job_name)
955 def write(self, submit_path, job_subdir=""):
956 """Write DAG to a file.
958 Parameters
959 ----------
960 submit_path : `str`
961 Prefix path for dag filename to be combined with DAG name.
962 job_subdir : `str`, optional
963 Template for job subdir.
964 """
965 self.graph["submit_path"] = submit_path
966 self.graph["dag_filename"] = os.path.join(submit_path, f"{self.graph['name']}.dag")
967 os.makedirs(submit_path, exist_ok=True)
968 with open(self.graph["dag_filename"], "w") as fh:
969 for _, nodeval in self.nodes().items():
970 job = nodeval["data"]
971 job.write_submit_file(submit_path, job_subdir)
972 job.write_dag_commands(fh)
973 for edge in self.edges():
974 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh)
975 print(f"DOT {self.name}.dot", file=fh)
976 print(f"NODE_STATUS_FILE {self.name}.node_status", file=fh)
978 # Add bps attributes to dag submission
979 for key, value in self.graph["attr"].items():
980 print(f'SET_JOB_ATTR {key}= "{htc_escape(value)}"', file=fh)
982 if self.graph["final_job"]:
983 job = self.graph["final_job"]
984 job.write_submit_file(submit_path, job_subdir)
985 print(f"FINAL {job.name} {job.subfile}", file=fh)
986 if "pre" in job.dagcmds:
987 print(f"SCRIPT PRE {job.name} {job.dagcmds['pre']}", file=fh)
988 if "post" in job.dagcmds:
989 print(f"SCRIPT POST {job.name} {job.dagcmds['post']}", file=fh)
991 def dump(self, fh):
992 """Dump DAG info to output stream.
994 Parameters
995 ----------
996 fh : `io.IO` or `str`
997 Where to dump DAG info as text.
998 """
999 for key, value in self.graph.items():
1000 print(f"{key}={value}", file=fh)
1001 for name, data in self.nodes().items():
1002 print(f"{name}:", file=fh)
1003 data["data"].dump(fh)
1004 for edge in self.edges():
1005 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh)
1006 if self.graph["final_job"]:
1007 print(f'FINAL {self.graph["final_job"].name}:', file=fh)
1008 self.graph["final_job"].dump(fh)
1010 def write_dot(self, filename):
1011 """Write a dot version of the DAG.
1013 Parameters
1014 ----------
1015 filename : `str`
1016 Name of the dot file.
1017 """
1018 pos = networkx.nx_agraph.graphviz_layout(self)
1019 networkx.draw(self, pos=pos)
1020 networkx.drawing.nx_pydot.write_dot(self, filename)
1023def condor_q(constraint=None, schedds=None, **kwargs):
1024 """Get information about the jobs in the HTCondor job queue(s).
1026 Parameters
1027 ----------
1028 constraint : `str`, optional
1029 Constraints to be passed to job query.
1030 schedds : `dict` [`str`, `htcondor.Schedd`], optional
1031 HTCondor schedulers to query for job information. If None
1032 (default), the query will be run against the local scheduler only.
1033 **kwargs : `~typing.Any`
1034 Additional keyword arguments that need to be passed to the internal
1035 query method.
1037 Returns
1038 -------
1039 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
1040 Information about jobs satisfying the search criteria where for each
1041 Scheduler, local HTCondor job ids are mapped to their respective
1042 classads.
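Examples
--------
An illustrative sketch of the returned structure (scheduler name, job ids,
and attributes are hypothetical)::

    {
        "sched1.example.com": {
            "1234.0": {"ClusterId": 1234, "ProcId": 0, "JobStatus": 2},
        }
    }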
1043 """
1044 return condor_query(constraint, schedds, htc_query_present, **kwargs)
1047def condor_history(constraint=None, schedds=None, **kwargs):
1048 """Get information about the jobs from HTCondor history records.
1050 Parameters
1051 ----------
1052 constraint : `str`, optional
1053 Constraints to be passed to job query.
1054 schedds : `dict` [`str`, `htcondor.Schedd`], optional
1055 HTCondor schedulers to query for job information. If None
1056 (default), the query will be run against the history file of
1057 the local scheduler only.
1058 **kwargs : `~typing.Any`
1059 Additional keyword arguments that need to be passed to the internal
1060 query method.
1062 Returns
1063 -------
1064 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
1065 Information about jobs satisfying the search criteria where for each
1066 Scheduler, local HTCondor job ids are mapped to their respective
1067 classads.
1068 """
1069 return condor_query(constraint, schedds, htc_query_history, **kwargs)
1072def condor_query(constraint=None, schedds=None, query_func=htc_query_present, **kwargs):
1073 """Get information about HTCondor jobs.
1075 Parameters
1076 ----------
1077 constraint : `str`, optional
1078 Constraints to be passed to job query.
1079 schedds : `dict` [`str`, `htcondor.Schedd`], optional
1080 HTCondor schedulers to query for job information. If None
1081 (default), the query will be run against the history file of
1082 the local scheduler only.
1083 query_func : callable
1084 A query function which takes the following arguments:
1086 - ``schedds``: Schedulers to query (`list` [`htcondor.Schedd`]).
1087 - ``**kwargs``: Keyword arguments that will be passed to the query
1088 function.
1089 **kwargs : `~typing.Any`
1090 Additional keyword arguments that need to be passed to the query
1091 method.
1093 Returns
1094 -------
1095 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
1096 Information about jobs satisfying the search criteria where for each
1097 Scheduler, local HTCondor job ids are mapped to their respective
1098 classads.
1099 """
1100 if not schedds:
1101 coll = htcondor.Collector()
1102 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
1103 schedds = {schedd_ad["Name"]: htcondor.Schedd(schedd_ad)}
1105 # Make sure that 'ClusterId' and 'ProcId' attributes are always included
1106 # in the job classad. They are needed to construct the job id.
1107 added_attrs = set()
1108 if "projection" in kwargs and kwargs["projection"]:
1109 requested_attrs = set(kwargs["projection"])
1110 required_attrs = {"ClusterId", "ProcId"}
1111 added_attrs = required_attrs - requested_attrs
1112 for attr in added_attrs:
1113 kwargs["projection"].append(attr)
1115 unwanted_attrs = {"Env", "Environment"} | added_attrs
1116 job_info = defaultdict(dict)
1117 for schedd_name, job_ad in query_func(schedds, constraint=constraint, **kwargs):
1118 id_ = f"{job_ad['ClusterId']}.{job_ad['ProcId']}"
1119 for attr in set(job_ad) & unwanted_attrs:
1120 del job_ad[attr]
1121 job_info[schedd_name][id_] = job_ad
1122 _LOG.debug("query returned %d jobs", sum(len(val) for val in job_info.values()))
1124 # Restore the list of the requested attributes to its original value
1125 # if needed.
1126 if added_attrs:
1127 for attr in added_attrs:
1128 kwargs["projection"].remove(attr)
1130 # When returning the results filter out entries for schedulers with no jobs
1131 # matching the search criteria.
1132 return {key: val for key, val in job_info.items() if val}
1135def condor_search(constraint=None, hist=None, schedds=None):
1136 """Search for running and finished jobs satisfying given criteria.
1138 Parameters
1139 ----------
1140 constraint : `str`, optional
1141 Constraints to be passed to job query.
1142 hist : `float`, optional
1143 Limit history search to this many days.
1144 schedds : `dict` [`str`, `htcondor.Schedd`], optional
1145 HTCondor schedulers to query for job information.
1146 If None (default), only the local scheduler will be queried.
1148 Returns
1149 -------
1150 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
1151 Information about jobs satisfying the search criteria where for each
1152 Scheduler, local HTCondor job ids are mapped to their respective
1153 classads.
1154 """
1155 if not schedds:
1156 coll = htcondor.Collector()
1157 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
1158 schedds = {schedd_ad["Name"]: htcondor.Schedd(locate_ad=schedd_ad)}
1160 job_info = condor_q(constraint=constraint, schedds=schedds)
1161 if hist is not None:
1162 epoch = (datetime.now() - timedelta(days=hist)).timestamp()
1163 constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
1164 hist_info = condor_history(constraint, schedds=schedds)
1165 update_job_info(job_info, hist_info)
1166 return job_info
1169def condor_status(constraint=None, coll=None):
1170 """Get information about HTCondor pool.
1172 Parameters
1173 ----------
1174 constraint : `str`, optional
1175 Constraints to be passed to the query.
1176 coll : `htcondor.Collector`, optional
1177 Object representing HTCondor collector daemon.
1179 Returns
1180 -------
1181 pool_info : `dict` [`str`, `dict` [`str`, Any]]
1182 Mapping between HTCondor slot names and slot information (classAds).
1183 """
1184 if coll is None:
1185 coll = htcondor.Collector()
1186 try:
1187 pool_ads = coll.query(constraint=constraint)
1188 except OSError as ex:
1189 raise RuntimeError(f"Problem querying the Collector. (Constraint='{constraint}')") from ex
1191 pool_info = {}
1192 for slot in pool_ads:
1193 pool_info[slot["name"]] = dict(slot)
1194 _LOG.debug("condor_status returned %d ads", len(pool_info))
1195 return pool_info
1198def update_job_info(job_info, other_info):
1199 """Update results of a job query with results from another query.
1201 Parameters
1202 ----------
1203 job_info : `dict` [`str`, `dict` [`str`, Any]]
1204 Results of the job query that needs to be updated.
1205 other_info : `dict` [`str`, `dict` [`str`, Any]]
1206 Results of the other job query.
1208 Returns
1209 -------
1210 job_info : `dict` [`str`, `dict` [`str`, Any]]
1211 The updated results.
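Examples
--------
A sketch of the merge behavior (ids and attributes are hypothetical)::

    job_info = {"sched1": {"1.0": {"JobStatus": 2}}}
    other_info = {"sched1": {"1.0": {"ExitCode": 0}}, "sched2": {"7.0": {}}}
    update_job_info(job_info, other_info)
    # job_info == {"sched1": {"1.0": {"JobStatus": 2, "ExitCode": 0}},
    #              "sched2": {"7.0": {}}}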
1212 """
1213 for schedd_name, others in other_info.items():
1214 try:
1215 jobs = job_info[schedd_name]
1216 except KeyError:
1217 job_info[schedd_name] = others
1218 else:
1219 for id_, ad in others.items():
1220 jobs.setdefault(id_, {}).update(ad)
1221 return job_info
1224def summary_from_dag(dir_name):
1225 """Build bps_run_summary string from dag file.
1227 Parameters
1228 ----------
1229 dir_name : `str`
1230 Path that includes dag file for a run.
1232 Returns
1233 -------
1234 summary : `str`
1235 Semi-colon separated list of job labels and counts.
1236 (Same format as saved in dag classad).
1237 job_name_to_pipetask : `dict` [`str`, `str`]
1238 Mapping of job names to job labels.
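Examples
--------
An illustrative sketch of the return values (labels and counts are
hypothetical)::

    summary, mapping = summary_from_dag("/path/to/submit")
    # summary is e.g. "pipetaskInit:1;calibrate:10"
    # mapping maps each DAG job name to its label, e.g. {"calibrate_42": "calibrate"}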
1239 """
1240 dag = next(Path(dir_name).glob("*.dag"))
1242 # Later code depends upon insertion order
1243 counts = defaultdict(int)
1244 job_name_to_pipetask = {}
1245 try:
1246 with open(dag) as fh:
1247 for line in fh:
1248 if line.startswith("JOB"):
1249 m = re.match(r"JOB ([^\s]+) jobs/([^/]+)/", line)
1250 if m:
1251 label = m.group(2)
1252 if label == "init":
1253 label = "pipetaskInit"
1254 job_name_to_pipetask[m.group(1)] = label
1255 counts[label] += 1
1256 else: # Check if Pegasus submission
1257 m = re.match(r"JOB ([^\s]+) ([^\s]+)", line)
1258 if m:
1259 label = pegasus_name_to_label(m.group(1))
1260 job_name_to_pipetask[m.group(1)] = label
1261 counts[label] += 1
1262 else:
1263 _LOG.warning("Parse DAG: unmatched job line: %s", line)
1264 elif line.startswith("FINAL"):
1265 m = re.match(r"FINAL ([^\s]+) jobs/([^/]+)/", line)
1266 if m:
1267 label = m.group(2)
1268 job_name_to_pipetask[m.group(1)] = label
1269 counts[label] += 1
1271 except (OSError, PermissionError, StopIteration):
1272 pass
1274 summary = ";".join([f"{name}:{counts[name]}" for name in counts])
1275 _LOG.debug("summary_from_dag: %s %s", summary, job_name_to_pipetask)
1276 return summary, job_name_to_pipetask
1279def pegasus_name_to_label(name):
1280 """Convert pegasus job name to a label for the report.
1282 Parameters
1283 ----------
1284 name : `str`
1285 Name of job.
1287 Returns
1288 -------
1289 label : `str`
1290 Label for job.
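Examples
--------
Based on the naming patterns handled below::

    pegasus_name_to_label("pipetask_3_calibrate")  # -> "calibrate"
    pegasus_name_to_label("pipetask_init")         # -> "pipetaskInit"
    pegasus_name_to_label("stage_in_local")        # -> "pegasus"
    pegasus_name_to_label("somethingelse")         # -> "UNK"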
1291 """
1292 label = "UNK"
1293 if name.startswith("create_dir") or name.startswith("stage_in") or name.startswith("stage_out"):
1294 label = "pegasus"
1295 else:
1296 m = re.match(r"pipetask_(\d+_)?([^_]+)", name)
1297 if m:
1298 label = m.group(2)
1299 if label == "init":
1300 label = "pipetaskInit"
1302 return label
1305def read_dag_status(wms_path):
1306 """Read the node status file for DAG summary information.
1308 Parameters
1309 ----------
1310 wms_path : `str`
1311 Path that includes node status file for a run.
1313 Returns
1314 -------
1315 dag_ad : `dict` [`str`, Any]
1316 DAG summary information.
1317 """
1318 dag_ad = {}
1320 # While this is probably more up to date than dag classad, only read from
1321 # the file if we need to.
1322 try:
1323 try:
1324 node_stat_file = next(Path(wms_path).glob("*.node_status"))
1325 _LOG.debug("Reading Node Status File %s", node_stat_file)
1326 with open(node_stat_file) as infh:
1327 dag_ad = classad.parseNext(infh) # pylint: disable=E1101
1328 except StopIteration:
1329 pass
1331 if not dag_ad:
1332 # Pegasus check here
1333 try:
1334 metrics_file = next(Path(wms_path).glob("*.dag.metrics"))
1335 with open(metrics_file) as infh:
1336 metrics = json.load(infh)
1337 dag_ad["NodesTotal"] = metrics.get("jobs", 0)
1338 dag_ad["NodesFailed"] = metrics.get("jobs_failed", 0)
1339 dag_ad["NodesDone"] = metrics.get("jobs_succeeded", 0)
1340 dag_ad["pegasus_version"] = metrics.get("planner_version", "")
1341 except StopIteration:
1342 try:
1343 metrics_file = next(Path(wms_path).glob("*.metrics"))
1344 with open(metrics_file) as infh:
1345 metrics = json.load(infh)
1346 dag_ad["NodesTotal"] = metrics["wf_metrics"]["total_jobs"]
1347 dag_ad["pegasus_version"] = metrics.get("version", "")
1348 except StopIteration:
1349 pass
1350 except (OSError, PermissionError):
1351 pass
1353 _LOG.debug("read_dag_status: %s", dag_ad)
1354 return dict(dag_ad)
1357def read_node_status(wms_path):
1358 """Read entire node status file.
1360 Parameters
1361 ----------
1362 wms_path : `str`
1363 Path that includes node status file for a run.
1365 Returns
1366 -------
1367 jobs : `dict` [`str`, Any]
1368 DAG summary information compiled from the node status file combined
1369 with the information found in the node event log.
1371 Currently, if the same job attribute is found in both files, its value
1372 from the event log takes precedence over the value from the node status
1373 file.
1374 """
1375 # Get jobid info from other places to fill in gaps in info from node_status
1376 _, job_name_to_pipetask = summary_from_dag(wms_path)
1377 wms_workflow_id, loginfo = read_dag_log(wms_path)
1378 loginfo = read_dag_nodes_log(wms_path)
1379 _LOG.debug("loginfo = %s", loginfo)
1380 job_name_to_id = {}
1381 for jid, jinfo in loginfo.items():
1382 if "LogNotes" in jinfo:
1383 m = re.match(r"DAG Node: ([^\s]+)", jinfo["LogNotes"])
1384 if m:
1385 job_name_to_id[m.group(1)] = jid
1386 jinfo["DAGNodeName"] = m.group(1)
1388 try:
1389 node_status = next(Path(wms_path).glob("*.node_status"))
1390 except StopIteration:
1391 return loginfo
1393 jobs = {}
1394 fake_id = -1.0 # For nodes that do not yet have a job id, give fake one
1395 try:
1396 with open(node_status) as fh:
1397 ads = classad.parseAds(fh)
1399 for jclassad in ads:
1400 if jclassad["Type"] == "DagStatus":
1401 # skip DAG summary
1402 pass
1403 elif "Node" not in jclassad:
1404 if jclassad["Type"] != "StatusEnd":
1405 _LOG.debug("Key 'Node' not in classad: %s", jclassad)
1406 break
1407 else:
1408 if jclassad["Node"] in job_name_to_pipetask:
1409 try:
1410 label = job_name_to_pipetask[jclassad["Node"]]
1411 except KeyError:
1412 _LOG.error("%s not in %s", jclassad["Node"], job_name_to_pipetask.keys())
1413 raise
1414 elif "_" in jclassad["Node"]:
1415 label = jclassad["Node"].split("_")[1]
1416 else:
1417 label = jclassad["Node"]
1419 # Make job info as if came from condor_q
1420 if jclassad["Node"] in job_name_to_id:
1421 job_id = str(job_name_to_id[jclassad["Node"]])
1422 else:
1423 job_id = str(fake_id)
1424 fake_id -= 1
1426 job = dict(jclassad)
1427 job["ClusterId"] = int(float(job_id))
1428 job["DAGManJobID"] = wms_workflow_id
1429 job["DAGNodeName"] = jclassad["Node"]
1430 job["bps_job_label"] = label
1432 jobs[job_id] = job
1433 try:
1434 jobs[job_id] |= loginfo[job_id]
1435 except KeyError:
1436 pass
1437 except (OSError, PermissionError):
1438 pass
1440 return jobs
1443def read_dag_log(wms_path):
1444 """Read job information from the DAGMan log file.
1446 Parameters
1447 ----------
1448 wms_path : `str`
1449 Path containing the DAGMan log file.
1451 Returns
1452 -------
1453 wms_workflow_id : `str`
1454 HTCondor job id (i.e., <ClusterId>.<ProcId>) of the DAGMan job.
1455 dag_info : `dict` [`str`, `~collections.abc.Any`]
1456 HTCondor job information read from the log file mapped to HTCondor
1457 job id.
1459 Raises
1460 ------
1461 FileNotFoundError
1462 If cannot find DAGMan log in given wms_path.
1463 """
1464 wms_workflow_id = 0
1465 dag_info = {}
1467 path = Path(wms_path)
1468 if path.exists():
1469 try:
1470 filename = next(path.glob("*.dag.dagman.log"))
1471 except StopIteration as exc:
1472 raise FileNotFoundError(f"DAGMan log not found in {wms_path}") from exc
1473 _LOG.debug("dag node log filename: %s", filename)
1475 info = {}
1476 job_event_log = htcondor.JobEventLog(str(filename))
1477 for event in job_event_log.events(stop_after=0):
1478 id_ = f"{event['Cluster']}.{event['Proc']}"
1479 if id_ not in info:
1480 info[id_] = {}
1481 wms_workflow_id = id_ # taking last job id in case of restarts
1482 info[id_].update(event)
1483 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"]
1485 # only save latest DAG job
1486 dag_info = {wms_workflow_id: info[wms_workflow_id]}
1487 for job in dag_info.values():
1488 _tweak_log_info(filename, job)
1490 return wms_workflow_id, dag_info
1493def read_dag_nodes_log(wms_path):
1494 """Read job information from the DAGMan nodes log file.
1496 Parameters
1497 ----------
1498 wms_path : `str`
1499 Path containing the DAGMan nodes log file.
1501 Returns
1502 -------
1503 info : `dict` [`str`, Any]
1504 HTCondor job information read from the log file mapped to HTCondor
1505 job id.
1507 Raises
1508 ------
1509 FileNotFoundError
1510 If cannot find DAGMan node log in given wms_path.
1511 """
1512 try:
1513 filename = next(Path(wms_path).glob("*.dag.nodes.log"))
1514 except StopIteration as exc:
1515 raise FileNotFoundError(f"DAGMan node log not found in {wms_path}") from exc
1516 _LOG.debug("dag node log filename: %s", filename)
1518 info = {}
1519 job_event_log = htcondor.JobEventLog(str(filename))
1520 for event in job_event_log.events(stop_after=0):
1521 id_ = f"{event['Cluster']}.{event['Proc']}"
1522 if id_ not in info:
1523 info[id_] = {}
1524 info[id_].update(event)
1525 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"]
1527 # Add more condor_q-like info to info parsed from log file.
1528 for job in info.values():
1529 _tweak_log_info(filename, job)
1531 return info
1534def read_dag_info(wms_path):
1535 """Read custom DAGMan job information from the file.
1537 Parameters
1538 ----------
1539 wms_path : `str`
1540 Path containing the file with the DAGMan job info.
1542 Returns
1543 -------
1544 dag_info : `dict` [`str`, `dict` [`str`, Any]]
1545 HTCondor job information.
1547 Raises
1548 ------
1549 FileNotFoundError
1550 If cannot find DAGMan job info file in the given location.
1551 """
1552 try:
1553 filename = next(Path(wms_path).glob("*.info.json"))
1554 except StopIteration as exc:
1555 raise FileNotFoundError(f"File with DAGMan job information not found in {wms_path}") from exc
1556 _LOG.debug("DAGMan job information filename: %s", filename)
1557 try:
1558 with open(filename) as fh:
1559 dag_info = json.load(fh)
1560 except (OSError, PermissionError) as exc:
1561 _LOG.debug("Retrieving DAGMan job information failed: %s", exc)
1562 dag_info = {}
1563 return dag_info
1566def write_dag_info(filename, dag_info):
1567 """Write custom job information about DAGMan job.
1569 Parameters
1570 ----------
1571 filename : `str`
1572 Name of the file where the information will be stored.
1573 dag_info : `dict` [`str` `dict` [`str`, Any]]
1574 Information about the DAGMan job.
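Examples
--------
A sketch of the expected input structure (names and ids are hypothetical)::

    dag_info = {
        "sched1.example.com": {
            "1234.0": {"ClusterId": 1234, "GlobalJobId": "sched1#1234.0#1700000000"},
        }
    }
    write_dag_info("/path/to/submit/run.info.json", dag_info)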
1575 """
1576 schedd_name = next(iter(dag_info))
1577 dag_id = next(iter(dag_info[schedd_name]))
1578 dag_ad = dag_info[schedd_name][dag_id]
1579 try:
1580 with open(filename, "w") as fh:
1581 info = {
1582 schedd_name: {
1583 dag_id: {"ClusterId": dag_ad["ClusterId"], "GlobalJobId": dag_ad["GlobalJobId"]}
1584 }
1585 }
1586 json.dump(info, fh)
1587 except (KeyError, OSError, PermissionError) as exc:
1588 _LOG.debug("Persisting DAGMan job information failed: %s", exc)
1591def _tweak_log_info(filename, job):
1592 """Massage the given job info to match the structure of condor_q output.
1594 Parameters
1595 ----------
1596 filename : `pathlib.Path`
1597 Name of the DAGMan log.
1598 job : `dict` [ `str`, Any ]
1599 Information about a single HTCondor job read from the log.
1601 """
1602 _LOG.debug("_tweak_log_info: %s %s", filename, job)
1604 try:
1605 job["ClusterId"] = job["Cluster"]
1606 job["ProcId"] = job["Proc"]
1607 job["Iwd"] = str(filename.parent)
1608 job["Owner"] = filename.owner()
1610 match job["MyType"]:
1611 case "ExecuteEvent":
1612 job["JobStatus"] = JobStatus.RUNNING
1613 case "JobTerminatedEvent" | "PostScriptTerminatedEvent":
1614 job["JobStatus"] = JobStatus.COMPLETED
1615 case "SubmitEvent":
1616 job["JobStatus"] = JobStatus.IDLE
1617 case "JobAbortedEvent":
1618 job["JobStatus"] = JobStatus.REMOVED
1619 case "JobHeldEvent":
1620 job["JobStatus"] = JobStatus.HELD
1621 case _:
1622 _LOG.debug("Unknown log event type: %s", job["MyType"])
1623 job["JobStatus"] = JobStatus.UNEXPANDED
1625 if job["JobStatus"] in {JobStatus.COMPLETED, JobStatus.HELD}:
1626 new_job = HTC_JOB_AD_HANDLERS.handle(job)
1627 if new_job is not None:
1628 job = new_job
1629 else:
1630 _LOG.error("Could not determine exit status for job '%s.%s'", job["ClusterId"], job["ProcId"])
1632 except KeyError as e:
1633 _LOG.error("Missing key %s in job: %s", str(e), job)
1634 raise
1637def htc_check_dagman_output(wms_path):
1638 """Check the DAGMan output for error messages.
1640 Parameters
1641 ----------
1642 wms_path : `str`
1643 Directory containing the DAGman output file.
1645 Returns
1646 -------
1647 message : `str`
1648 Message containing error messages from the DAGMan output. Empty
1649 string if no messages.
1651 Raises
1652 ------
1653 FileNotFoundError
1654 If cannot find DAGMan standard output file in given wms_path.
1655 """
1656 try:
1657 filename = next(Path(wms_path).glob("*.dag.dagman.out"))
1658 except StopIteration as exc:
1659 raise FileNotFoundError(f"DAGMan standard output file not found in {wms_path}") from exc
1660 _LOG.debug("dag output filename: %s", filename)
1662 message = ""
1663 try:
1664 with open(filename) as fh:
1665 last_submit_failed = ""
1666 for line in fh:
1667 m = re.match(r"(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) Job submit try \d+/\d+ failed", line)
1668 if m:
1669 last_submit_failed = m.group(1)
1670 if last_submit_failed:
1671 message = f"Warn: Job submission issues (last: {last_submit_failed})"
1672 except (OSError, PermissionError):
1673 message = f"Warn: Could not read dagman output file from {wms_path}."
1674 return message