Coverage for python/lsst/ctrl/bps/htcondor/lssthtc.py: 12%

585 statements  

coverage.py v7.3.1, created at 2023-09-13 10:07 +0000

1# This file is part of ctrl_bps_htcondor. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <https://www.gnu.org/licenses/>. 

27 

28"""Placeholder HTCondor DAGMan API. 

29 

30There is new work on a Python DAGMan API from HTCondor. However, at this

31time, it tries to make things easier by assuming a DAG is easily broken into

32levels where there are 1-to-1 or all-to-all relationships to nodes in the next

33level. LSST workflows are more complicated.

34""" 

35 

36__all__ = [ 

37 "DagStatus", 

38 "JobStatus", 

39 "NodeStatus", 

40 "RestrictedDict", 

41 "HTCJob", 

42 "HTCDag", 

43 "htc_backup_files", 

44 "htc_check_dagman_output", 

45 "htc_create_submit_from_cmd", 

46 "htc_create_submit_from_dag", 

47 "htc_create_submit_from_file", 

48 "htc_escape", 

49 "htc_write_attribs", 

50 "htc_write_condor_file", 

51 "htc_version", 

52 "htc_submit_dag", 

53 "condor_history", 

54 "condor_q", 

55 "condor_search", 

56 "condor_status", 

57 "update_job_info", 

58 "MISSING_ID", 

59 "summary_from_dag", 

60 "read_dag_info", 

61 "read_dag_log", 

62 "read_dag_nodes_log", 

63 "read_dag_status", 

64 "read_node_status", 

65 "write_dag_info", 

66 "pegasus_name_to_label", 

67] 

68 

69 

70import itertools 

71import json 

72import logging 

73import os 

74import pprint 

75import re 

76import subprocess 

77from collections import defaultdict 

78from collections.abc import MutableMapping 

79from datetime import datetime, timedelta 

80from enum import IntEnum 

81from pathlib import Path 

82 

83import classad 

84import htcondor 

85import networkx 

86 

87_LOG = logging.getLogger(__name__) 

88 

89MISSING_ID = -99999 

90 

91 

92class DagStatus(IntEnum): 

93 """HTCondor DAGMan's statuses for a DAG.""" 

94 

95 OK = 0 

96 ERROR = 1 # an error condition different than those listed here 

97 FAILED = 2 # one or more nodes in the DAG have failed 

98 ABORTED = 3 # the DAG has been aborted by an ABORT-DAG-ON specification 

99 REMOVED = 4 # the DAG has been removed by condor_rm 

100 CYCLE = 5 # a cycle was found in the DAG 

101 SUSPENDED = 6 # the DAG has been suspended (see section 2.10.8) 

102 

103 

104class JobStatus(IntEnum): 

105 """HTCondor's statuses for jobs.""" 

106 

107 UNEXPANDED = 0 # Unexpanded 

108 IDLE = 1 # Idle 

109 RUNNING = 2 # Running 

110 REMOVED = 3 # Removed 

111 COMPLETED = 4 # Completed 

112 HELD = 5 # Held 

113 TRANSFERRING_OUTPUT = 6 # Transferring_Output 

114 SUSPENDED = 7 # Suspended 

115 

116 

117class NodeStatus(IntEnum): 

118 """HTCondor's statuses for DAGman nodes.""" 

119 

120 # (STATUS_NOT_READY): At least one parent has not yet finished or the node 

121 # is a FINAL node. 

122 NOT_READY = 0 

123 

124 # (STATUS_READY): All parents have finished, but the node is not yet 

125 # running. 

126 READY = 1 

127 

128 # (STATUS_PRERUN): The node’s PRE script is running. 

129 PRERUN = 2 

130 

131 # (STATUS_SUBMITTED): The node’s HTCondor job(s) are in the queue. 

132 # StatusDetails = "not_idle" -> running. 

133 # JobProcsHeld = 1-> hold. 

134 # JobProcsQueued = 1 -> idle. 

135 SUBMITTED = 3 

136 

137 # (STATUS_POSTRUN): The node’s POST script is running. 

138 POSTRUN = 4 

139 

140 # (STATUS_DONE): The node has completed successfully. 

141 DONE = 5 

142 

143 # (STATUS_ERROR): The node has failed. StatusDetails has info (e.g., 

144 # ULOG_JOB_ABORTED for deleted job). 

145 ERROR = 6 

146 

147 

148HTC_QUOTE_KEYS = {"environment"} 

149HTC_VALID_JOB_KEYS = { 

150 "universe", 

151 "executable", 

152 "arguments", 

153 "environment", 

154 "log", 

155 "error", 

156 "output", 

157 "should_transfer_files", 

158 "when_to_transfer_output", 

159 "getenv", 

160 "notification", 

161 "notify_user", 

162 "concurrency_limit", 

163 "transfer_executable", 

164 "transfer_input_files", 

165 "transfer_output_files", 

166 "request_cpus", 

167 "request_memory", 

168 "request_disk", 

169 "priority", 

170 "category", 

171 "requirements", 

172 "on_exit_hold", 

173 "on_exit_hold_reason", 

174 "on_exit_hold_subcode", 

175 "max_retries", 

176 "periodic_release", 

177 "periodic_remove", 

178 "accounting_group", 

179 "accounting_group_user", 

180} 

181HTC_VALID_JOB_DAG_KEYS = {"vars", "pre", "post", "retry", "retry_unless_exit", "abort_dag_on", "abort_exit"} 

182 

183 

184class RestrictedDict(MutableMapping): 

185 """A dictionary that only allows certain keys. 

186 

187 Parameters 

188 ---------- 

189 valid_keys : `Container` 

190 Strings that are valid keys. 

191 init_data : `dict` or `RestrictedDict`, optional 

192 Initial data. 

193 

194 Raises 

195 ------ 

196 KeyError 

197 If invalid key(s) in init_data. 

198 """ 

199 

200 def __init__(self, valid_keys, init_data=()): 

201 self.valid_keys = valid_keys 

202 self.data = {} 

203 self.update(init_data) 

204 

205 def __getitem__(self, key): 

206 """Return value for given key if exists. 

207 

208 Parameters 

209 ---------- 

210 key : `str` 

211 Identifier for value to return. 

212 

213 Returns 

214 ------- 

215 value : `~collections.abc.Any` 

216 Value associated with given key. 

217 

218 Raises 

219 ------ 

220 KeyError 

221 If key doesn't exist. 

222 """ 

223 return self.data[key] 

224 

225 def __delitem__(self, key): 

226 """Delete value for given key if exists. 

227 

228 Parameters 

229 ---------- 

230 key : `str` 

231 Identifier for value to delete. 

232 

233 Raises 

234 ------ 

235 KeyError 

236 If key doesn't exist. 

237 """ 

238 del self.data[key] 

239 

240 def __setitem__(self, key, value): 

241 """Store key,value in internal dict only if key is valid. 

242 

243 Parameters 

244 ---------- 

245 key : `str` 

246 Identifier to associate with given value. 

247 value : `~collections.abc.Any` 

248 Value to store. 

249 

250 Raises 

251 ------ 

252 KeyError 

253 If key is invalid. 

254 """ 

255 if key not in self.valid_keys: 

256 raise KeyError(f"Invalid key {key}") 

257 self.data[key] = value 

258 

259 def __iter__(self): 

260 return self.data.__iter__() 

261 

262 def __len__(self): 

263 return len(self.data) 

264 

265 def __str__(self): 

266 return str(self.data) 

267 

268 
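# A minimal usage sketch (not part of the original module) showing how RestrictedDict
# rejects keys outside the allowed set; the helper name below is hypothetical.
def _example_restricted_dict():
    cmds = RestrictedDict(HTC_VALID_JOB_KEYS, {"universe": "vanilla"})
    cmds["request_memory"] = "2048MB"
    try:
        cmds["not_a_submit_key"] = 1
    except KeyError as exc:
        print(f"Rejected as expected: {exc}")
    return cmds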

269def htc_backup_files(wms_path, subdir=None, limit=100): 

270 """Backup select HTCondor files in the submit directory. 

271 

272 Files will be saved in separate subdirectories which will be created in 

273 the submit directory where the files are located. These subdirectories 

274 will be consecutive, zero-padded integers. Their values will correspond to 

275 the number of HTCondor rescue DAGs in the submit directory. 

276 

277 Hence, with the default settings, copies after the initial failed run will 

278 be placed in '001' subdirectory, '002' after the first restart, and so on 

279 until the limit of backups is reached. If there's no rescue DAG yet, files 

280 will be copied to '000' subdirectory. 

281 

282 Parameters 

283 ---------- 

284 wms_path : `str` or `pathlib.Path` 

285 Path to the submit directory either absolute or relative. 

286 subdir : `str` or `pathlib.Path`, optional 

287 A path, relative to the submit directory, where all subdirectories with 

288 backup files will be kept. Defaults to None which means that the backup 

289 subdirectories will be placed directly in the submit directory. 

290 limit : `int`, optional 

291 Maximum number of backups. If the number of backups reaches the limit, 

292 the last backup files will be overwritten. The default value is 100 

293 to match the default value of HTCondor's DAGMAN_MAX_RESCUE_NUM in 

294 version 8.8+. 

295 

296 Raises 

297 ------ 

298 FileNotFoundError 

299 If the submit directory or the file that needs to be backed up does not 

300 exist. 

301 OSError 

302 If the submit directory cannot be accessed or backing up a file failed 

303 either due to permission or filesystem related issues. 

304 

305 Notes 

306 ----- 

307 This is not a generic function for making backups. It is intended to be 

308 used once, just before a restart, to make snapshots of files which will be 

309 overwritten by HTCondor during the next run. 

310 """ 

311 width = len(str(limit)) 

312 

313 path = Path(wms_path).resolve() 

314 if not path.is_dir(): 

315 raise FileNotFoundError(f"Directory {path} not found") 

316 

317 # Initialize the backup counter. 

318 rescue_dags = list(Path(wms_path).glob("*.rescue*")) 

319 counter = min(len(rescue_dags), limit) 

320 

321 # Create the backup directory and move select files there. 

322 dest = Path(wms_path) 

323 if subdir: 

324 # PurePath.is_relative_to() is not available before Python 3.9. Hence 

325 # we need to check if 'subdir' is in the submit directory in some other 

326 # way if it is an absolute path. 

327 subdir = Path(subdir) 

328 if subdir.is_absolute(): 

329 if dest not in subdir.parents: 

330 _LOG.warning( 

331 "Invalid backup location: '%s' not in the submit directory, will use '%s' instead.", 

332 subdir, 

333 wms_path, 

334 ) 

335 else: 

336 dest /= subdir 

337 else: 

338 dest /= subdir 

339 dest /= f"{counter:0{width}}" 

340 try: 

341 dest.mkdir(parents=True, exist_ok=False if counter < limit else True) 

342 except FileExistsError: 

343 _LOG.warning("Refusing to do backups: target directory '%s' already exists", dest) 

344 else: 

345 for patt in ["*.info.*", "*.dag.metrics", "*.dag.nodes.log", "*.node_status"]: 

346 for source in path.glob(patt): 

347 if source.is_file(): 

348 target = dest / source.relative_to(path) 

349 try: 

350 source.rename(target) 

351 except OSError as exc: 

352 raise type(exc)(f"Backing up '{source}' failed: {exc.strerror}") from None 

353 else: 

354 raise FileNotFoundError(f"Backing up '{source}' failed: not a file") 

355 
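# A hedged sketch (not part of the original module) of calling htc_backup_files just
# before a restart; the submit-directory path and backup subdirectory name are hypothetical.
def _example_backup_before_restart(wms_path="/path/to/submit/u/user/pipelines_check/20230913T100700Z"):
    try:
        htc_backup_files(wms_path, subdir="backups")
    except OSError as exc:
        _LOG.warning("Backing up HTCondor files failed: %s", exc)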

356 

357def htc_escape(value): 

358 """Escape characters in given value based upon HTCondor syntax. 

359 

360 Parameters 

361 ---------- 

362 value : `~collections.abc.Any` 

363 Value that needs to have characters escaped if string. 

364 

365 Returns 

366 ------- 

367 new_value : `~collections.abc.Any` 

368 Given value with characters escaped appropriate for HTCondor if string. 

369 """ 

370 if isinstance(value, str): 

371 newval = value.replace('"', '""').replace("'", "''").replace("&quot;", '"') 

372 else: 

373 newval = value 

374 

375 return newval 

376 
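# A quick illustration (not part of the original module) of the escaping rules above:
# double quotes are doubled for HTCondor and non-string values pass through untouched.
def _example_escape():
    assert htc_escape('say "hi"') == 'say ""hi""'
    assert htc_escape("it's") == "it''s"
    assert htc_escape(42) == 42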

377 

378def htc_write_attribs(stream, attrs): 

379 """Write job attributes in HTCondor format to writeable stream. 

380 

381 Parameters 

382 ---------- 

383 stream : `~io.TextIOBase` 

384 Output text stream (typically an open file). 

385 attrs : `dict` 

386 HTCondor job attributes (dictionary of attribute key, value). 

387 """ 

388 for key, value in attrs.items(): 

389 # Make sure strings are syntactically correct for HTCondor. 

390 if isinstance(value, str): 

391 pval = f'"{htc_escape(value)}"' 

392 else: 

393 pval = value 

394 

395 print(f"+{key} = {pval}", file=stream) 

396 

397 

398def htc_write_condor_file(filename, job_name, job, job_attrs): 

399 """Write an HTCondor submit file. 

400 

401 Parameters 

402 ---------- 

403 filename : `str` 

404 Filename for the HTCondor submit file. 

405 job_name : `str` 

406 Job name to use in submit file. 

407 job : `RestrictedDict` 

408 Submit script information. 

409 job_attrs : `dict` 

410 Job attributes. 

411 """ 

412 os.makedirs(os.path.dirname(filename), exist_ok=True) 

413 with open(filename, "w") as fh: 

414 for key, value in job.items(): 

415 if value is not None: 

416 if key in HTC_QUOTE_KEYS: 

417 print(f'{key}="{htc_escape(value)}"', file=fh) 

418 else: 

419 print(f"{key}={value}", file=fh) 

420 for key in ["output", "error", "log"]: 

421 if key not in job: 

422 filename = f"{job_name}.$(Cluster).${key[:3]}" 

423 print(f"{key}={filename}", file=fh) 

424 

425 if job_attrs is not None: 

426 htc_write_attribs(fh, job_attrs) 

427 print("queue", file=fh) 

428 
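# A minimal sketch (not part of the original module) of writing a submit file; the
# output directory, job name, and attribute values shown here are hypothetical.
def _example_write_condor_file(submit_path="/tmp/bps_example"):
    cmds = RestrictedDict(HTC_VALID_JOB_KEYS, {"executable": "/bin/echo", "arguments": "hello"})
    attrs = {"bps_job_label": "demo", "bps_run": "example_run"}
    htc_write_condor_file(os.path.join(submit_path, "demo.sub"), "demo", cmds, attrs)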

429 

430def htc_version(): 

431 """Return the version given by the HTCondor API. 

432 

433 Returns 

434 ------- 

435 version : `str` 

436 HTCondor version as a "major.minor.patch" string. 

437 

438 Raises 

439 ------ 

440 RuntimeError 

441 Raised if unable to parse the HTCondor API version string. 

442 """ 

443 # Example string returned by htcondor.version: 

444 # $CondorVersion: 8.8.6 Nov 13 2019 BuildID: 489199 PackageID: 8.8.6-1 $ 

445 version_info = re.match(r"\$CondorVersion: (\d+).(\d+).(\d+)", htcondor.version()) 

446 if version_info is None: 

447 raise RuntimeError("Problems parsing condor version") 

448 return f"{int(version_info.group(1))}.{int(version_info.group(2))}.{int(version_info.group(3))}" 

449 

450 

451def htc_submit_dag(sub): 

452 """Submit job for execution. 

453 

454 Parameters 

455 ---------- 

456 sub : `htcondor.Submit` 

457 An object representing a job submit description. 

458 

459 Returns 

460 ------- 

461 schedd_dag_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]] 

462 Information about the submitted DAGMan job where, for the scheduler it 

463 was submitted to, the local HTCondor job id is mapped to its 

464 classad. 

465 """ 

466 coll = htcondor.Collector() 

467 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

468 schedd = htcondor.Schedd(schedd_ad) 

469 

470 jobs_ads = [] 

471 with schedd.transaction() as txn: 

472 sub.queue(txn, ad_results=jobs_ads) 

473 

474 # Submit.queue() above will raise RuntimeError if submission fails, so 

475 # 'jobs_ads' should contain the ad at this point. 

476 dag_ad = jobs_ads[0] 

477 

478 # Sadly, the ClassAd from Submit.queue() (see above) does not have 

479 # 'GlobalJobId' so we need to run a regular query to get it anyway. 

480 schedd_name = schedd_ad["Name"] 

481 schedd_dag_info = condor_q( 

482 constraint=f"ClusterId == {dag_ad['ClusterId']}", schedds={schedd_name: schedd} 

483 ) 

484 return schedd_dag_info 

485 
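# A hedged end-to-end sketch (not part of the original module): create a DAGMan submit
# description and submit it. The DAG path is hypothetical and the snippet assumes an
# HTCondor version providing htcondor.Submit.from_dag() (8.9.3 or newer).
def _example_submit_workflow(dag_filename="/path/to/submit/u/user/run/example.dag"):
    sub = htc_create_submit_from_dag(dag_filename, {})
    schedd_dag_info = htc_submit_dag(sub)
    for schedd_name, jobs in schedd_dag_info.items():
        for job_id, ad in jobs.items():
            _LOG.info("Submitted DAGMan job %s to %s (%s)", job_id, schedd_name, ad.get("GlobalJobId"))
    return schedd_dag_info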

486 

487def htc_create_submit_from_dag(dag_filename, submit_options=None): 

488 """Create a DAGMan job submit description. 

489 

490 Parameters 

491 ---------- 

492 dag_filename : `str` 

493 Name of file containing HTCondor DAG commands. 

494 submit_options : `dict` [`str`, Any], optional 

495 Contains extra options for command line (Value of None means flag). 

496 

497 Returns 

498 ------- 

499 sub : `htcondor.Submit` 

500 An object representing a job submit description. 

501 

502 Notes 

503 ----- 

504 Use with HTCondor versions which support htcondor.Submit.from_dag(), 

505 i.e., 8.9.3 or newer. 

506 """ 

507 return htcondor.Submit.from_dag(dag_filename, submit_options) 

508 

509 

510def htc_create_submit_from_cmd(dag_filename, submit_options=None): 

511 """Create a DAGMan job submit description. 

512 

513 Create a DAGMan job submit description by calling ``condor_submit_dag`` 

514 on given DAG description file. 

515 

516 Parameters 

517 ---------- 

518 dag_filename : `str` 

519 Name of file containing HTCondor DAG commands. 

520 submit_options : `dict` [`str`, Any], optional 

521 Contains extra options for command line (Value of None means flag). 

522 

523 Returns 

524 ------- 

525 sub : `htcondor.Submit` 

526 An object representing a job submit description. 

527 

528 Notes 

529 ----- 

530 Use with HTCondor versions which do not support htcondor.Submit.from_dag(), 

531 i.e., older than 8.9.3. 

532 """ 

533 # Run command line condor_submit_dag command. 

534 cmd = "condor_submit_dag -f -no_submit -notification never -autorescue 1 -UseDagDir -no_recurse " 

535 

536 if submit_options is not None: 

537 for opt, val in submit_options.items(): 

538 cmd += f" -{opt} {val or ''}" 

539 cmd += f" {dag_filename}" 

540 

541 process = subprocess.Popen( 

542 cmd.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding="utf-8" 

543 ) 

544 process.wait() 

545 

546 if process.returncode != 0: 

547 print(f"Exit code: {process.returncode}") 

548 print(process.communicate()[0]) 

549 raise RuntimeError("Problems running condor_submit_dag") 

550 

551 return htc_create_submit_from_file(f"{dag_filename}.condor.sub") 

552 

553 

554def htc_create_submit_from_file(submit_file): 

555 """Parse a submission file. 

556 

557 Parameters 

558 ---------- 

559 submit_file : `str` 

560 Name of the HTCondor submit file. 

561 

562 Returns 

563 ------- 

564 sub : `htcondor.Submit` 

565 An object representing a job submit description. 

566 """ 

567 descriptors = {} 

568 with open(submit_file) as fh: 

569 for line in fh: 

570 line = line.strip() 

571 if not line.startswith("#") and not line == "queue": 

572 (key, val) = re.split(r"\s*=\s*", line, 1) 

573 descriptors[key] = val 

574 

575 # Avoid UserWarning: the line 'copy_to_spool = False' was 

576 # unused by Submit object. Is it a typo? 

577 try: 

578 del descriptors["copy_to_spool"] 

579 except KeyError: 

580 pass 

581 

582 return htcondor.Submit(descriptors) 

583 
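# A hedged sketch (not part of the original module) of choosing between the two creation
# paths based on the running HTCondor version, mirroring the Notes above; the DAG path
# is hypothetical.
def _example_create_submit(dag_filename="/path/to/submit/u/user/run/example.dag"):
    major, minor, patch = (int(part) for part in htc_version().split("."))
    if (major, minor, patch) >= (8, 9, 3):
        return htc_create_submit_from_dag(dag_filename, {})
    return htc_create_submit_from_cmd(dag_filename, {})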

584 

585def _htc_write_job_commands(stream, name, jobs): 

586 """Output the DAGMan job lines for single job in DAG. 

587 

588 Parameters 

589 ---------- 

590 stream : `~io.TextIOBase` 

591 Writeable text stream (typically an opened file). 

592 name : `str` 

593 Job name. 

594 jobs : `RestrictedDict` 

595 DAG job keys and values. 

596 """ 

597 if "pre" in jobs: 

598 print( 

599 f"SCRIPT {jobs['pre'].get('defer', '')} PRE {name}" 

600 f"{jobs['pre']['executable']} {jobs['pre'].get('arguments', '')}", 

601 file=stream, 

602 ) 

603 

604 if "post" in jobs: 

605 print( 

606 f"SCRIPT {jobs['post'].get('defer', '')} PRE {name}" 

607 f"{jobs['post']['executable']} {jobs['post'].get('arguments', '')}", 

608 file=stream, 

609 ) 

610 

611 if "vars" in jobs: 

612 for key, value in jobs["vars"]: 

613 print(f'VARS {name} {key}="{htc_escape(value)}"', file=stream) 

614 

615 if "pre_skip" in jobs: 

616 print(f"PRE_SKIP {name} {jobs['pre_skip']}", file=stream) 

617 

618 if "retry" in jobs and jobs["retry"]: 

619 print(f"RETRY {name} {jobs['retry']} ", end="", file=stream) 

620 if "retry_unless_exit" in jobs: 

621 print(f"UNLESS-EXIT {jobs['retry_unless_exit']}", end="", file=stream) 

622 print("\n", file=stream) 

623 

624 if "abort_dag_on" in jobs and jobs["abort_dag_on"]: 

625 print( 

626 f"ABORT-DAG-ON {name} {jobs['abort_dag_on']['node_exit']}" 

627 f" RETURN {jobs['abort_dag_on']['abort_exit']}", 

628 file=stream, 

629 ) 

630 
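# A small illustration (not part of the original module) of the DAG lines produced for
# one node; io.StringIO stands in for the DAG file stream and the node name is made up.
def _example_dag_job_lines():
    import io

    dagcmds = RestrictedDict(HTC_VALID_JOB_DAG_KEYS, {"retry": 3, "retry_unless_exit": 2})
    stream = io.StringIO()
    _htc_write_job_commands(stream, "pipetaskInit", dagcmds)
    return stream.getvalue()  # "RETRY pipetaskInit 3 UNLESS-EXIT 2" plus trailing newlines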

631 

632class HTCJob: 

633 """HTCondor job for use in building DAG. 

634 

635 Parameters 

636 ---------- 

637 name : `str` 

638 Name of the job 

639 label : `str` 

640 Label that can be used for grouping or lookup. 

641 initcmds : `RestrictedDict` 

642 Initial job commands for submit file. 

643 initdagcmds : `RestrictedDict` 

644 Initial commands for job inside DAG. 

645 initattrs : `dict` 

646 Initial dictionary of job attributes. 

647 """ 

648 

649 def __init__(self, name, label=None, initcmds=(), initdagcmds=(), initattrs=None): 

650 self.name = name 

651 self.label = label 

652 self.cmds = RestrictedDict(HTC_VALID_JOB_KEYS, initcmds) 

653 self.dagcmds = RestrictedDict(HTC_VALID_JOB_DAG_KEYS, initdagcmds) 

654 self.attrs = initattrs 

655 self.subfile = None 

656 

657 def __str__(self): 

658 return self.name 

659 

660 def add_job_cmds(self, new_commands): 

661 """Add commands to Job (overwrite existing). 

662 

663 Parameters 

664 ---------- 

665 new_commands : `dict` 

666 Submit file commands to be added to Job. 

667 """ 

668 self.cmds.update(new_commands) 

669 

670 def add_dag_cmds(self, new_commands): 

671 """Add DAG commands to Job (overwrite existing). 

672 

673 Parameters 

674 ---------- 

675 new_commands : `dict` 

676 DAG file commands to be added to Job 

677 """ 

678 self.dagcmds.update(new_commands) 

679 

680 def add_job_attrs(self, new_attrs): 

681 """Add attributes to Job (overwrite existing). 

682 

683 Parameters 

684 ---------- 

685 new_attrs : `dict` 

686 Attributes to be added to Job 

687 """ 

688 if self.attrs is None: 

689 self.attrs = {} 

690 if new_attrs: 

691 self.attrs.update(new_attrs) 

692 

693 def write_submit_file(self, submit_path, job_subdir=""): 

694 """Write job description to submit file. 

695 

696 Parameters 

697 ---------- 

698 submit_path : `str` 

699 Prefix path for the submit file. 

700 job_subdir : `str`, optional 

701 Template for job subdir. 

702 """ 

703 if not self.subfile: 

704 self.subfile = f"{self.name}.sub" 

705 job_subdir = job_subdir.format(self=self) 

706 if job_subdir: 

707 self.subfile = os.path.join(job_subdir, self.subfile) 

708 htc_write_condor_file(os.path.join(submit_path, self.subfile), self.name, self.cmds, self.attrs) 

709 

710 def write_dag_commands(self, stream): 

711 """Write DAG commands for single job to output stream. 

712 

713 Parameters 

714 ---------- 

715 stream : `IO` or `str` 

716 Output Stream 

717 """ 

718 print(f"JOB {self.name} {self.subfile}", file=stream) 

719 _htc_write_job_commands(stream, self.name, self.dagcmds) 

720 

721 def dump(self, fh): 

722 """Dump job information to output stream. 

723 

724 Parameters 

725 ---------- 

726 fh : `~io.TextIOBase` 

727 Output stream 

728 """ 

729 printer = pprint.PrettyPrinter(indent=4, stream=fh) 

730 printer.pprint(self.name) 

731 printer.pprint(self.cmds) 

732 printer.pprint(self.attrs) 

733 

734 

735class HTCDag(networkx.DiGraph): 

736 """HTCondor DAG. 

737 

738 Parameters 

739 ---------- 

740 data : networkx.DiGraph.data 

741 Initial graph. 

742 name : `str` 

743 Name for DAG. 

744 """ 

745 

746 def __init__(self, data=None, name=""): 

747 super().__init__(data=data, name=name) 

748 

749 self.graph["attr"] = {} 

750 self.graph["run_id"] = None 

751 self.graph["submit_path"] = None 

752 self.graph["final_job"] = None 

753 

754 def __str__(self): 

755 """Represent basic DAG info as string. 

756 

757 Returns 

758 ------- 

759 info : `str` 

760 String containing basic DAG info. 

761 """ 

762 return f"{self.graph['name']} {len(self)}" 

763 

764 def add_attribs(self, attribs=None): 

765 """Add attributes to the DAG. 

766 

767 Parameters 

768 ---------- 

769 attribs : `dict` 

770 DAG attributes 

771 """ 

772 if attribs is not None: 

773 self.graph["attr"].update(attribs) 

774 

775 def add_job(self, job, parent_names=None, child_names=None): 

776 """Add an HTCJob to the HTCDag. 

777 

778 Parameters 

779 ---------- 

780 job : `HTCJob` 

781 HTCJob to add to the HTCDag 

782 parent_names : `~collections.abc.Iterable` [`str`], optional 

783 Names of parent jobs 

784 child_names : `~collections.abc.Iterable` [`str`], optional 

785 Names of child jobs 

786 """ 

787 assert isinstance(job, HTCJob) 

788 

789 # Add dag level attributes to each job 

790 job.add_job_attrs(self.graph["attr"]) 

791 

792 self.add_node(job.name, data=job) 

793 

794 if parent_names is not None: 

795 self.add_job_relationships(parent_names, [job.name]) 

796 

797 if child_names is not None: 

798 self.add_job_relationships([job.name], child_names) 

799 

800 def add_job_relationships(self, parents, children): 

801 """Add DAG edge between parents and children jobs. 

802 

803 Parameters 

804 ---------- 

805 parents : `list` [`str`] 

806 Contains parent job name(s). 

807 children : `list` [`str`] 

808 Contains children job name(s). 

809 """ 

810 self.add_edges_from(itertools.product(parents, children)) 

811 

812 def add_final_job(self, job): 

813 """Add an HTCJob for the FINAL job in HTCDag. 

814 

815 Parameters 

816 ---------- 

817 job : `HTCJob` 

818 HTCJob to add to the HTCDag as a FINAL job. 

819 """ 

820 # Add dag level attributes to each job 

821 job.add_job_attrs(self.graph["attr"]) 

822 

823 self.graph["final_job"] = job 

824 

825 def del_job(self, job_name): 

826 """Delete the job from the DAG. 

827 

828 Parameters 

829 ---------- 

830 job_name : `str` 

831 Name of job in DAG to delete 

832 """ 

833 # Reconnect edges around node to delete 

834 parents = self.predecessors(job_name) 

835 children = self.successors(job_name) 

836 self.add_edges_from(itertools.product(parents, children)) 

837 

838 # Delete job node (which deletes its edges). 

839 self.remove_node(job_name) 

840 

841 def write(self, submit_path, job_subdir=""): 

842 """Write DAG to a file. 

843 

844 Parameters 

845 ---------- 

846 submit_path : `str` 

847 Prefix path for dag filename to be combined with DAG name. 

848 job_subdir : `str`, optional 

849 Template for job subdir. 

850 """ 

851 self.graph["submit_path"] = submit_path 

852 self.graph["dag_filename"] = os.path.join(submit_path, f"{self.graph['name']}.dag") 

853 os.makedirs(submit_path, exist_ok=True) 

854 with open(self.graph["dag_filename"], "w") as fh: 

855 for _, nodeval in self.nodes().items(): 

856 job = nodeval["data"] 

857 job.write_submit_file(submit_path, job_subdir) 

858 job.write_dag_commands(fh) 

859 for edge in self.edges(): 

860 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh) 

861 print(f"DOT {self.name}.dot", file=fh) 

862 print(f"NODE_STATUS_FILE {self.name}.node_status", file=fh) 

863 

864 # Add bps attributes to dag submission 

865 for key, value in self.graph["attr"].items(): 

866 print(f'SET_JOB_ATTR {key}= "{htc_escape(value)}"', file=fh) 

867 

868 if self.graph["final_job"]: 

869 job = self.graph["final_job"] 

870 job.write_submit_file(submit_path, job_subdir) 

871 print(f"FINAL {job.name} {job.subfile}", file=fh) 

872 if "pre" in job.dagcmds: 

873 print(f"SCRIPT PRE {job.name} {job.dagcmds['pre']}", file=fh) 

874 if "post" in job.dagcmds: 

875 print(f"SCRIPT POST {job.name} {job.dagcmds['post']}", file=fh) 

876 

877 def dump(self, fh): 

878 """Dump DAG info to output stream. 

879 

880 Parameters 

881 ---------- 

882 fh : `io.IO` or `str` 

883 Where to dump DAG info as text. 

884 """ 

885 for key, value in self.graph.items(): 

886 print(f"{key}={value}", file=fh) 

887 for name, data in self.nodes().items(): 

888 print(f"{name}:", file=fh) 

889 data["data"].dump(fh) 

890 for edge in self.edges(): 

891 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh) 

892 if self.graph["final_job"]: 

893 print(f'FINAL {self.graph["final_job"].name}:', file=fh) 

894 self.graph["final_job"].dump(fh) 

895 

896 def write_dot(self, filename): 

897 """Write a dot version of the DAG. 

898 

899 Parameters 

900 ---------- 

901 filename : `str` 

902 dot filename 

903 """ 

904 pos = networkx.nx_agraph.graphviz_layout(self) 

905 networkx.draw(self, pos=pos) 

906 networkx.drawing.nx_pydot.write_dot(self, filename) 

907 
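# A hedged sketch (not part of the original module) tying HTCJob and HTCDag together;
# job names, labels, attribute values, and the output directory are hypothetical.
def _example_build_dag(submit_path="/tmp/bps_example_dag"):
    dag = HTCDag(name="example")
    dag.add_attribs({"bps_run": "example_run"})
    parent = HTCJob("pipetaskInit", label="pipetaskInit", initcmds={"executable": "/bin/true"})
    child = HTCJob("calibrate_visit_903342", label="calibrate", initcmds={"executable": "/bin/true"})
    dag.add_job(parent)
    dag.add_job(child)
    dag.add_job_relationships([parent.name], [child.name])
    dag.write(submit_path)
    return dag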

908 

909def condor_q(constraint=None, schedds=None): 

910 """Query HTCondor for current jobs. 

911 

912 Parameters 

913 ---------- 

914 constraint : `str`, optional 

915 Constraints to be passed to job query. 

916 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

917 HTCondor schedulers to query for job information. If None 

918 (default), the query will be run against local scheduler only. 

919 

920 Returns 

921 ------- 

922 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]] 

923 Information about jobs satisfying the search criteria where for each 

924 Scheduler, local HTCondor job ids are mapped to their respective 

925 classads. 

926 """ 

927 if not schedds: 

928 coll = htcondor.Collector() 

929 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

930 schedds = {schedd_ad["Name"]: htcondor.Schedd(schedd_ad)} 

931 

932 queries = [schedd.xquery(requirements=constraint) for schedd in schedds.values()] 

933 

934 job_info = {} 

935 for query in htcondor.poll(queries): 

936 schedd_name = query.tag() 

937 job_info.setdefault(schedd_name, {}) 

938 for job_ad in query.nextAdsNonBlocking(): 

939 del job_ad["Environment"] 

940 del job_ad["Env"] 

941 id_ = f"{int(job_ad['ClusterId'])}.{int(job_ad['ProcId'])}" 

942 job_info[schedd_name][id_] = dict(job_ad) 

943 _LOG.debug("condor_q returned %d jobs", sum(len(val) for val in job_info.values())) 

944 

945 # When returning the results filter out entries for schedulers with no jobs 

946 # matching the search criteria. 

947 return {key: val for key, val in job_info.items() if val} 

948 

949 

950def condor_history(constraint=None, schedds=None): 

951 """Get information about completed jobs from HTCondor history. 

952 

953 Parameters 

954 ---------- 

955 constraint : `str`, optional 

956 Constraints to be passed to job query. 

957 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

958 HTCondor schedulers to query for job information. If None 

959 (default), the query will be run against the history file of 

960 the local scheduler only. 

961 

962 Returns 

963 ------- 

964 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]] 

965 Information about jobs satisfying the search criteria where for each 

966 Scheduler, local HTCondor job ids are mapped to their respective 

967 classads. 

968 """ 

969 if not schedds: 

970 coll = htcondor.Collector() 

971 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

972 schedds = {schedd_ad["Name"]: htcondor.Schedd(schedd_ad)} 

973 

974 job_info = {} 

975 for schedd_name, schedd in schedds.items(): 

976 job_info[schedd_name] = {} 

977 for job_ad in schedd.history(requirements=constraint, projection=[]): 

978 del job_ad["Environment"] 

979 del job_ad["Env"] 

980 id_ = f"{int(job_ad['ClusterId'])}.{int(job_ad['ProcId'])}" 

981 job_info[schedd_name][id_] = dict(job_ad) 

982 _LOG.debug("condor_history returned %d jobs", sum(len(val) for val in job_info.values())) 

983 

984 # When returning the results filter out entries for schedulers with no jobs 

985 # matching the search criteria. 

986 return {key: val for key, val in job_info.items() if val} 

987 

988 

989def condor_search(constraint=None, hist=None, schedds=None): 

990 """Search for running and finished jobs satisfying given criteria. 

991 

992 Parameters 

993 ---------- 

994 constraint : `str`, optional 

995 Constraints to be passed to job query. 

996 hist : `float`, optional 

997 Limit the history search to this many days. 

998 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

999 HTCondor schedulers to query for job information. 

1000 If None (default), only the local scheduler will be queried. 

1001 

1002 Returns 

1003 ------- 

1004 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]] 

1005 Information about jobs satisfying the search criteria where for each 

1006 Scheduler, local HTCondor job ids are mapped to their respective 

1007 classads. 

1008 """ 

1009 if not schedds: 

1010 coll = htcondor.Collector() 

1011 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1012 schedds = {schedd_ad["Name"]: htcondor.Schedd(locate_ad=schedd_ad)} 

1013 

1014 job_info = condor_q(constraint=constraint, schedds=schedds) 

1015 if hist is not None: 

1016 epoch = (datetime.now() - timedelta(days=hist)).timestamp() 

1017 constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})" 

1018 hist_info = condor_history(constraint, schedds=schedds) 

1019 update_job_info(job_info, hist_info) 

1020 return job_info 

1021 
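# A hedged sketch (not part of the original module) of searching both the queue and one
# day of history; the 'bps_run' attribute in the constraint is an assumption about how
# jobs were tagged at submission time.
def _example_search_run(run_name="example_run"):
    constraint = f'bps_run == "{run_name}"'
    job_info = condor_search(constraint=constraint, hist=1.0)
    for schedd_name, jobs in job_info.items():
        _LOG.info("%s: %d matching jobs", schedd_name, len(jobs))
    return job_info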

1022 

1023def condor_status(constraint=None, coll=None): 

1024 """Get information about HTCondor pool. 

1025 

1026 Parameters 

1027 ---------- 

1028 constraint : `str`, optional 

1029 Constraints to be passed to the query. 

1030 coll : `htcondor.Collector`, optional 

1031 Object representing HTCondor collector daemon. 

1032 

1033 Returns 

1034 ------- 

1035 pool_info : `dict` [`str`, `dict` [`str`, Any]] 

1036 Mapping between HTCondor slot names and slot information (classAds). 

1037 """ 

1038 if coll is None: 

1039 coll = htcondor.Collector() 

1040 try: 

1041 pool_ads = coll.query(constraint=constraint) 

1042 except OSError as ex: 

1043 raise RuntimeError(f"Problem querying the Collector. (Constraint='{constraint}')") from ex 

1044 

1045 pool_info = {} 

1046 for slot in pool_ads: 

1047 pool_info[slot["name"]] = dict(slot) 

1048 _LOG.debug("condor_status returned %d ads", len(pool_info)) 

1049 return pool_info 

1050 

1051 

1052def update_job_info(job_info, other_info): 

1053 """Update results of a job query with results from another query. 

1054 

1055 Parameters 

1056 ---------- 

1057 job_info : `dict` [`str`, `dict` [`str`, Any]] 

1058 Results of the job query that needs to be updated. 

1059 other_info : `dict` [`str`, `dict` [`str`, Any]] 

1060 Results of the other job query. 

1061 

1062 Returns 

1063 ------- 

1064 job_info : `dict` [`str`, `dict` [`str`, Any]] 

1065 The updated results. 

1066 """ 

1067 for schedd_name, others in other_info.items(): 

1068 try: 

1069 jobs = job_info[schedd_name] 

1070 except KeyError: 

1071 job_info[schedd_name] = others 

1072 else: 

1073 for id_, ad in others.items(): 

1074 jobs.setdefault(id_, {}).update(ad) 

1075 return job_info 

1076 
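# A small worked illustration (not part of the original module) of how history results
# are folded into queue results; scheduler names and job ids are made up.
def _example_update_job_info():
    queued = {"sched1.example.com": {"100.0": {"JobStatus": JobStatus.RUNNING}}}
    history = {
        "sched1.example.com": {"99.0": {"JobStatus": JobStatus.COMPLETED}},
        "sched2.example.com": {"5.0": {"JobStatus": JobStatus.COMPLETED}},
    }
    merged = update_job_info(queued, history)
    assert set(merged["sched1.example.com"]) == {"100.0", "99.0"}
    assert "sched2.example.com" in merged
    return merged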

1077 

1078def summary_from_dag(dir_name): 

1079 """Build bps_run_summary string from dag file. 

1080 

1081 Parameters 

1082 ---------- 

1083 dir_name : `str` 

1084 Path that includes dag file for a run. 

1085 

1086 Returns 

1087 ------- 

1088 summary : `str` 

1089 Semi-colon separated list of job labels and counts. 

1090 (Same format as saved in dag classad.) 

1091 job_name_to_pipetask : `dict` [`str`, `str`] 

1092 Mapping of job names to job labels 

1093 """ 

1094 dag = next(Path(dir_name).glob("*.dag")) 

1095 

1096 # Later code depends upon insertion order 

1097 counts = defaultdict(int) 

1098 job_name_to_pipetask = {} 

1099 try: 

1100 with open(dag) as fh: 

1101 for line in fh: 

1102 if line.startswith("JOB"): 

1103 m = re.match(r"JOB ([^\s]+) jobs/([^/]+)/", line) 

1104 if m: 

1105 label = m.group(2) 

1106 if label == "init": 

1107 label = "pipetaskInit" 

1108 job_name_to_pipetask[m.group(1)] = label 

1109 counts[label] += 1 

1110 else: # Check if Pegasus submission 

1111 m = re.match(r"JOB ([^\s]+) ([^\s]+)", line) 

1112 if m: 

1113 label = pegasus_name_to_label(m.group(1)) 

1114 job_name_to_pipetask[m.group(1)] = label 

1115 counts[label] += 1 

1116 else: 

1117 _LOG.warning("Parse DAG: unmatched job line: %s", line) 

1118 elif line.startswith("FINAL"): 

1119 m = re.match(r"FINAL ([^\s]+) jobs/([^/]+)/", line) 

1120 if m: 

1121 label = m.group(2) 

1122 job_name_to_pipetask[m.group(1)] = label 

1123 counts[label] += 1 

1124 

1125 except (OSError, PermissionError, StopIteration): 

1126 pass 

1127 

1128 summary = ";".join([f"{name}:{counts[name]}" for name in counts]) 

1129 _LOG.debug("summary_from_dag: %s %s", summary, job_name_to_pipetask) 

1130 return summary, job_name_to_pipetask 

1131 

1132 

1133def pegasus_name_to_label(name): 

1134 """Convert pegasus job name to a label for the report. 

1135 

1136 Parameters 

1137 ---------- 

1138 name : `str` 

1139 Name of job. 

1140 

1141 Returns 

1142 ------- 

1143 label : `str` 

1144 Label for job. 

1145 """ 

1146 label = "UNK" 

1147 if name.startswith("create_dir") or name.startswith("stage_in") or name.startswith("stage_out"): 

1148 label = "pegasus" 

1149 else: 

1150 m = re.match(r"pipetask_(\d+_)?([^_]+)", name) 

1151 if m: 

1152 label = m.group(2) 

1153 if label == "init": 

1154 label = "pipetaskInit" 

1155 

1156 return label 

1157 
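# A few concrete cases (not part of the original module) showing how Pegasus job names
# map to report labels under the regular expression above; the names are made up.
def _example_pegasus_labels():
    assert pegasus_name_to_label("stage_in_local_0") == "pegasus"
    assert pegasus_name_to_label("pipetask_12_calibrate_902342") == "calibrate"
    assert pegasus_name_to_label("pipetask_init") == "pipetaskInit"
    assert pegasus_name_to_label("register_viz") == "UNK"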

1158 

1159def read_dag_status(wms_path): 

1160 """Read the node status file for DAG summary information 

1161 

1162 Parameters 

1163 ---------- 

1164 wms_path : `str` 

1165 Path that includes node status file for a run. 

1166 

1167 Returns 

1168 ------- 

1169 dag_ad : `dict` [`str`, Any] 

1170 DAG summary information. 

1171 """ 

1172 dag_ad = {} 

1173 

1174 # While this is probably more up to date than dag classad, only read from 

1175 # file if need to. 

1176 try: 

1177 try: 

1178 node_stat_file = next(Path(wms_path).glob("*.node_status")) 

1179 _LOG.debug("Reading Node Status File %s", node_stat_file) 

1180 with open(node_stat_file) as infh: 

1181 dag_ad = classad.parseNext(infh) # pylint: disable=E1101 

1182 except StopIteration: 

1183 pass 

1184 

1185 if not dag_ad: 

1186 # Pegasus check here 

1187 try: 

1188 metrics_file = next(Path(wms_path).glob("*.dag.metrics")) 

1189 with open(metrics_file) as infh: 

1190 metrics = json.load(infh) 

1191 dag_ad["NodesTotal"] = metrics.get("jobs", 0) 

1192 dag_ad["NodesFailed"] = metrics.get("jobs_failed", 0) 

1193 dag_ad["NodesDone"] = metrics.get("jobs_succeeded", 0) 

1194 dag_ad["pegasus_version"] = metrics.get("planner_version", "") 

1195 except StopIteration: 

1196 try: 

1197 metrics_file = next(Path(wms_path).glob("*.metrics")) 

1198 with open(metrics_file) as infh: 

1199 metrics = json.load(infh) 

1200 dag_ad["NodesTotal"] = metrics["wf_metrics"]["total_jobs"] 

1201 dag_ad["pegasus_version"] = metrics.get("version", "") 

1202 except StopIteration: 

1203 pass 

1204 except (OSError, PermissionError): 

1205 pass 

1206 

1207 _LOG.debug("read_dag_status: %s", dag_ad) 

1208 return dict(dag_ad) 

1209 

1210 

1211def read_node_status(wms_path): 

1212 """Read entire node status file. 

1213 

1214 Parameters 

1215 ---------- 

1216 wms_path : `str` 

1217 Path that includes node status file for a run. 

1218 

1219 Returns 

1220 ------- 

1221 jobs : `dict` [`str`, Any] 

1222 Information about individual DAG nodes, keyed by HTCondor job id. 

1223 """ 

1224 # Get jobid info from other places to fill in gaps in info from node_status 

1225 _, job_name_to_pipetask = summary_from_dag(wms_path) 

1226 wms_workflow_id, loginfo = read_dag_log(wms_path) 

1227 loginfo = read_dag_nodes_log(wms_path) 

1228 _LOG.debug("loginfo = %s", loginfo) 

1229 job_name_to_id = {} 

1230 for jid, jinfo in loginfo.items(): 

1231 if "LogNotes" in jinfo: 

1232 m = re.match(r"DAG Node: ([^\s]+)", jinfo["LogNotes"]) 

1233 if m: 

1234 job_name_to_id[m.group(1)] = jid 

1235 jinfo["DAGNodeName"] = m.group(1) 

1236 

1237 try: 

1238 node_status = next(Path(wms_path).glob("*.node_status")) 

1239 except StopIteration: 

1240 return loginfo 

1241 

1242 jobs = {} 

1243 fake_id = -1.0 # For nodes that do not yet have a job id, give fake one 

1244 try: 

1245 with open(node_status) as fh: 

1246 ads = classad.parseAds(fh) 

1247 

1248 for jclassad in ads: 

1249 if jclassad["Type"] == "DagStatus": 

1250 # skip DAG summary 

1251 pass 

1252 elif "Node" not in jclassad: 

1253 if jclassad["Type"] != "StatusEnd": 

1254 _LOG.debug("Key 'Node' not in classad: %s", jclassad) 

1255 break 

1256 else: 

1257 if jclassad["Node"] in job_name_to_pipetask: 

1258 try: 

1259 label = job_name_to_pipetask[jclassad["Node"]] 

1260 except KeyError: 

1261 _LOG.error("%s not in %s", jclassad["Node"], job_name_to_pipetask.keys()) 

1262 raise 

1263 elif "_" in jclassad["Node"]: 

1264 label = jclassad["Node"].split("_")[1] 

1265 else: 

1266 label = jclassad["Node"] 

1267 

1268 # Make job info as if came from condor_q 

1269 if jclassad["Node"] in job_name_to_id: 

1270 job_id = job_name_to_id[jclassad["Node"]] 

1271 else: 

1272 job_id = str(fake_id) 

1273 fake_id -= 1 

1274 

1275 job = dict(jclassad) 

1276 job["ClusterId"] = int(float(job_id)) 

1277 job["DAGManJobID"] = wms_workflow_id 

1278 job["DAGNodeName"] = jclassad["Node"] 

1279 job["bps_job_label"] = label 

1280 

1281 jobs[str(job_id)] = job 

1282 except (OSError, PermissionError): 

1283 pass 

1284 

1285 return jobs 

1286 

1287 

1288def read_dag_log(wms_path): 

1289 """Read job information from the DAGMan log file. 

1290 

1291 Parameters 

1292 ---------- 

1293 wms_path : `str` 

1294 Path containing the DAGMan log file. 

1295 

1296 Returns 

1297 ------- 

1298 wms_workflow_id : `str` 

1299 HTCondor job id (i.e., <ClusterId>.<ProcId>) of the DAGMan job. 

1300 dag_info : `dict` [`str`, `~collections.abc.Any`] 

1301 HTCondor job information read from the log file mapped to HTCondor 

1302 job id. 

1303 

1304 Raises 

1305 ------ 

1306 FileNotFoundError 

1307 If cannot find DAGMan log in given wms_path. 

1308 """ 

1309 wms_workflow_id = 0 

1310 dag_info = {} 

1311 

1312 path = Path(wms_path) 

1313 if path.exists(): 

1314 try: 

1315 filename = next(path.glob("*.dag.dagman.log")) 

1316 except StopIteration as exc: 

1317 raise FileNotFoundError(f"DAGMan log not found in {wms_path}") from exc 

1318 _LOG.debug("dag node log filename: %s", filename) 

1319 

1320 info = {} 

1321 job_event_log = htcondor.JobEventLog(str(filename)) 

1322 for event in job_event_log.events(stop_after=0): 

1323 id_ = f"{event['Cluster']}.{event['Proc']}" 

1324 if id_ not in info: 

1325 info[id_] = {} 

1326 wms_workflow_id = id_ # taking last job id in case of restarts 

1327 info[id_].update(event) 

1328 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"] 

1329 

1330 # only save latest DAG job 

1331 dag_info = {wms_workflow_id: info[wms_workflow_id]} 

1332 for job in dag_info.values(): 

1333 _tweak_log_info(filename, job) 

1334 

1335 return wms_workflow_id, dag_info 

1336 
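# A hedged sketch (not part of the original module) of reading the DAGMan log to find
# the workflow id and its latest status; the submit directory path is hypothetical.
def _example_read_dag_log(wms_path="/path/to/submit/u/user/run"):
    try:
        wms_workflow_id, dag_info = read_dag_log(wms_path)
    except FileNotFoundError as exc:
        _LOG.warning("%s", exc)
        return None, {}
    if dag_info:
        status = dag_info[wms_workflow_id].get("JobStatus", JobStatus.UNEXPANDED)
        _LOG.info("DAGMan job %s has status %s", wms_workflow_id, JobStatus(status).name)
    return wms_workflow_id, dag_info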

1337 

1338def read_dag_nodes_log(wms_path): 

1339 """Read job information from the DAGMan nodes log file. 

1340 

1341 Parameters 

1342 ---------- 

1343 wms_path : `str` 

1344 Path containing the DAGMan nodes log file. 

1345 

1346 Returns 

1347 ------- 

1348 info : `dict` [`str`, Any] 

1349 HTCondor job information read from the log file mapped to HTCondor 

1350 job id. 

1351 

1352 Raises 

1353 ------ 

1354 FileNotFoundError 

1355 If cannot find DAGMan node log in given wms_path. 

1356 """ 

1357 try: 

1358 filename = next(Path(wms_path).glob("*.dag.nodes.log")) 

1359 except StopIteration as exc: 

1360 raise FileNotFoundError(f"DAGMan node log not found in {wms_path}") from exc 

1361 _LOG.debug("dag node log filename: %s", filename) 

1362 

1363 info = {} 

1364 job_event_log = htcondor.JobEventLog(str(filename)) 

1365 for event in job_event_log.events(stop_after=0): 

1366 id_ = f"{event['Cluster']}.{event['Proc']}" 

1367 if id_ not in info: 

1368 info[id_] = {} 

1369 info[id_].update(event) 

1370 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"] 

1371 

1372 # Add more condor_q-like info to info parsed from log file. 

1373 for job in info.values(): 

1374 _tweak_log_info(filename, job) 

1375 

1376 return info 

1377 

1378 

1379def read_dag_info(wms_path): 

1380 """Read custom DAGMan job information from the file. 

1381 

1382 Parameters 

1383 ---------- 

1384 wms_path : `str` 

1385 Path containing the file with the DAGMan job info. 

1386 

1387 Returns 

1388 ------- 

1389 dag_info : `dict` [`str`, `dict` [`str`, Any]] 

1390 HTCondor job information. 

1391 

1392 Raises 

1393 ------ 

1394 FileNotFoundError 

1395 If cannot find DAGMan job info file in the given location. 

1396 """ 

1397 try: 

1398 filename = next(Path(wms_path).glob("*.info.json")) 

1399 except StopIteration as exc: 

1400 raise FileNotFoundError(f"File with DAGMan job information not found in {wms_path}") from exc 

1401 _LOG.debug("DAGMan job information filename: %s", filename) 

1402 try: 

1403 with open(filename) as fh: 

1404 dag_info = json.load(fh) 

1405 except (OSError, PermissionError) as exc: 

1406 _LOG.debug("Retrieving DAGMan job information failed: %s", exc) 

1407 dag_info = {} 

1408 return dag_info 

1409 

1410 

1411def write_dag_info(filename, dag_info): 

1412 """Write custom job information about DAGMan job. 

1413 

1414 Parameters 

1415 ---------- 

1416 filename : `str` 

1417 Name of the file where the information will be stored. 

1418 dag_info : `dict` [`str`, `dict` [`str`, Any]] 

1419 Information about the DAGMan job. 

1420 """ 

1421 schedd_name = next(iter(dag_info)) 

1422 dag_id = next(iter(dag_info[schedd_name])) 

1423 dag_ad = dag_info[schedd_name][dag_id] 

1424 try: 

1425 with open(filename, "w") as fh: 

1426 info = { 

1427 schedd_name: { 

1428 dag_id: {"ClusterId": dag_ad["ClusterId"], "GlobalJobId": dag_ad["GlobalJobId"]} 

1429 } 

1430 } 

1431 json.dump(info, fh) 

1432 except (KeyError, OSError, PermissionError) as exc: 

1433 _LOG.debug("Persisting DAGMan job information failed: %s", exc) 

1434 
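# A hedged round-trip sketch (not part of the original module); the scheduler name,
# job id, and GlobalJobId value are made up but mirror the nested layout used above.
def _example_dag_info_round_trip(wms_path="/tmp/bps_example_dag"):
    os.makedirs(wms_path, exist_ok=True)
    dag_info = {
        "sched1.example.com": {
            "123.0": {"ClusterId": 123, "GlobalJobId": "sched1.example.com#123.0#1694599620"}
        }
    }
    write_dag_info(os.path.join(wms_path, "example.info.json"), dag_info)
    return read_dag_info(wms_path)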

1435 

1436def _tweak_log_info(filename, job): 

1437 """Massage the given job info has same structure as if came from condor_q. 

1438 

1439 Parameters 

1440 ---------- 

1441 filename : `pathlib.Path` 

1442 Name of the DAGMan log. 

1443 job : `dict` [ `str`, Any ] 

1444 Information about a single HTCondor job read from the log; modified 

1445 in place. 

1446 """ 

1447 _LOG.debug("_tweak_log_info: %s %s", filename, job) 

1448 try: 

1449 job["ClusterId"] = job["Cluster"] 

1450 job["ProcId"] = job["Proc"] 

1451 job["Iwd"] = str(filename.parent) 

1452 job["Owner"] = filename.owner() 

1453 if job["MyType"] == "ExecuteEvent": 

1454 job["JobStatus"] = JobStatus.RUNNING 

1455 elif job["MyType"] == "JobTerminatedEvent" or job["MyType"] == "PostScriptTerminatedEvent": 

1456 job["JobStatus"] = JobStatus.COMPLETED 

1457 try: 

1458 if not job["TerminatedNormally"]: 

1459 if "ReturnValue" in job: 

1460 job["ExitCode"] = job["ReturnValue"] 

1461 job["ExitBySignal"] = False 

1462 elif "TerminatedBySignal" in job: 

1463 job["ExitBySignal"] = True 

1464 job["ExitSignal"] = job["TerminatedBySignal"] 

1465 else: 

1466 _LOG.warning("Could not determine exit status for completed job: %s", job) 

1467 except KeyError as ex: 

1468 _LOG.error("Could not determine exit status for job (missing %s): %s", str(ex), job) 

1469 elif job["MyType"] == "SubmitEvent": 

1470 job["JobStatus"] = JobStatus.IDLE 

1471 elif job["MyType"] == "JobAbortedEvent": 

1472 job["JobStatus"] = JobStatus.REMOVED 

1473 else: 

1474 _LOG.debug("Unknown log event type: %s", job["MyType"]) 

1475 except KeyError: 

1476 _LOG.error("Missing key in job: %s", job) 

1477 raise 

1478 

1479 

1480def htc_check_dagman_output(wms_path): 

1481 """Check the DAGMan output for error messages. 

1482 

1483 Parameters 

1484 ---------- 

1485 wms_path : `str` 

1486 Directory containing the DAGman output file. 

1487 

1488 Returns 

1489 ------- 

1490 message : `str` 

1491 Message containing error messages from the DAGMan output. Empty 

1492 string if no messages. 

1493 

1494 Raises 

1495 ------ 

1496 FileNotFoundError 

1497 If cannot find DAGMan standard output file in given wms_path. 

1498 """ 

1499 try: 

1500 filename = next(Path(wms_path).glob("*.dag.dagman.out")) 

1501 except StopIteration as exc: 

1502 raise FileNotFoundError(f"DAGMan standard output file not found in {wms_path}") from exc 

1503 _LOG.debug("dag output filename: %s", filename) 

1504 

1505 message = "" 

1506 try: 

1507 with open(filename) as fh: 

1508 last_submit_failed = "" 

1509 for line in fh: 

1510 m = re.match(r"(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) Job submit try \d+/\d+ failed", line) 

1511 if m: 

1512 last_submit_failed = m.group(1) 

1513 if last_submit_failed: 

1514 message = f"Warn: Job submission issues (last: {last_submit_failed})" 

1515 except (OSError, PermissionError): 

1516 message = f"Warn: Could not read dagman output file from {wms_path}." 

1517 return message
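# A hedged sketch (not part of the original module) of surfacing DAGMan submission
# warnings after a run; the submit directory path is hypothetical.
def _example_report_dagman_warnings(wms_path="/path/to/submit/u/user/run"):
    try:
        message = htc_check_dagman_output(wms_path)
    except FileNotFoundError as exc:
        message = f"Warn: {exc}"
    if message:
        print(message)
    return message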