Coverage for python/lsst/ctrl/bps/htcondor/lssthtc.py: 12%

585 statements  

coverage.py v6.5.0, created at 2022-11-11 02:53 -0800

1# This file is part of ctrl_bps_htcondor. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Placeholder HTCondor DAGMan API. 

23 

24There is new work on a Python DAGMan API from HTCondor. However, at this

25time it tries to make things easier by assuming the DAG can be broken

26cleanly into levels with 1-1 or all-to-all relationships between nodes in

27adjacent levels. LSST workflows are more complicated.

28""" 

29 

30__all__ = [ 

31 "DagStatus", 

32 "JobStatus", 

33 "NodeStatus", 

34 "RestrictedDict", 

35 "HTCJob", 

36 "HTCDag", 

37 "htc_backup_files", 

38 "htc_check_dagman_output", 

39 "htc_create_submit_from_cmd", 

40 "htc_create_submit_from_dag", 

41 "htc_create_submit_from_file", 

42 "htc_escape", 

43 "htc_write_attribs", 

44 "htc_write_condor_file", 

45 "htc_version", 

46 "htc_submit_dag", 

47 "condor_history", 

48 "condor_q", 

49 "condor_search", 

50 "condor_status", 

51 "update_job_info", 

52 "MISSING_ID", 

53 "summary_from_dag", 

54 "read_dag_info", 

55 "read_dag_log", 

56 "read_dag_nodes_log", 

57 "read_dag_status", 

58 "read_node_status", 

59 "write_dag_info", 

60 "pegasus_name_to_label", 

61] 

62 

63 

64import itertools 

65import json 

66import logging 

67import os 

68import pprint 

69import re 

70import subprocess 

71from collections import defaultdict 

72from collections.abc import MutableMapping 

73from datetime import datetime, timedelta 

74from enum import IntEnum 

75from pathlib import Path 

76 

77import classad 

78import htcondor 

79import networkx 

80 

81_LOG = logging.getLogger(__name__) 

82 

83MISSING_ID = -99999 

84 

85 

86class DagStatus(IntEnum): 

87 """HTCondor DAGMan's statuses for a DAG.""" 

88 

89 OK = 0 

90 ERROR = 1 # an error condition different than those listed here 

91 FAILED = 2 # one or more nodes in the DAG have failed 

92 ABORTED = 3 # the DAG has been aborted by an ABORT-DAG-ON specification 

93 REMOVED = 4 # the DAG has been removed by condor_rm 

94 CYCLE = 5 # a cycle was found in the DAG 

95 SUSPENDED = 6 # the DAG has been suspended (see section 2.10.8) 

96 

97 

98class JobStatus(IntEnum): 

99 """HTCondor's statuses for jobs.""" 

100 

101 UNEXPANDED = 0 # Unexpanded 

102 IDLE = 1 # Idle 

103 RUNNING = 2 # Running 

104 REMOVED = 3 # Removed 

105 COMPLETED = 4 # Completed 

106 HELD = 5 # Held 

107 TRANSFERRING_OUTPUT = 6 # Transferring_Output 

108 SUSPENDED = 7 # Suspended 

109 

110 

111class NodeStatus(IntEnum): 

112 """HTCondor's statuses for DAGman nodes.""" 

113 

114 # (STATUS_NOT_READY): At least one parent has not yet finished or the node 

115 # is a FINAL node. 

116 NOT_READY = 0 

117 

118 # (STATUS_READY): All parents have finished, but the node is not yet 

119 # running. 

120 READY = 1 

121 

122 # (STATUS_PRERUN): The node’s PRE script is running. 

123 PRERUN = 2 

124 

125 # (STATUS_SUBMITTED): The node’s HTCondor job(s) are in the queue. 

126 # StatusDetails = "not_idle" -> running. 

127 # JobProcsHeld = 1-> hold. 

128 # JobProcsQueued = 1 -> idle. 

129 SUBMITTED = 3 

130 

131 # (STATUS_POSTRUN): The node’s POST script is running. 

132 POSTRUN = 4 

133 

134 # (STATUS_DONE): The node has completed successfully. 

135 DONE = 5 

136 

137 # (STATUS_ERROR): The node has failed. StatusDetails has info (e.g., 

138 # ULOG_JOB_ABORTED for deleted job). 

139 ERROR = 6 

140 

141 

142HTC_QUOTE_KEYS = {"environment"} 

143HTC_VALID_JOB_KEYS = { 

144 "universe", 

145 "executable", 

146 "arguments", 

147 "environment", 

148 "log", 

149 "error", 

150 "output", 

151 "should_transfer_files", 

152 "when_to_transfer_output", 

153 "getenv", 

154 "notification", 

155 "notify_user", 

156 "concurrency_limit", 

157 "transfer_executable", 

158 "transfer_input_files", 

159 "transfer_output_files", 

160 "request_cpus", 

161 "request_memory", 

162 "request_disk", 

163 "priority", 

164 "category", 

165 "requirements", 

166 "on_exit_hold", 

167 "on_exit_hold_reason", 

168 "on_exit_hold_subcode", 

169 "max_retries", 

170 "periodic_release", 

171 "periodic_remove", 

172 "accounting_group", 

173 "accounting_group_user", 

174} 

175HTC_VALID_JOB_DAG_KEYS = {"vars", "pre", "post", "retry", "retry_unless_exit", "abort_dag_on", "abort_exit"} 

176 

177 

178class RestrictedDict(MutableMapping): 

179 """A dictionary that only allows certain keys. 

180 

181 Parameters 

182 ---------- 

183 valid_keys : `Container` 

184 Strings that are valid keys. 

185 init_data : `dict` or `RestrictedDict`, optional 

186 Initial data. 

187 

188 Raises 

189 ------ 

190 KeyError 

191 If invalid key(s) in init_data. 

192 """ 

193 

194 def __init__(self, valid_keys, init_data=()): 

195 self.valid_keys = valid_keys 

196 self.data = {} 

197 self.update(init_data) 

198 

199 def __getitem__(self, key): 

200 """Returns value for given key if exists. 

201 

202 Parameters 

203 ---------- 

204 key : `str` 

205 Identifier for value to return. 

206 

207 Returns 

208 ------- 

209 value : `Any` 

210 Value associated with given key. 

211 

212 Raises 

213 ------ 

214 KeyError 

215 If key doesn't exist. 

216 """ 

217 return self.data[key] 

218 

219 def __delitem__(self, key): 

220 """Deletes value for given key if exists. 

221 

222 Parameters 

223 ---------- 

224 key : `str` 

225 Identifier for value to delete. 

226 

227 Raises 

228 ------ 

229 KeyError 

230 If key doesn't exist. 

231 """ 

232 del self.data[key] 

233 

234 def __setitem__(self, key, value): 

235 """Stores key,value in internal dict only if key is valid 

236 

237 Parameters 

238 ---------- 

239 key : `str` 

240 Identifier to associate with given value. 

241 value : `Any` 

242 Value to store. 

243 

244 Raises 

245 ------ 

246 KeyError 

247 If key is invalid. 

248 """ 

249 if key not in self.valid_keys: 

250 raise KeyError(f"Invalid key {key}") 

251 self.data[key] = value 

252 

253 def __iter__(self): 

254 return self.data.__iter__() 

255 

256 def __len__(self): 

257 return len(self.data) 

258 

259 def __str__(self): 

260 return str(self.data) 

261 
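# Hedged usage sketch (not part of the original module; key names below are
# illustrative): a RestrictedDict only accepts keys from the whitelist it was
# constructed with and raises KeyError for anything else.
def _example_restricted_dict():
    cmds = RestrictedDict(HTC_VALID_JOB_KEYS, {"universe": "vanilla"})
    cmds["executable"] = "/bin/true"  # accepted: key is in HTC_VALID_JOB_KEYS
    try:
        cmds["walltime"] = 3600  # rejected: not a valid submit-file key here
    except KeyError as exc:
        print(exc)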

262 

263def htc_backup_files(wms_path, subdir=None, limit=100): 

264 """Backup select HTCondor files in the submit directory. 

265 

266 Files will be saved in separate subdirectories which will be created in 

267 the submit directory where the files are located. These subdirectories 

268 will be consecutive, zero-padded integers. Their values will correspond to 

269 the number of HTCondor rescue DAGs in the submit directory. 

270 

271 Hence, with the default settings, copies after the initial failed run will 

272 be placed in '001' subdirectory, '002' after the first restart, and so on 

273 until the limit of backups is reached. If there's no rescue DAG yet, files 

274 will be copied to '000' subdirectory. 

275 

276 Parameters 

277 ---------- 

278 wms_path : `str` or `pathlib.Path` 

279 Path to the submit directory either absolute or relative. 

280 subdir : `str` or `pathlib.Path`, optional 

281 A path, relative to the submit directory, where all subdirectories with 

282 backup files will be kept. Defaults to None which means that the backup 

283 subdirectories will be placed directly in the submit directory. 

284 limit : `int`, optional 

285 Maximum number of backups. If the number of backups reaches the limit, 

286 the last backup files will be overwritten. The default value is 100 

287 to match the default value of HTCondor's DAGMAN_MAX_RESCUE_NUM in 

288 version 8.8+. 

289 

290 Raises 

291 ------ 

292 FileNotFoundError 

293 If the submit directory or the file that needs to be backed up does not 

294 exist. 

295 OSError 

296 If the submit directory cannot be accessed or backing up a file failed 

297 either due to permission or filesystem related issues. 

298 

299 Notes 

300 ----- 

301 This is not a generic function for making backups. It is intended to be 

302 used once, just before a restart, to make snapshots of files which will be 

303 overwritten by HTCondor during the next run. 

304 """ 

305 width = len(str(limit)) 

306 

307 path = Path(wms_path).resolve() 

308 if not path.is_dir(): 

309 raise FileNotFoundError(f"Directory {path} not found") 

310 

311 # Initialize the backup counter. 

312 rescue_dags = list(Path(wms_path).glob("*.rescue*")) 

313 counter = min(len(rescue_dags), limit) 

314 

315 # Create the backup directory and move select files there. 

316 dest = Path(wms_path) 

317 if subdir: 

318 # PurePath.is_relative_to() is not available before Python 3.9. Hence 

319 # we need to check if 'subdir' is in the submit directory in some other 

320 # way if it is an absolute path. 

321 subdir = Path(subdir) 

322 if subdir.is_absolute(): 

323 if dest not in subdir.parents: 

324 _LOG.warning( 

325 "Invalid backup location: '%s' not in the submit directory, will use '%s' instead.", 

326 subdir, 

327 wms_path, 

328 ) 

329 else: 

330 dest /= subdir 

331 else: 

332 dest /= subdir 

333 dest /= f"{counter:0{width}}" 

334 try: 

335 dest.mkdir(parents=True, exist_ok=False if counter < limit else True) 

336 except FileExistsError: 

337 _LOG.warning("Refusing to do backups: target directory '%s' already exists", dest) 

338 else: 

339 for patt in ["*.info.*", "*.dag.metrics", "*.dag.nodes.log", "*.node_status"]: 

340 for source in path.glob(patt): 

341 if source.is_file(): 

342 target = dest / source.relative_to(path) 

343 try: 

344 source.rename(target) 

345 except OSError as exc: 

346 raise type(exc)(f"Backing up '{source}' failed: {exc.strerror}") from None 

347 else: 

348 raise FileNotFoundError(f"Backing up '{source}' failed: not a file") 

349 
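# Hedged usage sketch (hypothetical submit directory): snapshot the DAGMan
# bookkeeping files into "backups/NNN" just before restarting a failed run.
def _example_backup_before_restart(wms_path="submit/u/user/pipeline_run"):
    htc_backup_files(wms_path, subdir="backups")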

350 

351def htc_escape(value): 

352 """Escape characters in given value based upon HTCondor syntax. 

353 

354 Parameters 

355 ---------- 

356 value : `Any` 

357 Value that needs to have characters escaped if string. 

358 

359 Returns 

360 ------- 

361 new_value : `Any` 

362 Given value with characters escaped appropriate for HTCondor if string. 

363 """ 

364 if isinstance(value, str): 

365 newval = value.replace('"', '""').replace("'", "''").replace("&quot;", '"') 

366 else: 

367 newval = value 

368 

369 return newval 

370 
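# Hedged examples (illustrative values only): double quotes and single quotes
# are doubled and "&quot;" entities become literal quotes, per HTCondor's
# quoting rules; non-strings pass through unchanged.
def _example_htc_escape():
    assert htc_escape('say "hi"') == 'say ""hi""'
    assert htc_escape("it's") == "it''s"
    assert htc_escape(42) == 42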

371 

372def htc_write_attribs(stream, attrs): 

373 """Write job attributes in HTCondor format to writeable stream. 

374 

375 Parameters 

376 ---------- 

377 stream : `TextIOBase` 

378 Output text stream (typically an open file) 

379 attrs : `dict` 

380 HTCondor job attributes (dictionary of attribute key, value) 

381 """ 

382 for key, value in attrs.items(): 

383 # Make sure strings are syntactically correct for HTCondor. 

384 if isinstance(value, str): 

385 pval = f'"{htc_escape(value)}"' 

386 else: 

387 pval = value 

388 

389 print(f"+{key} = {pval}", file=stream) 

390 
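# Hedged sketch of the output format (attribute names are hypothetical): each
# attribute becomes a "+Name = value" line with string values quoted/escaped.
def _example_htc_write_attribs():
    import io

    stream = io.StringIO()
    htc_write_attribs(stream, {"bps_run": "my_run", "bps_job_count": 3})
    print(stream.getvalue())
    # +bps_run = "my_run"
    # +bps_job_count = 3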

391 

392def htc_write_condor_file(filename, job_name, job, job_attrs): 

393 """Main function to write an HTCondor submit file. 

394 

395 Parameters 

396 ---------- 

397 filename : `str` 

398 Filename for the HTCondor submit file 

399 job_name : `str` 

400 Job name to use in submit file 

401 job : `RestrictedDict` 

402 Submit script information. 

403 job_attrs : `dict` 

404 Job attributes. 

405 """ 

406 os.makedirs(os.path.dirname(filename), exist_ok=True) 

407 with open(filename, "w") as fh: 

408 for key, value in job.items(): 

409 if value is not None: 

410 if key in HTC_QUOTE_KEYS: 

411 print(f'{key}="{htc_escape(value)}"', file=fh) 

412 else: 

413 print(f"{key}={value}", file=fh) 

414 for key in ["output", "error", "log"]: 

415 if key not in job: 

416 filename = f"{job_name}.$(Cluster).{key[:3]}" 

417 print(f"{key}={filename}", file=fh) 

418 

419 if job_attrs is not None: 

420 htc_write_attribs(fh, job_attrs) 

421 print("queue", file=fh) 

422 

423 

424def htc_version(): 

425 """Return the version given by the HTCondor API. 

426 

427 Returns 

428 ------- 

429 version : `str` 

430 HTCondor version as easily comparable string. 

431 

432 Raises 

433 ------ 

434 RuntimeError 

435 Raised if fail to parse htcondor API string. 

436 """ 

437 # Example string returned by htcondor.version: 

438 # $CondorVersion: 8.8.6 Nov 13 2019 BuildID: 489199 PackageID: 8.8.6-1 $ 

439 version_info = re.match(r"\$CondorVersion: (\d+).(\d+).(\d+)", htcondor.version()) 

440 if version_info is None: 

441 raise RuntimeError("Problems parsing condor version") 

442 return f"{int(version_info.group(1))}.{int(version_info.group(2))}.{int(version_info.group(3))}" 

443 

444 

445def htc_submit_dag(sub): 

446 """Submit job for execution. 

447 

448 Parameters 

449 ---------- 

450 sub : `htcondor.Submit` 

451 An object representing a job submit description. 

452 

453 Returns 

454 ------- 

455 schedd_job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]] 

456 Information about jobs satisfying the search criteria where for each 

457 Scheduler, local HTCondor job ids are mapped to their respective 

458 classads. 

459 """ 

460 coll = htcondor.Collector() 

461 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

462 schedd = htcondor.Schedd(schedd_ad) 

463 

464 jobs_ads = [] 

465 with schedd.transaction() as txn: 

466 sub.queue(txn, ad_results=jobs_ads) 

467 

468 # Submit.queue() above will raise RuntimeError if submission fails, so 

469 # 'jobs_ads' should contain the ad at this point. 

470 dag_ad = jobs_ads[0] 

471 

472 # Sadly, the ClassAd from Submit.queue() (see above) does not have 

473 # 'GlobalJobId' so we need to run a regular query to get it anyway. 

474 schedd_name = schedd_ad["Name"] 

475 schedd_dag_info = condor_q( 

476 constraint=f"ClusterId == {dag_ad['ClusterId']}", schedds={schedd_name: schedd} 

477 ) 

478 return schedd_dag_info 

479 
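# Hedged end-to-end sketch (requires a running local HTCondor schedd and an
# existing DAG description; paths are hypothetical): build the submit
# description, queue it, and persist the returned job info next to the DAG
# using write_dag_info(), which is defined further below in this module.
def _example_submit_workflow(wms_path="submit/my_run", dag_name="workflow"):
    dag_filename = f"{wms_path}/{dag_name}.dag"
    sub = htc_create_submit_from_dag(dag_filename, {})
    schedd_dag_info = htc_submit_dag(sub)
    write_dag_info(f"{wms_path}/{dag_name}.info.json", schedd_dag_info)
    return schedd_dag_info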

480 

481def htc_create_submit_from_dag(dag_filename, submit_options=None): 

482 """Create a DAGMan job submit description. 

483 

484 Parameters 

485 ---------- 

486 dag_filename : `str` 

487 Name of file containing HTCondor DAG commands. 

488 submit_options : `dict` [`str`, Any], optional 

489 Contains extra options for command line (Value of None means flag). 

490 

491 Returns 

492 ------- 

493 sub : `htcondor.Submit` 

494 An object representing a job submit description. 

495 

496 Notes 

497 ----- 

498 Use with HTCondor versions which support htcondor.Submit.from_dag(), 

499 i.e., 8.9.3 or newer. 

500 """ 

501 return htcondor.Submit.from_dag(dag_filename, submit_options) 

502 

503 

504def htc_create_submit_from_cmd(dag_filename, submit_options=None): 

505 """Create a DAGMan job submit description. 

506 

507 Create a DAGMan job submit description by calling ``condor_submit_dag`` 

508 on given DAG description file. 

509 

510 Parameters 

511 ---------- 

512 dag_filename : `str` 

513 Name of file containing HTCondor DAG commands. 

514 submit_options : `dict` [`str`, Any], optional 

515 Contains extra options for command line (Value of None means flag). 

516 

517 Returns 

518 ------- 

519 sub : `htcondor.Submit` 

520 An object representing a job submit description. 

521 

522 Notes 

523 ----- 

524 Use with HTCondor versions which do not support htcondor.Submit.from_dag(), 

525 i.e., older than 8.9.3. 

526 """ 

527 # Run command line condor_submit_dag command. 

528 cmd = "condor_submit_dag -f -no_submit -notification never -autorescue 1 -UseDagDir -no_recurse " 

529 

530 if submit_options is not None: 

531 for opt, val in submit_options.items(): 

532 cmd += f" -{opt} {val or ''}" 

533 cmd += f" {dag_filename}" 

534 

535 process = subprocess.Popen( 

536 cmd.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding="utf-8" 

537 ) 

538 process.wait() 

539 

540 if process.returncode != 0: 

541 print(f"Exit code: {process.returncode}") 

542 print(process.communicate()[0]) 

543 raise RuntimeError("Problems running condor_submit_dag") 

544 

545 return htc_create_submit_from_file(f"{dag_filename}.condor.sub") 

546 
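# Hedged dispatch sketch (not the plugin's actual selection code): pick the
# DAG submit helper based on the HTCondor version, since Submit.from_dag()
# only exists in 8.9.3 and newer. The DAG filename is hypothetical.
def _example_create_submit(dag_filename="workflow.dag"):
    version_tuple = tuple(int(part) for part in htc_version().split("."))
    if version_tuple >= (8, 9, 3):
        return htc_create_submit_from_dag(dag_filename, {})
    return htc_create_submit_from_cmd(dag_filename, {})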

547 

548def htc_create_submit_from_file(submit_file): 

549 """Parse a submission file. 

550 

551 Parameters 

552 ---------- 

553 submit_file : `str` 

554 Name of the HTCondor submit file. 

555 

556 Returns 

557 ------- 

558 sub : `htcondor.Submit` 

559 An object representing a job submit description. 

560 """ 

561 descriptors = {} 

562 with open(submit_file, "r") as fh: 

563 for line in fh: 

564 line = line.strip() 

565 if not line.startswith("#") and not line == "queue": 

566 (key, val) = re.split(r"\s*=\s*", line, 1) 

567 descriptors[key] = val 

568 

569 # Avoid UserWarning: the line 'copy_to_spool = False' was 

570 # unused by Submit object. Is it a typo? 

571 try: 

572 del descriptors["copy_to_spool"] 

573 except KeyError: 

574 pass 

575 

576 return htcondor.Submit(descriptors) 

577 

578 

579def _htc_write_job_commands(stream, name, jobs): 

580 """Output the DAGMan job lines for single job in DAG. 

581 

582 Parameters 

583 ---------- 

584 stream : `TextIOBase` 

585 Writeable text stream (typically an opened file). 

586 name : `str` 

587 Job name. 

588 jobs : `RestrictedDict` 

589 DAG job keys and values. 

590 """ 

591 if "pre" in jobs: 

592 print( 

593 f"SCRIPT {jobs['pre'].get('defer', '')} PRE {name}" 

594 f"{jobs['pre']['executable']} {jobs['pre'].get('arguments', '')}", 

595 file=stream, 

596 ) 

597 

598 if "post" in jobs: 

599 print( 

600 f"SCRIPT {jobs['post'].get('defer', '')} PRE {name}" 

601 f"{jobs['post']['executable']} {jobs['post'].get('arguments', '')}", 

602 file=stream, 

603 ) 

604 

605 if "vars" in jobs: 

606 for key, value in jobs["vars"].items(): 

607 print(f'VARS {name} {key}="{htc_escape(value)}"', file=stream) 

608 

609 if "pre_skip" in jobs: 

610 print(f"PRE_SKIP {name} {jobs['pre_skip']}", file=stream) 

611 

612 if "retry" in jobs and jobs["retry"]: 

613 print(f"RETRY {name} {jobs['retry']} ", end="", file=stream) 

614 if "retry_unless_exit" in jobs: 

615 print(f"UNLESS-EXIT {jobs['retry_unless_exit']}", end="", file=stream) 

616 print("\n", file=stream) 

617 

618 if "abort_dag_on" in jobs and jobs["abort_dag_on"]: 

619 print( 

620 f"ABORT-DAG-ON {name} {jobs['abort_dag_on']['node_exit']}" 

621 f" RETURN {jobs['abort_dag_on']['abort_exit']}", 

622 file=stream, 

623 ) 

624 

625 

626class HTCJob: 

627 """HTCondor job for use in building DAG. 

628 

629 Parameters 

630 ---------- 

631 name : `str` 

632 Name of the job 

633 label : `str` 

634 Label that can be used for grouping or lookup. 

635 initcmds : `RestrictedDict` 

636 Initial job commands for submit file. 

637 initdagcmds : `RestrictedDict` 

638 Initial commands for job inside DAG. 

639 initattrs : `dict` 

640 Initial dictionary of job attributes. 

641 """ 

642 

643 def __init__(self, name, label=None, initcmds=(), initdagcmds=(), initattrs=None): 

644 self.name = name 

645 self.label = label 

646 self.cmds = RestrictedDict(HTC_VALID_JOB_KEYS, initcmds) 

647 self.dagcmds = RestrictedDict(HTC_VALID_JOB_DAG_KEYS, initdagcmds) 

648 self.attrs = initattrs 

649 self.subfile = None 

650 

651 def __str__(self): 

652 return self.name 

653 

654 def add_job_cmds(self, new_commands): 

655 """Add commands to Job (overwrite existing). 

656 

657 Parameters 

658 ---------- 

659 new_commands : `dict` 

660 Submit file commands to be added to Job. 

661 """ 

662 self.cmds.update(new_commands) 

663 

664 def add_dag_cmds(self, new_commands): 

665 """Add DAG commands to Job (overwrite existing). 

666 

667 Parameters 

668 ---------- 

669 new_commands : `dict` 

670 DAG file commands to be added to Job 

671 """ 

672 self.dagcmds.update(new_commands) 

673 

674 def add_job_attrs(self, new_attrs): 

675 """Add attributes to Job (overwrite existing). 

676 

677 Parameters 

678 ---------- 

679 new_attrs : `dict` 

680 Attributes to be added to Job 

681 """ 

682 if self.attrs is None: 

683 self.attrs = {} 

684 if new_attrs: 

685 self.attrs.update(new_attrs) 

686 

687 def write_submit_file(self, submit_path, job_subdir=""): 

688 """Write job description to submit file. 

689 

690 Parameters 

691 ---------- 

692 submit_path : `str` 

693 Prefix path for the submit file. 

694 job_subdir : `str`, optional 

695 Template for job subdir. 

696 """ 

697 if not self.subfile: 

698 self.subfile = f"{self.name}.sub" 

699 job_subdir = job_subdir.format(self=self) 

700 if job_subdir: 

701 self.subfile = os.path.join(job_subdir, self.subfile) 

702 htc_write_condor_file(os.path.join(submit_path, self.subfile), self.name, self.cmds, self.attrs) 

703 

704 def write_dag_commands(self, stream): 

705 """Write DAG commands for single job to output stream. 

706 

707 Parameters 

708 ---------- 

709 stream : `IO` or `str` 

710 Output Stream 

711 """ 

712 print(f"JOB {self.name} {self.subfile}", file=stream) 

713 _htc_write_job_commands(stream, self.name, self.dagcmds) 

714 

715 def dump(self, fh): 

716 """Dump job information to output stream. 

717 

718 Parameters 

719 ---------- 

720 fh : `TextIOBase` 

721 Output stream 

722 """ 

723 printer = pprint.PrettyPrinter(indent=4, stream=fh) 

724 printer.pprint(self.name) 

725 printer.pprint(self.cmds) 

726 printer.pprint(self.attrs) 

727 

728 

729class HTCDag(networkx.DiGraph): 

730 """HTCondor DAG. 

731 

732 Parameters 

733 ---------- 

734 data : networkx.DiGraph.data 

735 Initial graph. 

736 name : `str` 

737 Name for DAG. 

738 """ 

739 

740 def __init__(self, data=None, name=""): 

741 super().__init__(data=data, name=name) 

742 

743 self.graph["attr"] = {} 

744 self.graph["run_id"] = None 

745 self.graph["submit_path"] = None 

746 self.graph["final_job"] = None 

747 

748 def __str__(self): 

749 """Represent basic DAG info as string. 

750 

751 Returns 

752 ------- 

753 info : `str` 

754 String containing basic DAG info. 

755 """ 

756 return f"{self.graph['name']} {len(self)}" 

757 

758 def add_attribs(self, attribs=None): 

759 """Add attributes to the DAG. 

760 

761 Parameters 

762 ---------- 

763 attribs : `dict` 

764 DAG attributes 

765 """ 

766 if attribs is not None: 

767 self.graph["attr"].update(attribs) 

768 

769 def add_job(self, job, parent_names=None, child_names=None): 

770 """Add an HTCJob to the HTCDag. 

771 

772 Parameters 

773 ---------- 

774 job : `HTCJob` 

775 HTCJob to add to the HTCDag 

776 parent_names : `Iterable` [`str`], optional 

777 Names of parent jobs 

778 child_names : `Iterable` [`str`], optional 

779 Names of child jobs 

780 """ 

781 assert isinstance(job, HTCJob) 

782 

783 # Add dag level attributes to each job 

784 job.add_job_attrs(self.graph["attr"]) 

785 

786 self.add_node(job.name, data=job) 

787 

788 if parent_names is not None: 

789 self.add_job_relationships(parent_names, [job.name]) 

790 

791 if child_names is not None: 

792 self.add_job_relationships([job.name], child_names) 

793 

794 def add_job_relationships(self, parents, children): 

795 """Add DAG edge between parents and children jobs. 

796 

797 Parameters 

798 ---------- 

799 parents : `list` [`str`] 

800 Contains parent job name(s). 

801 children : `list` [`str`] 

802 Contains children job name(s). 

803 """ 

804 self.add_edges_from(itertools.product(parents, children)) 

805 

806 def add_final_job(self, job): 

807 """Add an HTCJob for the FINAL job in HTCDag. 

808 

809 Parameters 

810 ---------- 

811 job : `HTCJob` 

812 HTCJob to add to the HTCDag as a FINAL job. 

813 """ 

814 # Add dag level attributes to each job 

815 job.add_job_attrs(self.graph["attr"]) 

816 

817 self.graph["final_job"] = job 

818 

819 def del_job(self, job_name): 

820 """Delete the job from the DAG. 

821 

822 Parameters 

823 ---------- 

824 job_name : `str` 

825 Name of job in DAG to delete 

826 """ 

827 # Reconnect edges around node to delete 

828 parents = self.predecessors(job_name) 

829 children = self.successors(job_name) 

830 self.add_edges_from(itertools.product(parents, children)) 

831 

832 # Delete job node (which deletes its edges). 

833 self.remove_node(job_name) 

834 

835 def write(self, submit_path, job_subdir=""): 

836 """Write DAG to a file. 

837 

838 Parameters 

839 ---------- 

840 submit_path : `str` 

841 Prefix path for dag filename to be combined with DAG name. 

842 job_subdir : `str`, optional 

843 Template for job subdir. 

844 """ 

845 self.graph["submit_path"] = submit_path 

846 self.graph["dag_filename"] = os.path.join(submit_path, f"{self.graph['name']}.dag") 

847 os.makedirs(submit_path, exist_ok=True) 

848 with open(self.graph["dag_filename"], "w") as fh: 

849 for _, nodeval in self.nodes().items(): 

850 job = nodeval["data"] 

851 job.write_submit_file(submit_path, job_subdir) 

852 job.write_dag_commands(fh) 

853 for edge in self.edges(): 

854 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh) 

855 print(f"DOT {self.name}.dot", file=fh) 

856 print(f"NODE_STATUS_FILE {self.name}.node_status", file=fh) 

857 

858 # Add bps attributes to dag submission 

859 for key, value in self.graph["attr"].items(): 

860 print(f'SET_JOB_ATTR {key} = "{htc_escape(value)}"', file=fh) 

861 

862 if self.graph["final_job"]: 

863 job = self.graph["final_job"] 

864 job.write_submit_file(submit_path, job_subdir) 

865 print(f"FINAL {job.name} {job.subfile}", file=fh) 

866 if "pre" in job.dagcmds: 

867 print(f"SCRIPT PRE {job.name} {job.dagcmds['pre']}", file=fh) 

868 if "post" in job.dagcmds: 

869 print(f"SCRIPT POST {job.name} {job.dagcmds['post']}", file=fh) 

870 

871 def dump(self, fh): 

872 """Dump DAG info to output stream. 

873 

874 Parameters 

875 ---------- 

876 fh : `IO` or `str` 

877 Where to dump DAG info as text. 

878 """ 

879 for key, value in self.graph.items(): 

880 print(f"{key}={value}", file=fh) 

881 for name, data in self.nodes().items(): 

882 print(f"{name}:", file=fh) 

883 data["data"].dump(fh) 

884 for edge in self.edges(): 

885 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh) 

886 if self.graph["final_job"]: 

887 print(f'FINAL {self.graph["final_job"].name}:', file=fh) 

888 self.graph["final_job"].dump(fh) 

889 

890 def write_dot(self, filename): 

891 """Write a dot version of the DAG. 

892 

893 Parameters 

894 ---------- 

895 filename : `str` 

896 dot filename 

897 """ 

898 pos = networkx.nx_agraph.graphviz_layout(self) 

899 networkx.draw(self, pos=pos) 

900 networkx.drawing.nx_pydot.write_dot(self, filename) 

901 
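# Hedged sketch (hypothetical job names and paths): build a two-job DAG,
# connect the jobs, and write the DAG plus per-job submit files to disk.
def _example_build_dag(submit_path="submit/example"):
    dag = HTCDag(name="example")
    dag.add_attribs({"bps_run": "example"})

    job_a = HTCJob("jobA", label="init", initcmds={"executable": "/bin/true"})
    job_b = HTCJob("jobB", label="work", initcmds={"executable": "/bin/true"})
    dag.add_job(job_a)
    dag.add_job(job_b, parent_names=["jobA"])

    dag.write(submit_path)  # writes example.dag, jobA.sub, and jobB.sub
    return dag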

902 

903def condor_q(constraint=None, schedds=None): 

904 """Query HTCondor for current jobs. 

905 

906 Parameters 

907 ---------- 

908 constraint : `str`, optional 

909 Constraints to be passed to job query. 

910 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

911 HTCondor schedulers which to query for job information. If None 

912 (default), the query will be run against local scheduler only. 

913 

914 Returns 

915 ------- 

916 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]] 

917 Information about jobs satisfying the search criteria where for each 

918 Scheduler, local HTCondor job ids are mapped to their respective 

919 classads. 

920 """ 

921 if not schedds: 

922 coll = htcondor.Collector() 

923 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

924 schedds = {schedd_ad["Name"]: htcondor.Schedd(schedd_ad)} 

925 

926 queries = [schedd.xquery(requirements=constraint) for schedd in schedds.values()] 

927 

928 job_info = {} 

929 for query in htcondor.poll(queries): 

930 schedd_name = query.tag() 

931 job_info.setdefault(schedd_name, {}) 

932 for job_ad in query.nextAdsNonBlocking(): 

933 del job_ad["Environment"] 

934 del job_ad["Env"] 

935 id_ = f"{int(job_ad['ClusterId'])}.{int(job_ad['ProcId'])}" 

936 job_info[schedd_name][id_] = dict(job_ad) 

937 _LOG.debug("condor_q returned %d jobs", sum(len(val) for val in job_info.values())) 

938 

939 # When returning the results filter out entries for schedulers with no jobs 

940 # matching the search criteria. 

941 return {key: val for key, val in job_info.items() if val} 

942 

943 

944def condor_history(constraint=None, schedds=None): 

945 """Get information about completed jobs from HTCondor history. 

946 

947 Parameters 

948 ---------- 

949 constraint : `str`, optional 

950 Constraints to be passed to job query. 

951 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

952 HTCondor schedulers which to query for job information. If None 

953 (default), the query will be run against the history file of 

954 the local scheduler only. 

955 

956 Returns 

957 ------- 

958 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]] 

959 Information about jobs satisfying the search criteria where for each 

960 Scheduler, local HTCondor job ids are mapped to their respective 

961 classads. 

962 """ 

963 if not schedds: 

964 coll = htcondor.Collector() 

965 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

966 schedds = {schedd_ad["Name"]: htcondor.Schedd(schedd_ad)} 

967 

968 job_info = {} 

969 for schedd_name, schedd in schedds.items(): 

970 job_info[schedd_name] = {} 

971 for job_ad in schedd.history(requirements=constraint, projection=[]): 

972 del job_ad["Environment"] 

973 del job_ad["Env"] 

974 id_ = f"{int(job_ad['ClusterId'])}.{int(job_ad['ProcId'])}" 

975 job_info[schedd_name][id_] = dict(job_ad) 

976 _LOG.debug("condor_history returned %d jobs", sum(len(val) for val in job_info.values())) 

977 

978 # When returning the results filter out entries for schedulers with no jobs 

979 # matching the search criteria. 

980 return {key: val for key, val in job_info.items() if val} 

981 

982 

983def condor_search(constraint=None, hist=None, schedds=None): 

984 """Search for running and finished jobs satisfying given criteria. 

985 

986 Parameters 

987 ---------- 

988 constraint : `str`, optional 

989 Constraints to be passed to job query. 

990 hist : `float`, optional 

991 Limit history search to this many days. 

992 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

993 The list of the HTCondor schedulers which to query for job information. 

994 If None (default), only the local scheduler will be queried. 

995 

996 Returns 

997 ------- 

998 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]] 

999 Information about jobs satisfying the search criteria where for each 

1000 Scheduler, local HTCondor job ids are mapped to their respective 

1001 classads. 

1002 """ 

1003 if not schedds: 

1004 coll = htcondor.Collector() 

1005 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1006 schedds = {schedd_ad["Name"]: htcondor.Schedd(locate_ad=schedd_ad)} 

1007 

1008 job_info = condor_q(constraint=constraint, schedds=schedds) 

1009 if hist is not None: 

1010 epoch = (datetime.now() - timedelta(days=hist)).timestamp() 

1011 constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})" 

1012 hist_info = condor_history(constraint, schedds=schedds) 

1013 update_job_info(job_info, hist_info) 

1014 return job_info 

1015 
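# Hedged query sketch (requires a reachable HTCondor scheduler; the run id and
# the bps_run attribute constraint are illustrative): find queued and recently
# finished jobs belonging to one run.
def _example_condor_search(run_id="u_user_pipeline_20221111T000000Z"):
    constraint = f'bps_run == "{run_id}"'
    return condor_search(constraint=constraint, hist=1)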

1016 

1017def condor_status(constraint=None, coll=None): 

1018 """Get information about HTCondor pool. 

1019 

1020 Parameters 

1021 ---------- 

1022 constraint : `str`, optional 

1023 Constraints to be passed to the query. 

1024 coll : `htcondor.Collector`, optional 

1025 Object representing HTCondor collector daemon. 

1026 

1027 Returns 

1028 ------- 

1029 pool_info : `dict` [`str`, `dict` [`str`, Any]] 

1030 Mapping between HTCondor slot names and slot information (classAds). 

1031 """ 

1032 if coll is None: 

1033 coll = htcondor.Collector() 

1034 try: 

1035 pool_ads = coll.query(constraint=constraint) 

1036 except OSError as ex: 

1037 raise RuntimeError(f"Problem querying the Collector. (Constraint='{constraint}')") from ex 

1038 

1039 pool_info = {} 

1040 for slot in pool_ads: 

1041 pool_info[slot["name"]] = dict(slot) 

1042 _LOG.debug("condor_status returned %d ads", len(pool_info)) 

1043 return pool_info 

1044 

1045 

1046def update_job_info(job_info, other_info): 

1047 """Update results of a job query with results from another query. 

1048 

1049 Parameters 

1050 ---------- 

1051 job_info : `dict` [`str`, `dict` [`str`, Any]] 

1052 Results of the job query that needs to be updated. 

1053 other_info : `dict` [`str`, `dict` [`str`, Any]] 

1054 Results of the other job query. 

1055 

1056 Returns 

1057 ------- 

1058 job_info : `dict` [`str`, `dict` [`str`, Any]] 

1059 The updated results. 

1060 """ 

1061 for schedd_name, others in other_info.items(): 

1062 try: 

1063 jobs = job_info[schedd_name] 

1064 except KeyError: 

1065 job_info[schedd_name] = others 

1066 else: 

1067 for id_, ad in others.items(): 

1068 jobs.setdefault(id_, {}).update(ad) 

1069 return job_info 

1070 
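# Hedged example with made-up scheduler and job ids: entries from the second
# query are merged into the first, per scheduler and per job id.
def _example_update_job_info():
    queue = {"sched1": {"100.0": {"JobStatus": JobStatus.RUNNING}}}
    history = {"sched1": {"99.0": {"JobStatus": JobStatus.COMPLETED}}}
    merged = update_job_info(queue, history)
    # merged == {"sched1": {"100.0": {"JobStatus": JobStatus.RUNNING},
    #            "99.0": {"JobStatus": JobStatus.COMPLETED}}}
    return merged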

1071 

1072def summary_from_dag(dir_name): 

1073 """Build bps_run_summary string from dag file. 

1074 

1075 Parameters 

1076 ---------- 

1077 dir_name : `str` 

1078 Path that includes dag file for a run. 

1079 

1080 Returns 

1081 ------- 

1082 summary : `str` 

1083 Semi-colon separated list of job labels and counts. 

1084 (Same format as saved in dag classad.) 

1085 job_name_to_pipetask : `dict` [`str`, `str`] 

1086 Mapping of job names to job labels 

1087 """ 

1088 dag = next(Path(dir_name).glob("*.dag")) 

1089 

1090 # Later code depends upon insertion order 

1091 counts = defaultdict(int) 

1092 job_name_to_pipetask = {} 

1093 try: 

1094 with open(dag, "r") as fh: 

1095 for line in fh: 

1096 if line.startswith("JOB"): 

1097 m = re.match(r"JOB ([^\s]+) jobs/([^/]+)/", line) 

1098 if m: 

1099 label = m.group(2) 

1100 if label == "init": 

1101 label = "pipetaskInit" 

1102 job_name_to_pipetask[m.group(1)] = label 

1103 counts[label] += 1 

1104 else: # Check if Pegasus submission 

1105 m = re.match(r"JOB ([^\s]+) ([^\s]+)", line) 

1106 if m: 

1107 label = pegasus_name_to_label(m.group(1)) 

1108 job_name_to_pipetask[m.group(1)] = label 

1109 counts[label] += 1 

1110 else: 

1111 _LOG.warning("Parse DAG: unmatched job line: %s", line) 

1112 elif line.startswith("FINAL"): 

1113 m = re.match(r"FINAL ([^\s]+) jobs/([^/]+)/", line) 

1114 if m: 

1115 label = m.group(2) 

1116 job_name_to_pipetask[m.group(1)] = label 

1117 counts[label] += 1 

1118 

1119 except (OSError, PermissionError, StopIteration): 

1120 pass 

1121 

1122 summary = ";".join([f"{name}:{counts[name]}" for name in counts]) 

1123 _LOG.debug("summary_from_dag: %s %s", summary, job_name_to_pipetask) 

1124 return summary, job_name_to_pipetask 

1125 
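# Hedged, self-contained illustration: write a tiny DAG file using the
# "jobs/<label>/" layout that summary_from_dag() expects, then parse it.
# The directory, job names, and labels are made up.
def _example_summary_from_dag(tmp_dir="/tmp/example_dag"):
    os.makedirs(tmp_dir, exist_ok=True)
    with open(os.path.join(tmp_dir, "example.dag"), "w") as fh:
        print("JOB pipetaskInit jobs/init/pipetaskInit.sub", file=fh)
        print("JOB measure_1 jobs/measure/measure_1.sub", file=fh)
        print("JOB measure_2 jobs/measure/measure_2.sub", file=fh)
    summary, job_name_to_pipetask = summary_from_dag(tmp_dir)
    # summary == "pipetaskInit:1;measure:2"
    # job_name_to_pipetask == {"pipetaskInit": "pipetaskInit",
    #                          "measure_1": "measure", "measure_2": "measure"}
    return summary, job_name_to_pipetask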

1126 

1127def pegasus_name_to_label(name): 

1128 """Convert pegasus job name to a label for the report. 

1129 

1130 Parameters 

1131 ---------- 

1132 name : `str` 

1133 Name of job. 

1134 

1135 Returns 

1136 ------- 

1137 label : `str` 

1138 Label for job. 

1139 """ 

1140 label = "UNK" 

1141 if name.startswith("create_dir") or name.startswith("stage_in") or name.startswith("stage_out"): 

1142 label = "pegasus" 

1143 else: 

1144 m = re.match(r"pipetask_(\d+_)?([^_]+)", name) 

1145 if m: 

1146 label = m.group(2) 

1147 if label == "init": 

1148 label = "pipetaskInit" 

1149 

1150 return label 

1151 
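# Hedged examples of the name-to-label mapping (made-up Pegasus-style names):
def _example_pegasus_name_to_label():
    assert pegasus_name_to_label("stage_in_local_0") == "pegasus"
    assert pegasus_name_to_label("pipetask_3_measure") == "measure"
    assert pegasus_name_to_label("pipetask_init") == "pipetaskInit"
    assert pegasus_name_to_label("something_else") == "UNK"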

1152 

1153def read_dag_status(wms_path): 

1154 """Read the node status file for DAG summary information 

1155 

1156 Parameters 

1157 ---------- 

1158 wms_path : `str` 

1159 Path that includes node status file for a run. 

1160 

1161 Returns 

1162 ------- 

1163 dag_ad : `dict` [`str`, Any] 

1164 DAG summary information. 

1165 """ 

1166 dag_ad = {} 

1167 

1168 # While this is probably more up to date than dag classad, only read from 

1169 # file if we need to. 

1170 try: 

1171 try: 

1172 node_stat_file = next(Path(wms_path).glob("*.node_status")) 

1173 _LOG.debug("Reading Node Status File %s", node_stat_file) 

1174 with open(node_stat_file, "r") as infh: 

1175 dag_ad = classad.parseNext(infh) # pylint: disable=E1101 

1176 except StopIteration: 

1177 pass 

1178 

1179 if not dag_ad: 

1180 # Pegasus check here 

1181 try: 

1182 metrics_file = next(Path(wms_path).glob("*.dag.metrics")) 

1183 with open(metrics_file, "r") as infh: 

1184 metrics = json.load(infh) 

1185 dag_ad["NodesTotal"] = metrics.get("jobs", 0) 

1186 dag_ad["NodesFailed"] = metrics.get("jobs_failed", 0) 

1187 dag_ad["NodesDone"] = metrics.get("jobs_succeeded", 0) 

1188 dag_ad["pegasus_version"] = metrics.get("planner_version", "") 

1189 except StopIteration: 

1190 try: 

1191 metrics_file = next(Path(wms_path).glob("*.metrics")) 

1192 with open(metrics_file, "r") as infh: 

1193 metrics = json.load(infh) 

1194 dag_ad["NodesTotal"] = metrics["wf_metrics"]["total_jobs"] 

1195 dag_ad["pegasus_version"] = metrics.get("version", "") 

1196 except StopIteration: 

1197 pass 

1198 except (OSError, PermissionError): 

1199 pass 

1200 

1201 _LOG.debug("read_dag_status: %s", dag_ad) 

1202 return dict(dag_ad) 

1203 

1204 

1205def read_node_status(wms_path): 

1206 """Read entire node status file. 

1207 

1208 Parameters 

1209 ---------- 

1210 wms_path : `str` 

1211 Path that includes node status file for a run. 

1212 

1213 Returns 

1214 ------- 

1215 jobs : `dict` [`str`, Any] 

1216 Information about the jobs in the DAG, keyed by HTCondor job id. 

1217 """ 

1218 # Get jobid info from other places to fill in gaps in info from node_status 

1219 _, job_name_to_pipetask = summary_from_dag(wms_path) 

1220 wms_workflow_id, loginfo = read_dag_log(wms_path) 

1221 loginfo = read_dag_nodes_log(wms_path) 

1222 _LOG.debug("loginfo = %s", loginfo) 

1223 job_name_to_id = {} 

1224 for jid, jinfo in loginfo.items(): 

1225 if "LogNotes" in jinfo: 

1226 m = re.match(r"DAG Node: ([^\s]+)", jinfo["LogNotes"]) 

1227 if m: 

1228 job_name_to_id[m.group(1)] = jid 

1229 jinfo["DAGNodeName"] = m.group(1) 

1230 

1231 try: 

1232 node_status = next(Path(wms_path).glob("*.node_status")) 

1233 except StopIteration: 

1234 return loginfo 

1235 

1236 jobs = {} 

1237 fake_id = -1.0 # For nodes that do not yet have a job id, give fake one 

1238 try: 

1239 with open(node_status, "r") as fh: 

1240 ads = classad.parseAds(fh) 

1241 

1242 for jclassad in ads: 

1243 if jclassad["Type"] == "DagStatus": 

1244 # skip DAG summary 

1245 pass 

1246 elif "Node" not in jclassad: 

1247 if jclassad["Type"] != "StatusEnd": 

1248 _LOG.debug("Key 'Node' not in classad: %s", jclassad) 

1249 break 

1250 else: 

1251 if jclassad["Node"] in job_name_to_pipetask: 

1252 try: 

1253 label = job_name_to_pipetask[jclassad["Node"]] 

1254 except KeyError: 

1255 _LOG.error("%s not in %s", jclassad["Node"], job_name_to_pipetask.keys()) 

1256 raise 

1257 elif "_" in jclassad["Node"]: 

1258 label = jclassad["Node"].split("_")[1] 

1259 else: 

1260 label = jclassad["Node"] 

1261 

1262 # Make job info as if came from condor_q 

1263 if jclassad["Node"] in job_name_to_id: 

1264 job_id = job_name_to_id[jclassad["Node"]] 

1265 else: 

1266 job_id = str(fake_id) 

1267 fake_id -= 1 

1268 

1269 job = dict(jclassad) 

1270 job["ClusterId"] = int(float(job_id)) 

1271 job["DAGManJobID"] = wms_workflow_id 

1272 job["DAGNodeName"] = jclassad["Node"] 

1273 job["bps_job_label"] = label 

1274 

1275 jobs[str(job_id)] = job 

1276 except (OSError, PermissionError): 

1277 pass 

1278 

1279 return jobs 

1280 

1281 

1282def read_dag_log(wms_path): 

1283 """Read job information from the DAGMan log file. 

1284 

1285 Parameters 

1286 ---------- 

1287 wms_path : `str` 

1288 Path containing the DAGMan log file. 

1289 

1290 Returns 

1291 ------- 

1292 wms_workflow_id : `str` 

1293 HTCondor job id (i.e., <ClusterId>.<ProcId>) of the DAGMan job. 

1294 dag_info : `dict` [`str`, `Any`] 

1295 HTCondor job information read from the log file mapped to HTCondor 

1296 job id. 

1297 

1298 Raises 

1299 ------ 

1300 FileNotFoundError 

1301 If cannot find DAGMan log in given wms_path. 

1302 """ 

1303 wms_workflow_id = 0 

1304 dag_info = {} 

1305 

1306 path = Path(wms_path) 

1307 if path.exists(): 

1308 try: 

1309 filename = next(path.glob("*.dag.dagman.log")) 

1310 except StopIteration as exc: 

1311 raise FileNotFoundError(f"DAGMan log not found in {wms_path}") from exc 

1312 _LOG.debug("dag node log filename: %s", filename) 

1313 

1314 info = {} 

1315 job_event_log = htcondor.JobEventLog(str(filename)) 

1316 for event in job_event_log.events(stop_after=0): 

1317 id_ = f"{event['Cluster']}.{event['Proc']}" 

1318 if id_ not in info: 

1319 info[id_] = {} 

1320 wms_workflow_id = id_ # taking last job id in case of restarts 

1321 info[id_].update(event) 

1322 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"] 

1323 

1324 # only save latest DAG job 

1325 dag_info = {wms_workflow_id: info[wms_workflow_id]} 

1326 for job in dag_info.values(): 

1327 _tweak_log_info(filename, job) 

1328 

1329 return wms_workflow_id, dag_info 

1330 

1331 

1332def read_dag_nodes_log(wms_path): 

1333 """Read job information from the DAGMan nodes log file. 

1334 

1335 Parameters 

1336 ---------- 

1337 wms_path : `str` 

1338 Path containing the DAGMan nodes log file. 

1339 

1340 Returns 

1341 ------- 

1342 info : `dict` [`str`, Any] 

1343 HTCondor job information read from the log file mapped to HTCondor 

1344 job id. 

1345 

1346 Raises 

1347 ------ 

1348 FileNotFoundError 

1349 If cannot find DAGMan node log in given wms_path. 

1350 """ 

1351 try: 

1352 filename = next(Path(wms_path).glob("*.dag.nodes.log")) 

1353 except StopIteration as exc: 

1354 raise FileNotFoundError(f"DAGMan node log not found in {wms_path}") from exc 

1355 _LOG.debug("dag node log filename: %s", filename) 

1356 

1357 info = {} 

1358 job_event_log = htcondor.JobEventLog(str(filename)) 

1359 for event in job_event_log.events(stop_after=0): 

1360 id_ = f"{event['Cluster']}.{event['Proc']}" 

1361 if id_ not in info: 

1362 info[id_] = {} 

1363 info[id_].update(event) 

1364 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"] 

1365 

1366 # Add more condor_q-like info to info parsed from log file. 

1367 for job in info.values(): 

1368 _tweak_log_info(filename, job) 

1369 

1370 return info 

1371 

1372 

1373def read_dag_info(wms_path): 

1374 """Read custom DAGMan job information from the file. 

1375 

1376 Parameters 

1377 ---------- 

1378 wms_path : `str` 

1379 Path containing the file with the DAGMan job info. 

1380 

1381 Returns 

1382 ------- 

1383 dag_info : `dict` [`str`, `dict` [`str`, Any]] 

1384 HTCondor job information. 

1385 

1386 Raises 

1387 ------ 

1388 FileNotFoundError 

1389 If cannot find DAGMan job info file in the given location. 

1390 """ 

1391 try: 

1392 filename = next(Path(wms_path).glob("*.info.json")) 

1393 except StopIteration as exc: 

1394 raise FileNotFoundError(f"File with DAGMan job information not found in {wms_path}") from exc 

1395 _LOG.debug("DAGMan job information filename: %s", filename) 

1396 try: 

1397 with open(filename) as fh: 

1398 dag_info = json.load(fh) 

1399 except (IOError, PermissionError) as exc: 

1400 _LOG.debug("Retrieving DAGMan job information failed: %s", exc) 

1401 dag_info = {} 

1402 return dag_info 

1403 

1404 

1405def write_dag_info(filename, dag_info): 

1406 """Writes custom job information about DAGMan job. 

1407 

1408 Parameters 

1409 ---------- 

1410 filename : `str` 

1411 Name of the file where the information will be stored. 

1412 dag_info : `dict` [`str`, `dict` [`str`, Any]] 

1413 Information about the DAGMan job. 

1414 """ 

1415 schedd_name = next(iter(dag_info)) 

1416 dag_id = next(iter(dag_info[schedd_name])) 

1417 dag_ad = dag_info[schedd_name][dag_id] 

1418 try: 

1419 with open(filename, "w") as fh: 

1420 info = { 

1421 schedd_name: { 

1422 dag_id: {"ClusterId": dag_ad["ClusterId"], "GlobalJobId": dag_ad["GlobalJobId"]} 

1423 } 

1424 } 

1425 json.dump(info, fh) 

1426 except (KeyError, IOError, PermissionError) as exc: 

1427 _LOG.debug("Persisting DAGMan job information failed: %s", exc) 

1428 

1429 

1430def _tweak_log_info(filename, job): 

1431 """Massage the given job info has same structure as if came from condor_q. 

1432 

1433 Parameters 

1434 ---------- 

1435 filename : `pathlib.Path` 

1436 Name of the DAGMan log. 

1437 job : `dict` [ `str`, Any ] 

1438 Information about a single HTCondor job as read from 

1439 the log. 

1440 """ 

1441 _LOG.debug("_tweak_log_info: %s %s", filename, job) 

1442 try: 

1443 job["ClusterId"] = job["Cluster"] 

1444 job["ProcId"] = job["Proc"] 

1445 job["Iwd"] = str(filename.parent) 

1446 job["Owner"] = filename.owner() 

1447 if job["MyType"] == "ExecuteEvent": 

1448 job["JobStatus"] = JobStatus.RUNNING 

1449 elif job["MyType"] == "JobTerminatedEvent" or job["MyType"] == "PostScriptTerminatedEvent": 

1450 job["JobStatus"] = JobStatus.COMPLETED 

1451 try: 

1452 if not job["TerminatedNormally"]: 

1453 if "ReturnValue" in job: 

1454 job["ExitCode"] = job["ReturnValue"] 

1455 job["ExitBySignal"] = False 

1456 elif "TerminatedBySignal" in job: 

1457 job["ExitBySignal"] = True 

1458 job["ExitSignal"] = job["TerminatedBySignal"] 

1459 else: 

1460 _LOG.warning("Could not determine exit status for completed job: %s", job) 

1461 except KeyError as ex: 

1462 _LOG.error("Could not determine exit status for job (missing %s): %s", str(ex), job) 

1463 elif job["MyType"] == "SubmitEvent": 

1464 job["JobStatus"] = JobStatus.IDLE 

1465 elif job["MyType"] == "JobAbortedEvent": 

1466 job["JobStatus"] = JobStatus.REMOVED 

1467 else: 

1468 _LOG.debug("Unknown log event type: %s", job["MyType"]) 

1469 except KeyError: 

1470 _LOG.error("Missing key in job: %s", job) 

1471 raise 

1472 

1473 

1474def htc_check_dagman_output(wms_path): 

1475 """Check the DAGMan output for error messages. 

1476 

1477 Parameters 

1478 ---------- 

1479 wms_path : `str` 

1480 Directory containing the DAGman output file. 

1481 

1482 Returns 

1483 ------- 

1484 message : `str` 

1485 Message containing error messages from the DAGMan output. Empty 

1486 string if no messages. 

1487 

1488 Raises 

1489 ------ 

1490 FileNotFoundError 

1491 If cannot find DAGMan standard output file in given wms_path. 

1492 """ 

1493 try: 

1494 filename = next(Path(wms_path).glob("*.dag.dagman.out")) 

1495 except StopIteration as exc: 

1496 raise FileNotFoundError(f"DAGMan standard output file not found in {wms_path}") from exc 

1497 _LOG.debug("dag output filename: %s", filename) 

1498 

1499 message = "" 

1500 try: 

1501 with open(filename, "r") as fh: 

1502 last_submit_failed = "" 

1503 for line in fh: 

1504 m = re.match(r"(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) Job submit try \d+/\d+ failed", line) 

1505 if m: 

1506 last_submit_failed = m.group(1) 

1507 if last_submit_failed: 

1508 message = f"Warn: Job submission issues (last: {last_submit_failed})" 

1509 except (IOError, PermissionError): 

1510 message = f"Warn: Could not read dagman output file from {wms_path}." 

1511 return message
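
# Hedged usage sketch (hypothetical path): after a run, surface any job
# submission problems recorded in the DAGMan output file.
def _example_report_submission_issues(wms_path="submit/u/user/pipeline_run"):
    message = htc_check_dagman_output(wms_path)
    if message:
        print(message)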