Coverage for python/lsst/ctrl/bps/htcondor/lssthtc.py: 13%

594 statements  

coverage.py v7.3.1, created at 2023-09-17 08:57 +0000

1# This file is part of ctrl_bps_htcondor. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <https://www.gnu.org/licenses/>. 

27 

28"""Placeholder HTCondor DAGMan API. 

29 

30There is new work on a Python DAGMan API from HTCondor. However, at this

31time it simplifies matters by assuming the DAG breaks cleanly into levels

32with 1-to-1 or all-to-all relationships between nodes in adjacent levels.

33LSST workflows are more complicated.

34""" 

35 

36__all__ = [ 

37 "DagStatus", 

38 "JobStatus", 

39 "NodeStatus", 

40 "RestrictedDict", 

41 "HTCJob", 

42 "HTCDag", 

43 "htc_backup_files", 

44 "htc_check_dagman_output", 

45 "htc_create_submit_from_cmd", 

46 "htc_create_submit_from_dag", 

47 "htc_create_submit_from_file", 

48 "htc_escape", 

49 "htc_write_attribs", 

50 "htc_write_condor_file", 

51 "htc_query_history", 

52 "htc_query_present", 

53 "htc_version", 

54 "htc_submit_dag", 

55 "condor_history", 

56 "condor_q", 

57 "condor_search", 

58 "condor_status", 

59 "update_job_info", 

60 "MISSING_ID", 

61 "summary_from_dag", 

62 "read_dag_info", 

63 "read_dag_log", 

64 "read_dag_nodes_log", 

65 "read_dag_status", 

66 "read_node_status", 

67 "write_dag_info", 

68 "pegasus_name_to_label", 

69] 

70 

71 

72import itertools 

73import json 

74import logging 

75import os 

76import pprint 

77import re 

78import subprocess 

79from collections import defaultdict 

80from collections.abc import MutableMapping 

81from datetime import datetime, timedelta 

82from enum import IntEnum 

83from pathlib import Path 

84 

85import classad 

86import htcondor 

87import networkx 

88from packaging import version 

89 

90_LOG = logging.getLogger(__name__) 

91 

92MISSING_ID = -99999 

93 

94 

95class DagStatus(IntEnum): 

96 """HTCondor DAGMan's statuses for a DAG.""" 

97 

98 OK = 0 

99 ERROR = 1 # an error condition different than those listed here 

100 FAILED = 2 # one or more nodes in the DAG have failed 

101 ABORTED = 3 # the DAG has been aborted by an ABORT-DAG-ON specification 

102 REMOVED = 4 # the DAG has been removed by condor_rm 

103 CYCLE = 5 # a cycle was found in the DAG 

104 SUSPENDED = 6 # the DAG has been suspended (see section 2.10.8) 

105 

106 

107class JobStatus(IntEnum): 

108 """HTCondor's statuses for jobs.""" 

109 

110 UNEXPANDED = 0 # Unexpanded 

111 IDLE = 1 # Idle 

112 RUNNING = 2 # Running 

113 REMOVED = 3 # Removed 

114 COMPLETED = 4 # Completed 

115 HELD = 5 # Held 

116 TRANSFERRING_OUTPUT = 6 # Transferring_Output 

117 SUSPENDED = 7 # Suspended 

118 

119 

120class NodeStatus(IntEnum): 

121 """HTCondor's statuses for DAGman nodes.""" 

122 

123 # (STATUS_NOT_READY): At least one parent has not yet finished or the node 

124 # is a FINAL node. 

125 NOT_READY = 0 

126 

127 # (STATUS_READY): All parents have finished, but the node is not yet 

128 # running. 

129 READY = 1 

130 

131 # (STATUS_PRERUN): The node’s PRE script is running. 

132 PRERUN = 2 

133 

134 # (STATUS_SUBMITTED): The node’s HTCondor job(s) are in the queue. 

135 # StatusDetails = "not_idle" -> running. 

136 # JobProcsHeld = 1-> hold. 

137 # JobProcsQueued = 1 -> idle. 

138 SUBMITTED = 3 

139 

140 # (STATUS_POSTRUN): The node’s POST script is running. 

141 POSTRUN = 4 

142 

143 # (STATUS_DONE): The node has completed successfully. 

144 DONE = 5 

145 

146 # (STATUS_ERROR): The node has failed. StatusDetails has info (e.g., 

147 # ULOG_JOB_ABORTED for deleted job). 

148 ERROR = 6 

149 

150 
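
A short illustration (editor's sketch, not part of the module): the enum values above map directly onto the integer codes found in HTCondor job ads, so an ad obtained elsewhere can be interpreted by constructing the enum from the raw integer. The job ad below is invented for illustration.

# Editor's sketch only; the ad contents are made up.
job_ad = {"ClusterId": 1234, "JobStatus": 2}
status = JobStatus(job_ad["JobStatus"])
assert status is JobStatus.RUNNING
print(f"Job {job_ad['ClusterId']} is {status.name}")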

151HTC_QUOTE_KEYS = {"environment"} 

152HTC_VALID_JOB_KEYS = { 

153 "universe", 

154 "executable", 

155 "arguments", 

156 "environment", 

157 "log", 

158 "error", 

159 "output", 

160 "should_transfer_files", 

161 "when_to_transfer_output", 

162 "getenv", 

163 "notification", 

164 "notify_user", 

165 "concurrency_limit", 

166 "transfer_executable", 

167 "transfer_input_files", 

168 "transfer_output_files", 

169 "request_cpus", 

170 "request_memory", 

171 "request_disk", 

172 "priority", 

173 "category", 

174 "requirements", 

175 "on_exit_hold", 

176 "on_exit_hold_reason", 

177 "on_exit_hold_subcode", 

178 "max_retries", 

179 "periodic_release", 

180 "periodic_remove", 

181 "accounting_group", 

182 "accounting_group_user", 

183} 

184HTC_VALID_JOB_DAG_KEYS = {"vars", "pre", "post", "retry", "retry_unless_exit", "abort_dag_on", "abort_exit"} 

185HTC_VERSION = version.parse(htcondor.__version__) 

186 

187 

188class RestrictedDict(MutableMapping): 

189 """A dictionary that only allows certain keys. 

190 

191 Parameters 

192 ---------- 

193 valid_keys : `Container` 

194 Strings that are valid keys. 

195 init_data : `dict` or `RestrictedDict`, optional 

196 Initial data. 

197 

198 Raises 

199 ------ 

200 KeyError 

201 If invalid key(s) in init_data. 

202 """ 

203 

204 def __init__(self, valid_keys, init_data=()): 

205 self.valid_keys = valid_keys 

206 self.data = {} 

207 self.update(init_data) 

208 

209 def __getitem__(self, key): 

210 """Return value for given key if exists. 

211 

212 Parameters 

213 ---------- 

214 key : `str` 

215 Identifier for value to return. 

216 

217 Returns 

218 ------- 

219 value : `~collections.abc.Any` 

220 Value associated with given key. 

221 

222 Raises 

223 ------ 

224 KeyError 

225 If key doesn't exist. 

226 """ 

227 return self.data[key] 

228 

229 def __delitem__(self, key): 

230 """Delete value for given key if exists. 

231 

232 Parameters 

233 ---------- 

234 key : `str` 

235 Identifier for value to delete. 

236 

237 Raises 

238 ------ 

239 KeyError 

240 If key doesn't exist. 

241 """ 

242 del self.data[key] 

243 

244 def __setitem__(self, key, value): 

245 """Store key,value in internal dict only if key is valid. 

246 

247 Parameters 

248 ---------- 

249 key : `str` 

250 Identifier to associate with given value. 

251 value : `~collections.abc.Any` 

252 Value to store. 

253 

254 Raises 

255 ------ 

256 KeyError 

257 If key is invalid. 

258 """ 

259 if key not in self.valid_keys: 

260 raise KeyError(f"Invalid key {key}") 

261 self.data[key] = value 

262 

263 def __iter__(self): 

264 return self.data.__iter__() 

265 

266 def __len__(self): 

267 return len(self.data) 

268 

269 def __str__(self): 

270 return str(self.data) 

271 

272 
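
A minimal usage sketch (assuming the module has been imported): valid keys behave like a normal dictionary, while keys outside the allowed set raise KeyError.

# Sketch: only keys from the supplied container are accepted.
cmds = RestrictedDict(HTC_VALID_JOB_KEYS, {"universe": "vanilla"})
cmds["request_memory"] = "2048"      # allowed key
try:
    cmds["not_a_submit_key"] = "oops"
except KeyError as exc:
    print(exc)                       # reports the invalid key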

273def htc_backup_files(wms_path, subdir=None, limit=100): 

274 """Backup select HTCondor files in the submit directory. 

275 

276 Files will be saved in separate subdirectories which will be created in 

277 the submit directory where the files are located. These subdirectories 

278 will be consecutive, zero-padded integers. Their values will correspond to 

279 the number of HTCondor rescue DAGs in the submit directory. 

280 

281 Hence, with the default settings, copies after the initial failed run will 

282 be placed in '001' subdirectory, '002' after the first restart, and so on 

283 until the limit of backups is reached. If there's no rescue DAG yet, files 

284 will be copied to '000' subdirectory. 

285 

286 Parameters 

287 ---------- 

288 wms_path : `str` or `pathlib.Path` 

289 Path to the submit directory either absolute or relative. 

290 subdir : `str` or `pathlib.Path`, optional 

291 A path, relative to the submit directory, where all subdirectories with 

292 backup files will be kept. Defaults to None which means that the backup 

293 subdirectories will be placed directly in the submit directory. 

294 limit : `int`, optional 

295 Maximal number of backups. If the number of backups reaches the limit, 

296 the last backup files will be overwritten. The default value is 100 

297 to match the default value of HTCondor's DAGMAN_MAX_RESCUE_NUM in 

298 version 8.8+. 

299 

300 Raises 

301 ------ 

302 FileNotFoundError 

303 If the submit directory or the file that needs to be backed up does not 

304 exist. 

305 OSError 

306 If the submit directory cannot be accessed or backing up a file failed 

307 either due to permission or filesystem related issues. 

308 

309 Notes 

310 ----- 

311 This is not a generic function for making backups. It is intended to be 

312 used once, just before a restart, to make snapshots of files which will be 

313 overwritten by HTCondor during the next run.

314 """ 

315 width = len(str(limit)) 

316 

317 path = Path(wms_path).resolve() 

318 if not path.is_dir(): 

319 raise FileNotFoundError(f"Directory {path} not found") 

320 

321 # Initialize the backup counter. 

322 rescue_dags = list(Path(wms_path).glob("*.rescue*")) 

323 counter = min(len(rescue_dags), limit) 

324 

325 # Create the backup directory and move select files there. 

326 dest = Path(wms_path) 

327 if subdir: 

328 # PurePath.is_relative_to() is not available before Python 3.9. Hence 

329 # we need to check whether 'subdir' is in the submit directory in some other

330 # way if it is an absolute path. 

331 subdir = Path(subdir) 

332 if subdir.is_absolute(): 

333 if dest not in subdir.parents: 

334 _LOG.warning( 

335 "Invalid backup location: '%s' not in the submit directory, will use '%s' instead.", 

336 subdir, 

337 wms_path, 

338 ) 

339 else: 

340 dest /= subdir 

341 else: 

342 dest /= subdir 

343 dest /= f"{counter:0{width}}" 

344 try: 

345 dest.mkdir(parents=True, exist_ok=False if counter < limit else True) 

346 except FileExistsError: 

347 _LOG.warning("Refusing to do backups: target directory '%s' already exists", dest) 

348 else: 

349 for patt in ["*.info.*", "*.dag.metrics", "*.dag.nodes.log", "*.node_status"]: 

350 for source in path.glob(patt): 

351 if source.is_file(): 

352 target = dest / source.relative_to(path) 

353 try: 

354 source.rename(target) 

355 except OSError as exc: 

356 raise type(exc)(f"Backing up '{source}' failed: {exc.strerror}") from None 

357 else: 

358 raise FileNotFoundError(f"Backing up '{source}' failed: not a file") 

359 

360 

361def htc_escape(value): 

362 """Escape characters in given value based upon HTCondor syntax. 

363 

364 Parameters 

365 ---------- 

366 value : `~collections.abc.Any` 

367 Value that needs to have characters escaped if string. 

368 

369 Returns 

370 ------- 

371 new_value : `~collections.abc.Any` 

372 Given value with characters escaped appropriately for HTCondor if it is a string.

373 """ 

374 if isinstance(value, str): 

375 newval = value.replace('"', '""').replace("'", "''").replace("&quot;", '"') 

376 else: 

377 newval = value 

378 

379 return newval 

380 
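
For example (a sketch; only strings are altered, everything else passes through unchanged):

# Double quotes are doubled, single quotes are doubled, "&quot;" becomes '"'.
print(htc_escape('say "hello"'))   # -> say ""hello""
print(htc_escape(42))              # -> 42 (non-strings are returned as-is)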

381 

382def htc_write_attribs(stream, attrs): 

383 """Write job attributes in HTCondor format to writeable stream. 

384 

385 Parameters 

386 ---------- 

387 stream : `~io.TextIOBase` 

388 Output text stream (typically an open file). 

389 attrs : `dict` 

390 HTCondor job attributes (dictionary of attribute key, value). 

391 """ 

392 for key, value in attrs.items(): 

393 # Make sure strings are syntactically correct for HTCondor. 

394 if isinstance(value, str): 

395 pval = f'"{htc_escape(value)}"' 

396 else: 

397 pval = value 

398 

399 print(f"+{key} = {pval}", file=stream) 

400 

401 

402def htc_write_condor_file(filename, job_name, job, job_attrs): 

403 """Write an HTCondor submit file. 

404 

405 Parameters 

406 ---------- 

407 filename : `str` 

408 Filename for the HTCondor submit file. 

409 job_name : `str` 

410 Job name to use in submit file. 

411 job : `RestrictedDict` 

412 Submit script information. 

413 job_attrs : `dict` 

414 Job attributes. 

415 """ 

416 os.makedirs(os.path.dirname(filename), exist_ok=True) 

417 with open(filename, "w") as fh: 

418 for key, value in job.items(): 

419 if value is not None: 

420 if key in HTC_QUOTE_KEYS: 

421 print(f'{key}="{htc_escape(value)}"', file=fh) 

422 else: 

423 print(f"{key}={value}", file=fh) 

424 for key in ["output", "error", "log"]: 

425 if key not in job: 

426 filename = f"{job_name}.$(Cluster).${key[:3]}" 

427 print(f"{key}={filename}", file=fh) 

428 

429 if job_attrs is not None: 

430 htc_write_attribs(fh, job_attrs) 

431 print("queue", file=fh) 

432 
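
A hypothetical sketch of how the two helpers above combine to produce a submit file; the path, commands, and attribute name are illustrative only.

# Editor's sketch: write a minimal submit description to disk.
cmds = RestrictedDict(HTC_VALID_JOB_KEYS)
cmds["universe"] = "vanilla"
cmds["executable"] = "/bin/echo"
cmds["arguments"] = "hello"
htc_write_condor_file("submit/demo.sub", "demo", cmds, {"bps_job_label": "demo"})
# The resulting file contains key=value lines, default output/error/log names,
# a '+bps_job_label = "demo"' attribute line, and a final 'queue' statement.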

433 

434# To avoid doing the version check during every function call, select the

435# appropriate conversion function at import time.

436# 

437# Make sure that *each* version specific variant of the conversion function(s) 

438# has the same signature after applying any changes! 

439if HTC_VERSION < version.parse("8.9.8"):  # coverage: 439 ↛ 441, the condition on line 439 was never true

440 

441 def htc_tune_schedd_args(**kwargs): 

442 """Ensure that arguments for Schedd are version appropriate. 

443 

444 The old arguments: 'requirements' and 'attr_list' of 

445 'Schedd.history()', 'Schedd.query()', and 'Schedd.xquery()' were 

446 deprecated in favor of 'constraint' and 'projection', respectively, 

447 starting from version 8.9.8. The function will convert "new" keyword 

448 arguments to "old" ones. 

449 

450 Parameters 

451 ---------- 

452 **kwargs 

453 Any keyword arguments that Schedd.history(), Schedd.query(), and 

454 Schedd.xquery() accepts. 

455 

456 Returns 

457 ------- 

458 kwargs : `dict` [`str`, Any] 

459 Keywords arguments that are guaranteed to work with the Python 

460 HTCondor API. 

461 

462 Notes 

463 ----- 

464 Function doesn't validate provided keyword arguments beyond converting 

465 selected arguments to their version specific form. For example, 

466 it won't remove keywords that are not supported by the methods 

467 mentioned earlier. 

468 """ 

469 translation_table = { 

470 "constraint": "requirements", 

471 "projection": "attr_list", 

472 } 

473 for new, old in translation_table.items(): 

474 try: 

475 kwargs[old] = kwargs.pop(new) 

476 except KeyError: 

477 pass 

478 return kwargs 

479 

480else: 

481 

482 def htc_tune_schedd_args(**kwargs): 

483 """Ensure that arguments for Schedd are version appropriate. 

484 

485 This is the fallback function if no version specific alteration are 

486 necessary. Effectively, a no-op. 

487 

488 Parameters 

489 ---------- 

490 **kwargs 

491 Any keyword arguments that Schedd.history(), Schedd.query(), and 

492 Schedd.xquery() accepts. 

493 

494 Returns 

495 ------- 

496 kwargs : `dict` [`str`, Any] 

497 Keywords arguments that were passed to the function. 

498 """ 

499 return kwargs 

500 
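
To make the shim concrete, a sketch of the translation performed when running against HTCondor older than 8.9.8 (on newer versions the call returns the keywords unchanged):

kwargs = htc_tune_schedd_args(constraint="JobStatus == 2", projection=["ClusterId"])
# HTCondor < 8.9.8:  {"requirements": "JobStatus == 2", "attr_list": ["ClusterId"]}
# HTCondor >= 8.9.8: {"constraint": "JobStatus == 2", "projection": ["ClusterId"]}
print(kwargs)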

501 

502def htc_query_history(schedds, **kwargs): 

503 """Fetch history records from the condor_schedd daemon. 

504 

505 Parameters 

506 ---------- 

507 schedds : `dict` [`str`, `htcondor.Schedd`]

508 HTCondor schedulers which to query for job information. 

509 **kwargs 

510 Any keyword arguments that Schedd.history() accepts. 

511 

512 Yields 

513 ------ 

514 schedd_name : `str` 

515 Name of the HTCondor scheduler managing the job queue. 

516 job_ad : `dict` [`str`, Any]

517 HTCondor classad describing a job. 

518 """ 

519 # If not set, provide defaults for positional arguments. 

520 kwargs.setdefault("constraint", None) 

521 kwargs.setdefault("projection", []) 

522 kwargs = htc_tune_schedd_args(**kwargs) 

523 for schedd_name, schedd in schedds.items(): 

524 for job_ad in schedd.history(**kwargs): 

525 yield schedd_name, job_ad 

526 

527 

528def htc_query_present(schedds, **kwargs): 

529 """Query the condor_schedd daemon for job ads. 

530 

531 Parameters 

532 ---------- 

533 schedds : `dict` [`str`, `htcondor.Schedd`]

534 HTCondor schedulers which to query for job information. 

535 **kwargs 

536 Any keyword arguments that Schedd.xquery() accepts. 

537 

538 Yields 

539 ------ 

540 schedd_name : `str` 

541 Name of the HTCondor scheduler managing the job queue. 

542 job_ad : `dict` [`str`, Any]

543 HTCondor classad describing a job. 

544 """ 

545 kwargs = htc_tune_schedd_args(**kwargs) 

546 queries = [schedd.xquery(**kwargs) for schedd in schedds.values()] 

547 for query in htcondor.poll(queries): 

548 schedd_name = query.tag() 

549 for job_ad in query.nextAdsNonBlocking(): 

550 yield schedd_name, job_ad 

551 

552 

553def htc_version(): 

554 """Return the version given by the HTCondor API. 

555 

556 Returns 

557 ------- 

558 version : `str` 

559 HTCondor version as easily comparable string. 

560 """ 

561 return str(HTC_VERSION) 

562 

563 

564def htc_submit_dag(sub): 

565 """Submit job for execution. 

566 

567 Parameters 

568 ---------- 

569 sub : `htcondor.Submit` 

570 An object representing a job submit description. 

571 

572 Returns 

573 ------- 

574 schedd_job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

575 Information about the submitted DAGMan job where, for the scheduler

576 used for submission, the local HTCondor job id is mapped to its

577 classad.

578 """ 

579 coll = htcondor.Collector() 

580 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

581 schedd = htcondor.Schedd(schedd_ad) 

582 

583 jobs_ads = [] 

584 with schedd.transaction() as txn: 

585 sub.queue(txn, ad_results=jobs_ads) 

586 

587 # Submit.queue() above will raise RuntimeError if submission fails, so 

588 # 'jobs_ads' should contain the ad at this point. 

589 dag_ad = jobs_ads[0] 

590 

591 # Sadly, the ClassAd from Submit.queue() (see above) does not have 

592 # 'GlobalJobId' so we need to run a regular query to get it anyway. 

593 schedd_name = schedd_ad["Name"] 

594 schedd_dag_info = condor_q( 

595 constraint=f"ClusterId == {dag_ad['ClusterId']}", schedds={schedd_name: schedd} 

596 ) 

597 return schedd_dag_info 

598 

599 

600def htc_create_submit_from_dag(dag_filename, submit_options=None): 

601 """Create a DAGMan job submit description. 

602 

603 Parameters 

604 ---------- 

605 dag_filename : `str` 

606 Name of file containing HTCondor DAG commands. 

607 submit_options : `dict` [`str`, Any], optional 

608 Contains extra options for command line (Value of None means flag). 

609 

610 Returns 

611 ------- 

612 sub : `htcondor.Submit` 

613 An object representing a job submit description. 

614 

615 Notes 

616 ----- 

617 Use with HTCondor versions which support htcondor.Submit.from_dag(), 

618 i.e., 8.9.3 or newer. 

619 """ 

620 return htcondor.Submit.from_dag(dag_filename, submit_options) 

621 

622 

623def htc_create_submit_from_cmd(dag_filename, submit_options=None): 

624 """Create a DAGMan job submit description. 

625 

626 Create a DAGMan job submit description by calling ``condor_submit_dag`` 

627 on given DAG description file. 

628 

629 Parameters 

630 ---------- 

631 dag_filename : `str` 

632 Name of file containing HTCondor DAG commands. 

633 submit_options : `dict` [`str`, Any], optional 

634 Contains extra options for command line (Value of None means flag). 

635 

636 Returns 

637 ------- 

638 sub : `htcondor.Submit` 

639 An object representing a job submit description. 

640 

641 Notes 

642 ----- 

643 Use with HTCondor versions which do not support htcondor.Submit.from_dag(), 

644 i.e., older than 8.9.3. 

645 """ 

646 # Run command line condor_submit_dag command. 

647 cmd = "condor_submit_dag -f -no_submit -notification never -autorescue 1 -UseDagDir -no_recurse " 

648 

649 if submit_options is not None: 

650 for opt, val in submit_options.items(): 

651 cmd += f" -{opt} {val or ''}" 

652 cmd += f" {dag_filename}"

653 

654 process = subprocess.Popen( 

655 cmd.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding="utf-8" 

656 ) 

657 process.wait() 

658 

659 if process.returncode != 0: 

660 print(f"Exit code: {process.returncode}") 

661 print(process.communicate()[0]) 

662 raise RuntimeError("Problems running condor_submit_dag") 

663 

664 return htc_create_submit_from_file(f"{dag_filename}.condor.sub") 

665 

666 

667def htc_create_submit_from_file(submit_file): 

668 """Parse a submission file. 

669 

670 Parameters 

671 ---------- 

672 submit_file : `str` 

673 Name of the HTCondor submit file. 

674 

675 Returns 

676 ------- 

677 sub : `htcondor.Submit` 

678 An object representing a job submit description. 

679 """ 

680 descriptors = {} 

681 with open(submit_file) as fh: 

682 for line in fh: 

683 line = line.strip() 

684 if line and not line.startswith("#") and line != "queue":

685 (key, val) = re.split(r"\s*=\s*", line, 1) 

686 descriptors[key] = val 

687 

688 # Avoid UserWarning: the line 'copy_to_spool = False' was 

689 # unused by Submit object. Is it a typo? 

690 try: 

691 del descriptors["copy_to_spool"] 

692 except KeyError: 

693 pass 

694 

695 return htcondor.Submit(descriptors) 

696 
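
A sketch tying the helpers above together, following the version guidance in their Notes sections. The DAG file path and the 'maxidle' option are illustrative values, not anything used by this module.

dag_file = "/path/to/submit/run.dag"      # hypothetical path
options = {"maxidle": 1000}               # illustrative condor_submit_dag option
if HTC_VERSION >= version.parse("8.9.3"):
    sub = htc_create_submit_from_dag(dag_file, options)
else:
    sub = htc_create_submit_from_cmd(dag_file, options)
schedd_dag_info = htc_submit_dag(sub)     # submit and get back {schedd: {id: classad}}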

697 

698def _htc_write_job_commands(stream, name, jobs): 

699 """Output the DAGMan job lines for single job in DAG. 

700 

701 Parameters 

702 ---------- 

703 stream : `~io.TextIOBase` 

704 Writeable text stream (typically an opened file). 

705 name : `str` 

706 Job name. 

707 jobs : `RestrictedDict` 

708 DAG job keys and values. 

709 """ 

710 if "pre" in jobs: 

711 print( 

712 f"SCRIPT {jobs['pre'].get('defer', '')} PRE {name}" 

713 f"{jobs['pre']['executable']} {jobs['pre'].get('arguments', '')}", 

714 file=stream, 

715 ) 

716 

717 if "post" in jobs: 

718 print( 

719 f"SCRIPT {jobs['post'].get('defer', '')} PRE {name}" 

720 f"{jobs['post']['executable']} {jobs['post'].get('arguments', '')}", 

721 file=stream, 

722 ) 

723 

724 if "vars" in jobs: 

725 for key, value in jobs["vars"]: 

726 print(f'VARS {name} {key}="{htc_escape(value)}"', file=stream) 

727 

728 if "pre_skip" in jobs: 

729 print(f"PRE_SKIP {name} {jobs['pre_skip']}", file=stream) 

730 

731 if "retry" in jobs and jobs["retry"]: 

732 print(f"RETRY {name} {jobs['retry']} ", end="", file=stream) 

733 if "retry_unless_exit" in jobs: 

734 print(f"UNLESS-EXIT {jobs['retry_unless_exit']}", end="", file=stream) 

735 print("\n", file=stream) 

736 

737 if "abort_dag_on" in jobs and jobs["abort_dag_on"]: 

738 print( 

739 f"ABORT-DAG-ON {name} {jobs['abort_dag_on']['node_exit']}" 

740 f" RETURN {jobs['abort_dag_on']['abort_exit']}", 

741 file=stream, 

742 ) 

743 

744 

745class HTCJob: 

746 """HTCondor job for use in building DAG. 

747 

748 Parameters 

749 ---------- 

750 name : `str` 

751 Name of the job 

752 label : `str` 

753 Label that can be used for grouping or lookup.

754 initcmds : `RestrictedDict` 

755 Initial job commands for submit file. 

756 initdagcmds : `RestrictedDict` 

757 Initial commands for job inside DAG. 

758 initattrs : `dict` 

759 Initial dictionary of job attributes. 

760 """ 

761 

762 def __init__(self, name, label=None, initcmds=(), initdagcmds=(), initattrs=None): 

763 self.name = name 

764 self.label = label 

765 self.cmds = RestrictedDict(HTC_VALID_JOB_KEYS, initcmds) 

766 self.dagcmds = RestrictedDict(HTC_VALID_JOB_DAG_KEYS, initdagcmds) 

767 self.attrs = initattrs 

768 self.subfile = None 

769 

770 def __str__(self): 

771 return self.name 

772 

773 def add_job_cmds(self, new_commands): 

774 """Add commands to Job (overwrite existing). 

775 

776 Parameters 

777 ---------- 

778 new_commands : `dict` 

779 Submit file commands to be added to Job. 

780 """ 

781 self.cmds.update(new_commands) 

782 

783 def add_dag_cmds(self, new_commands): 

784 """Add DAG commands to Job (overwrite existing). 

785 

786 Parameters 

787 ---------- 

788 new_commands : `dict` 

789 DAG file commands to be added to Job 

790 """ 

791 self.dagcmds.update(new_commands) 

792 

793 def add_job_attrs(self, new_attrs): 

794 """Add attributes to Job (overwrite existing). 

795 

796 Parameters 

797 ---------- 

798 new_attrs : `dict` 

799 Attributes to be added to Job 

800 """ 

801 if self.attrs is None: 

802 self.attrs = {} 

803 if new_attrs: 

804 self.attrs.update(new_attrs) 

805 

806 def write_submit_file(self, submit_path, job_subdir=""): 

807 """Write job description to submit file. 

808 

809 Parameters 

810 ---------- 

811 submit_path : `str` 

812 Prefix path for the submit file. 

813 job_subdir : `str`, optional 

814 Template for job subdir. 

815 """ 

816 if not self.subfile: 

817 self.subfile = f"{self.name}.sub" 

818 job_subdir = job_subdir.format(self=self) 

819 if job_subdir: 

820 self.subfile = os.path.join(job_subdir, self.subfile) 

821 htc_write_condor_file(os.path.join(submit_path, self.subfile), self.name, self.cmds, self.attrs) 

822 

823 def write_dag_commands(self, stream): 

824 """Write DAG commands for single job to output stream. 

825 

826 Parameters 

827 ---------- 

828 stream : `IO` or `str` 

829 Output Stream 

830 """ 

831 print(f"JOB {self.name} {self.subfile}", file=stream) 

832 _htc_write_job_commands(stream, self.name, self.dagcmds) 

833 

834 def dump(self, fh): 

835 """Dump job information to output stream. 

836 

837 Parameters 

838 ---------- 

839 fh : `~io.TextIOBase` 

840 Output stream 

841 """ 

842 printer = pprint.PrettyPrinter(indent=4, stream=fh) 

843 printer.pprint(self.name) 

844 printer.pprint(self.cmds) 

845 printer.pprint(self.attrs) 

846 

847 

848class HTCDag(networkx.DiGraph): 

849 """HTCondor DAG. 

850 

851 Parameters 

852 ---------- 

853 data : networkx.DiGraph.data 

854 Initial graph. 

855 name : `str` 

856 Name for DAG. 

857 """ 

858 

859 def __init__(self, data=None, name=""): 

860 super().__init__(data=data, name=name) 

861 

862 self.graph["attr"] = {} 

863 self.graph["run_id"] = None 

864 self.graph["submit_path"] = None 

865 self.graph["final_job"] = None 

866 

867 def __str__(self): 

868 """Represent basic DAG info as string. 

869 

870 Returns 

871 ------- 

872 info : `str` 

873 String containing basic DAG info. 

874 """ 

875 return f"{self.graph['name']} {len(self)}" 

876 

877 def add_attribs(self, attribs=None): 

878 """Add attributes to the DAG. 

879 

880 Parameters 

881 ---------- 

882 attribs : `dict` 

883 DAG attributes 

884 """ 

885 if attribs is not None: 

886 self.graph["attr"].update(attribs) 

887 

888 def add_job(self, job, parent_names=None, child_names=None): 

889 """Add an HTCJob to the HTCDag. 

890 

891 Parameters 

892 ---------- 

893 job : `HTCJob` 

894 HTCJob to add to the HTCDag 

895 parent_names : `~collections.abc.Iterable` [`str`], optional 

896 Names of parent jobs 

897 child_names : `~collections.abc.Iterable` [`str`], optional 

898 Names of child jobs 

899 """ 

900 assert isinstance(job, HTCJob) 

901 

902 # Add dag level attributes to each job 

903 job.add_job_attrs(self.graph["attr"]) 

904 

905 self.add_node(job.name, data=job) 

906 

907 if parent_names is not None: 

908 self.add_job_relationships(parent_names, [job.name])

909 

910 if child_names is not None: 

911 self.add_job_relationships([job.name], child_names)

912 

913 def add_job_relationships(self, parents, children): 

914 """Add DAG edge between parents and children jobs. 

915 

916 Parameters 

917 ---------- 

918 parents : `list` [`str`] 

919 Contains parent job name(s). 

920 children : `list` [`str`] 

921 Contains children job name(s). 

922 """ 

923 self.add_edges_from(itertools.product(parents, children)) 

924 

925 def add_final_job(self, job): 

926 """Add an HTCJob for the FINAL job in HTCDag. 

927 

928 Parameters 

929 ---------- 

930 job : `HTCJob` 

931 HTCJob to add to the HTCDag as a FINAL job. 

932 """ 

933 # Add dag level attributes to each job 

934 job.add_job_attrs(self.graph["attr"]) 

935 

936 self.graph["final_job"] = job 

937 

938 def del_job(self, job_name): 

939 """Delete the job from the DAG. 

940 

941 Parameters 

942 ---------- 

943 job_name : `str` 

944 Name of job in DAG to delete 

945 """ 

946 # Reconnect edges around node to delete 

947 parents = self.predecessors(job_name) 

948 children = self.successors(job_name) 

949 self.add_edges_from(itertools.product(parents, children)) 

950 

951 # Delete job node (which deletes its edges). 

952 self.remove_node(job_name) 

953 

954 def write(self, submit_path, job_subdir=""): 

955 """Write DAG to a file. 

956 

957 Parameters 

958 ---------- 

959 submit_path : `str` 

960 Prefix path for dag filename to be combined with DAG name. 

961 job_subdir : `str`, optional 

962 Template for job subdir. 

963 """ 

964 self.graph["submit_path"] = submit_path 

965 self.graph["dag_filename"] = os.path.join(submit_path, f"{self.graph['name']}.dag") 

966 os.makedirs(submit_path, exist_ok=True) 

967 with open(self.graph["dag_filename"], "w") as fh: 

968 for _, nodeval in self.nodes().items(): 

969 job = nodeval["data"] 

970 job.write_submit_file(submit_path, job_subdir) 

971 job.write_dag_commands(fh) 

972 for edge in self.edges(): 

973 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh) 

974 print(f"DOT {self.name}.dot", file=fh) 

975 print(f"NODE_STATUS_FILE {self.name}.node_status", file=fh) 

976 

977 # Add bps attributes to dag submission 

978 for key, value in self.graph["attr"].items(): 

979 print(f'SET_JOB_ATTR {key}= "{htc_escape(value)}"', file=fh) 

980 

981 if self.graph["final_job"]: 

982 job = self.graph["final_job"] 

983 job.write_submit_file(submit_path, job_subdir) 

984 print(f"FINAL {job.name} {job.subfile}", file=fh) 

985 if "pre" in job.dagcmds: 

986 print(f"SCRIPT PRE {job.name} {job.dagcmds['pre']}", file=fh) 

987 if "post" in job.dagcmds: 

988 print(f"SCRIPT POST {job.name} {job.dagcmds['post']}", file=fh) 

989 

990 def dump(self, fh): 

991 """Dump DAG info to output stream. 

992 

993 Parameters 

994 ---------- 

995 fh : `io.IO` or `str` 

996 Where to dump DAG info as text. 

997 """ 

998 for key, value in self.graph.items():

999 print(f"{key}={value}", file=fh) 

1000 for name, data in self.nodes().items(): 

1001 print(f"{name}:", file=fh) 

1002 data["data"].dump(fh)

1003 for edge in self.edges(): 

1004 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh) 

1005 if self.graph["final_job"]: 

1006 print(f'FINAL {self.graph["final_job"].name}:', file=fh) 

1007 self.graph["final_job"].dump(fh) 

1008 

1009 def write_dot(self, filename): 

1010 """Write a dot version of the DAG. 

1011 

1012 Parameters 

1013 ---------- 

1014 filename : `str` 

1015 dot filename 

1016 """ 

1017 pos = networkx.nx_agraph.graphviz_layout(self) 

1018 networkx.draw(self, pos=pos) 

1019 networkx.drawing.nx_pydot.write_dot(self, filename) 

1020 

1021 
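
A sketch of how HTCJob and HTCDag are meant to be combined; the job names, labels, executables, and paths below are invented for illustration.

dag = HTCDag(name="demo_run")
dag.add_attribs({"bps_run": "demo_run"})

init_job = HTCJob("pipetaskInit", label="pipetaskInit")
init_job.add_job_cmds({"universe": "vanilla", "executable": "/bin/true"})
dag.add_job(init_job)

step_job = HTCJob("step1_visit_001", label="step1")
step_job.add_job_cmds({"universe": "vanilla", "executable": "/bin/true"})
dag.add_job(step_job, parent_names=["pipetaskInit"])

dag.write("submit/demo_run", job_subdir="jobs/{self.label}")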

1022def condor_q(constraint=None, schedds=None, **kwargs): 

1023 """Get information about the jobs in the HTCondor job queue(s). 

1024 

1025 Parameters 

1026 ---------- 

1027 constraint : `str`, optional 

1028 Constraints to be passed to job query. 

1029 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

1030 HTCondor schedulers which to query for job information. If None 

1031 (default), the query will be run against local scheduler only. 

1032 **kwargs: 

1033 Additional keyword arguments that need to be passed to the internal 

1034 query method. 

1035 

1036 Returns 

1037 ------- 

1038 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

1039 Information about jobs satisfying the search criteria where for each 

1040 Scheduler, local HTCondor job ids are mapped to their respective 

1041 classads. 

1042 """ 

1043 return condor_query(constraint, schedds, htc_query_present, **kwargs) 

1044 

1045 

1046def condor_history(constraint=None, schedds=None, **kwargs): 

1047 """Get information about the jobs from HTCondor history records. 

1048 

1049 Parameters 

1050 ---------- 

1051 constraint : `str`, optional 

1052 Constraints to be passed to job query. 

1053 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

1054 HTCondor schedulers which to query for job information. If None 

1055 (default), the query will be run against the history file of 

1056 the local scheduler only. 

1057 **kwargs: 

1058 Additional keyword arguments that need to be passed to the internal 

1059 query method. 

1060 

1061 Returns 

1062 ------- 

1063 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

1064 Information about jobs satisfying the search criteria where for each 

1065 Scheduler, local HTCondor job ids are mapped to their respective 

1066 classads. 

1067 """ 

1068 return condor_query(constraint, schedds, htc_query_history, **kwargs) 

1069 

1070 

1071def condor_query(constraint=None, schedds=None, query_func=htc_query_present, **kwargs): 

1072 """Get information about HTCondor jobs. 

1073 

1074 Parameters 

1075 ---------- 

1076 constraint : `str`, optional 

1077 Constraints to be passed to job query. 

1078 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

1079 HTCondor schedulers which to query for job information. If None 

1080 (default), the query will be run against the history file of 

1081 the local scheduler only. 

1082 query_func : callable 

1083 A query function which takes the following arguments:

1084 

1085 - ``schedds``: Schedulers to query (`list` [`htcondor.Schedd`]). 

1086 - ``**kwargs``: Keyword arguments that will be passed to the query 

1087 function. 

1088 **kwargs: 

1089 Additional keyword arguments that need to be passed to the query 

1090 method. 

1091 

1092 Returns 

1093 ------- 

1094 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

1095 Information about jobs satisfying the search criteria where for each 

1096 Scheduler, local HTCondor job ids are mapped to their respective 

1097 classads. 

1098 """ 

1099 if not schedds: 

1100 coll = htcondor.Collector() 

1101 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1102 schedds = {schedd_ad["Name"]: htcondor.Schedd(schedd_ad)} 

1103 

1104 job_info = defaultdict(dict) 

1105 for schedd_name, job_ad in query_func(schedds, constraint=constraint, **kwargs): 

1106 del job_ad["Environment"] 

1107 del job_ad["Env"] 

1108 id_ = f"{int(job_ad['ClusterId'])}.{int(job_ad['ProcId'])}" 

1109 job_info[schedd_name][id_] = dict(job_ad) 

1110 _LOG.debug("query returned %d jobs", sum(len(val) for val in job_info.values())) 

1111 

1112 # When returning the results filter out entries for schedulers with no jobs 

1113 # matching the search criteria. 

1114 return {key: val for key, val in job_info.items() if val} 

1115 

1116 

1117def condor_search(constraint=None, hist=None, schedds=None): 

1118 """Search for running and finished jobs satisfying given criteria. 

1119 

1120 Parameters 

1121 ---------- 

1122 constraint : `str`, optional 

1123 Constraints to be passed to job query. 

1124 hist : `float` 

1125 Limit history search to this many days. 

1126 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

1127 The HTCondor schedulers to query for job information.

1128 If None (default), only the local scheduler will be queried. 

1129 

1130 Returns 

1131 ------- 

1132 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

1133 Information about jobs satisfying the search criteria where for each 

1134 Scheduler, local HTCondor job ids are mapped to their respective 

1135 classads. 

1136 """ 

1137 if not schedds: 

1138 coll = htcondor.Collector() 

1139 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1140 schedds = {schedd_ad["Name"]: htcondor.Schedd(locate_ad=schedd_ad)} 

1141 

1142 job_info = condor_q(constraint=constraint, schedds=schedds) 

1143 if hist is not None: 

1144 epoch = (datetime.now() - timedelta(days=hist)).timestamp() 

1145 constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})" 

1146 hist_info = condor_history(constraint, schedds=schedds) 

1147 update_job_info(job_info, hist_info) 

1148 return job_info 

1149 

1150 
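
A usage sketch: search the queue plus the last day of history for jobs matching a constraint. The ClassAd expression is illustrative; any valid constraint works.

job_info = condor_search(constraint='bps_run == "demo_run"', hist=1.0)
for schedd_name, jobs in job_info.items():
    print(schedd_name, sorted(jobs))      # local job ids per scheduler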

1151def condor_status(constraint=None, coll=None): 

1152 """Get information about HTCondor pool. 

1153 

1154 Parameters 

1155 ---------- 

1156 constraint : `str`, optional 

1157 Constraints to be passed to the query. 

1158 coll : `htcondor.Collector`, optional 

1159 Object representing HTCondor collector daemon. 

1160 

1161 Returns 

1162 ------- 

1163 pool_info : `dict` [`str`, `dict` [`str`, Any]] 

1164 Mapping between HTCondor slot names and slot information (classAds). 

1165 """ 

1166 if coll is None: 

1167 coll = htcondor.Collector() 

1168 try: 

1169 pool_ads = coll.query(constraint=constraint) 

1170 except OSError as ex: 

1171 raise RuntimeError(f"Problem querying the Collector. (Constraint='{constraint}')") from ex 

1172 

1173 pool_info = {} 

1174 for slot in pool_ads: 

1175 pool_info[slot["name"]] = dict(slot) 

1176 _LOG.debug("condor_status returned %d ads", len(pool_info)) 

1177 return pool_info 

1178 

1179 

1180def update_job_info(job_info, other_info): 

1181 """Update results of a job query with results from another query. 

1182 

1183 Parameters 

1184 ---------- 

1185 job_info : `dict` [`str`, `dict` [`str`, Any]] 

1186 Results of the job query that needs to be updated. 

1187 other_info : `dict` [`str`, `dict` [`str`, Any]] 

1188 Results of the other job query. 

1189 

1190 Returns 

1191 ------- 

1192 job_info : `dict` [`str`, `dict` [`str`, Any]] 

1193 The updated results. 

1194 """ 

1195 for schedd_name, others in other_info.items(): 

1196 try: 

1197 jobs = job_info[schedd_name] 

1198 except KeyError: 

1199 job_info[schedd_name] = others 

1200 else: 

1201 for id_, ad in others.items(): 

1202 jobs.setdefault(id_, {}).update(ad) 

1203 return job_info 

1204 

1205 

1206def summary_from_dag(dir_name): 

1207 """Build bps_run_summary string from dag file. 

1208 

1209 Parameters 

1210 ---------- 

1211 dir_name : `str` 

1212 Path that includes dag file for a run. 

1213 

1214 Returns 

1215 ------- 

1216 summary : `str` 

1217 Semi-colon separated list of job labels and counts. 

1218 (Same format as saved in dag classad.) 

1219 job_name_to_pipetask : `dict` [`str`, `str`] 

1220 Mapping of job names to job labels 

1221 """ 

1222 dag = next(Path(dir_name).glob("*.dag")) 

1223 

1224 # Later code depends upon insertion order 

1225 counts = defaultdict(int) 

1226 job_name_to_pipetask = {} 

1227 try: 

1228 with open(dag) as fh: 

1229 for line in fh: 

1230 if line.startswith("JOB"): 

1231 m = re.match(r"JOB ([^\s]+) jobs/([^/]+)/", line) 

1232 if m: 

1233 label = m.group(2) 

1234 if label == "init": 

1235 label = "pipetaskInit" 

1236 job_name_to_pipetask[m.group(1)] = label 

1237 counts[label] += 1 

1238 else: # Check if Pegasus submission 

1239 m = re.match(r"JOB ([^\s]+) ([^\s]+)", line) 

1240 if m: 

1241 label = pegasus_name_to_label(m.group(1)) 

1242 job_name_to_pipetask[m.group(1)] = label 

1243 counts[label] += 1 

1244 else: 

1245 _LOG.warning("Parse DAG: unmatched job line: %s", line) 

1246 elif line.startswith("FINAL"): 

1247 m = re.match(r"FINAL ([^\s]+) jobs/([^/]+)/", line) 

1248 if m: 

1249 label = m.group(2) 

1250 job_name_to_pipetask[m.group(1)] = label 

1251 counts[label] += 1 

1252 

1253 except (OSError, PermissionError, StopIteration): 

1254 pass 

1255 

1256 summary = ";".join([f"{name}:{counts[name]}" for name in counts]) 

1257 _LOG.debug("summary_from_dag: %s %s", summary, job_name_to_pipetask) 

1258 return summary, job_name_to_pipetask 

1259 

1260 

1261def pegasus_name_to_label(name): 

1262 """Convert pegasus job name to a label for the report. 

1263 

1264 Parameters 

1265 ---------- 

1266 name : `str` 

1267 Name of job. 

1268 

1269 Returns 

1270 ------- 

1271 label : `str` 

1272 Label for job. 

1273 """ 

1274 label = "UNK" 

1275 if name.startswith("create_dir") or name.startswith("stage_in") or name.startswith("stage_out"): 

1276 label = "pegasus" 

1277 else: 

1278 m = re.match(r"pipetask_(\d+_)?([^_]+)", name) 

1279 if m: 

1280 label = m.group(2) 

1281 if label == "init": 

1282 label = "pipetaskInit" 

1283 

1284 return label 

1285 

1286 
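
A few illustrative mappings produced by the function above (the job names are made up):

assert pegasus_name_to_label("create_dir_demo_workflow") == "pegasus"
assert pegasus_name_to_label("pipetask_5_calibrate") == "calibrate"
assert pegasus_name_to_label("pipetask_init") == "pipetaskInit"
assert pegasus_name_to_label("cleanup") == "UNK"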

1287def read_dag_status(wms_path): 

1288 """Read the node status file for DAG summary information 

1289 

1290 Parameters 

1291 ---------- 

1292 wms_path : `str` 

1293 Path that includes node status file for a run. 

1294 

1295 Returns 

1296 ------- 

1297 dag_ad : `dict` [`str`, Any] 

1298 DAG summary information. 

1299 """ 

1300 dag_ad = {} 

1301 

1302 # While this is probably more up to date than the dag classad, only read

1303 # from the file if needed.

1304 try: 

1305 try: 

1306 node_stat_file = next(Path(wms_path).glob("*.node_status")) 

1307 _LOG.debug("Reading Node Status File %s", node_stat_file) 

1308 with open(node_stat_file) as infh: 

1309 dag_ad = classad.parseNext(infh) # pylint: disable=E1101 

1310 except StopIteration: 

1311 pass 

1312 

1313 if not dag_ad: 

1314 # Pegasus check here 

1315 try: 

1316 metrics_file = next(Path(wms_path).glob("*.dag.metrics")) 

1317 with open(metrics_file) as infh: 

1318 metrics = json.load(infh) 

1319 dag_ad["NodesTotal"] = metrics.get("jobs", 0) 

1320 dag_ad["NodesFailed"] = metrics.get("jobs_failed", 0) 

1321 dag_ad["NodesDone"] = metrics.get("jobs_succeeded", 0) 

1322 dag_ad["pegasus_version"] = metrics.get("planner_version", "") 

1323 except StopIteration: 

1324 try: 

1325 metrics_file = next(Path(wms_path).glob("*.metrics")) 

1326 with open(metrics_file) as infh: 

1327 metrics = json.load(infh) 

1328 dag_ad["NodesTotal"] = metrics["wf_metrics"]["total_jobs"] 

1329 dag_ad["pegasus_version"] = metrics.get("version", "") 

1330 except StopIteration: 

1331 pass 

1332 except (OSError, PermissionError): 

1333 pass 

1334 

1335 _LOG.debug("read_dag_status: %s", dag_ad) 

1336 return dict(dag_ad) 

1337 

1338 

1339def read_node_status(wms_path): 

1340 """Read entire node status file. 

1341 

1342 Parameters 

1343 ---------- 

1344 wms_path : `str` 

1345 Path that includes node status file for a run. 

1346 

1347 Returns 

1348 ------- 

1349 jobs : `dict` [`str`, Any] 

1350 Information about the workflow's jobs, mapped to HTCondor job id.

1351 """ 

1352 # Get jobid info from other places to fill in gaps in info from node_status 

1353 _, job_name_to_pipetask = summary_from_dag(wms_path) 

1354 wms_workflow_id, loginfo = read_dag_log(wms_path) 

1355 loginfo = read_dag_nodes_log(wms_path) 

1356 _LOG.debug("loginfo = %s", loginfo) 

1357 job_name_to_id = {} 

1358 for jid, jinfo in loginfo.items(): 

1359 if "LogNotes" in jinfo: 

1360 m = re.match(r"DAG Node: ([^\s]+)", jinfo["LogNotes"]) 

1361 if m: 

1362 job_name_to_id[m.group(1)] = jid 

1363 jinfo["DAGNodeName"] = m.group(1) 

1364 

1365 try: 

1366 node_status = next(Path(wms_path).glob("*.node_status")) 

1367 except StopIteration: 

1368 return loginfo 

1369 

1370 jobs = {} 

1371 fake_id = -1.0 # For nodes that do not yet have a job id, give fake one 

1372 try: 

1373 with open(node_status) as fh: 

1374 ads = classad.parseAds(fh) 

1375 

1376 for jclassad in ads: 

1377 if jclassad["Type"] == "DagStatus": 

1378 # skip DAG summary 

1379 pass 

1380 elif "Node" not in jclassad: 

1381 if jclassad["Type"] != "StatusEnd": 

1382 _LOG.debug("Key 'Node' not in classad: %s", jclassad) 

1383 break 

1384 else: 

1385 if jclassad["Node"] in job_name_to_pipetask: 

1386 try: 

1387 label = job_name_to_pipetask[jclassad["Node"]] 

1388 except KeyError: 

1389 _LOG.error("%s not in %s", jclassad["Node"], job_name_to_pipetask.keys()) 

1390 raise 

1391 elif "_" in jclassad["Node"]: 

1392 label = jclassad["Node"].split("_")[1] 

1393 else: 

1394 label = jclassad["Node"] 

1395 

1396 # Make job info as if came from condor_q 

1397 if jclassad["Node"] in job_name_to_id: 

1398 job_id = job_name_to_id[jclassad["Node"]] 

1399 else: 

1400 job_id = str(fake_id) 

1401 fake_id -= 1 

1402 

1403 job = dict(jclassad) 

1404 job["ClusterId"] = int(float(job_id)) 

1405 job["DAGManJobID"] = wms_workflow_id 

1406 job["DAGNodeName"] = jclassad["Node"] 

1407 job["bps_job_label"] = label 

1408 

1409 jobs[str(job_id)] = job 

1410 except (OSError, PermissionError): 

1411 pass 

1412 

1413 return jobs 

1414 

1415 

1416def read_dag_log(wms_path): 

1417 """Read job information from the DAGMan log file. 

1418 

1419 Parameters 

1420 ---------- 

1421 wms_path : `str` 

1422 Path containing the DAGMan log file. 

1423 

1424 Returns 

1425 ------- 

1426 wms_workflow_id : `str` 

1427 HTCondor job id (i.e., <ClusterId>.<ProcId>) of the DAGMan job. 

1428 dag_info : `dict` [`str`, `~collections.abc.Any`] 

1429 HTCondor job information read from the log file mapped to HTCondor 

1430 job id. 

1431 

1432 Raises 

1433 ------ 

1434 FileNotFoundError 

1435 If cannot find DAGMan log in given wms_path. 

1436 """ 

1437 wms_workflow_id = 0 

1438 dag_info = {} 

1439 

1440 path = Path(wms_path) 

1441 if path.exists(): 

1442 try: 

1443 filename = next(path.glob("*.dag.dagman.log")) 

1444 except StopIteration as exc: 

1445 raise FileNotFoundError(f"DAGMan log not found in {wms_path}") from exc 

1446 _LOG.debug("dag node log filename: %s", filename) 

1447 

1448 info = {} 

1449 job_event_log = htcondor.JobEventLog(str(filename)) 

1450 for event in job_event_log.events(stop_after=0): 

1451 id_ = f"{event['Cluster']}.{event['Proc']}" 

1452 if id_ not in info: 

1453 info[id_] = {} 

1454 wms_workflow_id = id_ # taking last job id in case of restarts 

1455 info[id_].update(event) 

1456 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"] 

1457 

1458 # only save latest DAG job 

1459 dag_info = {wms_workflow_id: info[wms_workflow_id]} 

1460 for job in dag_info.values(): 

1461 _tweak_log_info(filename, job) 

1462 

1463 return wms_workflow_id, dag_info 

1464 

1465 

1466def read_dag_nodes_log(wms_path): 

1467 """Read job information from the DAGMan nodes log file. 

1468 

1469 Parameters 

1470 ---------- 

1471 wms_path : `str` 

1472 Path containing the DAGMan nodes log file. 

1473 

1474 Returns 

1475 ------- 

1476 info : `dict` [`str`, Any] 

1477 HTCondor job information read from the log file mapped to HTCondor 

1478 job id. 

1479 

1480 Raises 

1481 ------ 

1482 FileNotFoundError 

1483 If cannot find DAGMan node log in given wms_path. 

1484 """ 

1485 try: 

1486 filename = next(Path(wms_path).glob("*.dag.nodes.log")) 

1487 except StopIteration as exc: 

1488 raise FileNotFoundError(f"DAGMan node log not found in {wms_path}") from exc 

1489 _LOG.debug("dag node log filename: %s", filename) 

1490 

1491 info = {} 

1492 job_event_log = htcondor.JobEventLog(str(filename)) 

1493 for event in job_event_log.events(stop_after=0): 

1494 id_ = f"{event['Cluster']}.{event['Proc']}" 

1495 if id_ not in info: 

1496 info[id_] = {} 

1497 info[id_].update(event) 

1498 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"] 

1499 

1500 # Add more condor_q-like info to info parsed from log file. 

1501 for job in info.values(): 

1502 _tweak_log_info(filename, job) 

1503 

1504 return info 

1505 

1506 

1507def read_dag_info(wms_path): 

1508 """Read custom DAGMan job information from the file. 

1509 

1510 Parameters 

1511 ---------- 

1512 wms_path : `str` 

1513 Path containing the file with the DAGMan job info. 

1514 

1515 Returns 

1516 ------- 

1517 dag_info : `dict` [`str`, `dict` [`str`, Any]] 

1518 HTCondor job information. 

1519 

1520 Raises 

1521 ------ 

1522 FileNotFoundError 

1523 If cannot find DAGMan job info file in the given location. 

1524 """ 

1525 try: 

1526 filename = next(Path(wms_path).glob("*.info.json")) 

1527 except StopIteration as exc: 

1528 raise FileNotFoundError(f"File with DAGMan job information not found in {wms_path}") from exc 

1529 _LOG.debug("DAGMan job information filename: %s", filename) 

1530 try: 

1531 with open(filename) as fh: 

1532 dag_info = json.load(fh) 

1533 except (OSError, PermissionError) as exc: 

1534 _LOG.debug("Retrieving DAGMan job information failed: %s", exc) 

1535 dag_info = {} 

1536 return dag_info 

1537 

1538 

1539def write_dag_info(filename, dag_info): 

1540 """Write custom job information about DAGMan job. 

1541 

1542 Parameters 

1543 ---------- 

1544 filename : `str` 

1545 Name of the file where the information will be stored. 

1546 dag_info : `dict` [`str` `dict` [`str`, Any]] 

1547 Information about the DAGMan job. 

1548 """ 

1549 schedd_name = next(iter(dag_info)) 

1550 dag_id = next(iter(dag_info[schedd_name])) 

1551 dag_ad = dag_info[schedd_name][dag_id] 

1552 try: 

1553 with open(filename, "w") as fh: 

1554 info = { 

1555 schedd_name: { 

1556 dag_id: {"ClusterId": dag_ad["ClusterId"], "GlobalJobId": dag_ad["GlobalJobId"]} 

1557 } 

1558 } 

1559 json.dump(info, fh) 

1560 except (KeyError, OSError, PermissionError) as exc: 

1561 _LOG.debug("Persisting DAGMan job information failed: %s", exc) 

1562 

1563 
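
A sketch of the round trip between write_dag_info and read_dag_info; the scheduler name, ids, and paths are invented for illustration.

dag_info = {
    "sched01.example.com": {
        "1234.0": {"ClusterId": 1234, "GlobalJobId": "sched01.example.com#1234.0#1694940000"},
    }
}
write_dag_info("/path/to/submit/run.info.json", dag_info)   # hypothetical location
restored = read_dag_info("/path/to/submit")                  # finds the *.info.json file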

1564def _tweak_log_info(filename, job): 

1565 """Massage the given job info has same structure as if came from condor_q. 

1566 

1567 Parameters 

1568 ---------- 

1569 filename : `pathlib.Path` 

1570 Name of the DAGMan log. 

1571 job : `dict` [ `str`, Any ] 

1572 Information about a single HTCondor job read from the log. It is

1573 updated in place.

1574 """ 

1575 _LOG.debug("_tweak_log_info: %s %s", filename, job) 

1576 try: 

1577 job["ClusterId"] = job["Cluster"] 

1578 job["ProcId"] = job["Proc"] 

1579 job["Iwd"] = str(filename.parent) 

1580 job["Owner"] = filename.owner() 

1581 if job["MyType"] == "ExecuteEvent": 

1582 job["JobStatus"] = JobStatus.RUNNING 

1583 elif job["MyType"] == "JobTerminatedEvent" or job["MyType"] == "PostScriptTerminatedEvent": 

1584 job["JobStatus"] = JobStatus.COMPLETED 

1585 try: 

1586 if not job["TerminatedNormally"]: 

1587 if "ReturnValue" in job: 

1588 job["ExitCode"] = job["ReturnValue"] 

1589 job["ExitBySignal"] = False 

1590 elif "TerminatedBySignal" in job: 

1591 job["ExitBySignal"] = True 

1592 job["ExitSignal"] = job["TerminatedBySignal"] 

1593 else: 

1594 _LOG.warning("Could not determine exit status for completed job: %s", job) 

1595 except KeyError as ex: 

1596 _LOG.error("Could not determine exit status for job (missing %s): %s", str(ex), job) 

1597 elif job["MyType"] == "SubmitEvent": 

1598 job["JobStatus"] = JobStatus.IDLE 

1599 elif job["MyType"] == "JobAbortedEvent": 

1600 job["JobStatus"] = JobStatus.REMOVED 

1601 else: 

1602 _LOG.debug("Unknown log event type: %s", job["MyType"]) 

1603 except KeyError: 

1604 _LOG.error("Missing key in job: %s", job) 

1605 raise 

1606 

1607 

1608def htc_check_dagman_output(wms_path): 

1609 """Check the DAGMan output for error messages. 

1610 

1611 Parameters 

1612 ---------- 

1613 wms_path : `str` 

1614 Directory containing the DAGman output file. 

1615 

1616 Returns 

1617 ------- 

1618 message : `str` 

1619 Message containing error messages from the DAGMan output. Empty 

1620 string if no messages. 

1621 

1622 Raises 

1623 ------ 

1624 FileNotFoundError 

1625 If cannot find DAGMan standard output file in given wms_path. 

1626 """ 

1627 try: 

1628 filename = next(Path(wms_path).glob("*.dag.dagman.out")) 

1629 except StopIteration as exc: 

1630 raise FileNotFoundError(f"DAGMan standard output file not found in {wms_path}") from exc 

1631 _LOG.debug("dag output filename: %s", filename) 

1632 

1633 message = "" 

1634 try: 

1635 with open(filename) as fh: 

1636 last_submit_failed = "" 

1637 for line in fh: 

1638 m = re.match(r"(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) Job submit try \d+/\d+ failed", line) 

1639 if m: 

1640 last_submit_failed = m.group(1) 

1641 if last_submit_failed: 

1642 message = f"Warn: Job submission issues (last: {last_submit_failed})" 

1643 except (OSError, PermissionError): 

1644 message = f"Warn: Could not read dagman output file from {wms_path}." 

1645 return message