Coverage for python/lsst/ctrl/bps/htcondor/lssthtc.py: 13%

600 statements  

coverage.py v7.4.4, created at 2024-04-06 04:21 -0700

1# This file is part of ctrl_bps_htcondor. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <https://www.gnu.org/licenses/>. 

27 

28"""Placeholder HTCondor DAGMan API. 

29 

30There is new work on a Python DAGMan API from HTCondor. However, at this

31time, it tries to make things easier by assuming a DAG is easily broken into

32levels with 1-1 or all-to-all relationships between nodes in adjacent

33levels. LSST workflows are more complicated.

34""" 

35 

36__all__ = [ 

37 "DagStatus", 

38 "JobStatus", 

39 "NodeStatus", 

40 "RestrictedDict", 

41 "HTCJob", 

42 "HTCDag", 

43 "htc_backup_files", 

44 "htc_check_dagman_output", 

45 "htc_create_submit_from_cmd", 

46 "htc_create_submit_from_dag", 

47 "htc_create_submit_from_file", 

48 "htc_escape", 

49 "htc_write_attribs", 

50 "htc_write_condor_file", 

51 "htc_query_history", 

52 "htc_query_present", 

53 "htc_version", 

54 "htc_submit_dag", 

55 "condor_history", 

56 "condor_q", 

57 "condor_search", 

58 "condor_status", 

59 "update_job_info", 

60 "MISSING_ID", 

61 "summary_from_dag", 

62 "read_dag_info", 

63 "read_dag_log", 

64 "read_dag_nodes_log", 

65 "read_dag_status", 

66 "read_node_status", 

67 "write_dag_info", 

68 "pegasus_name_to_label", 

69] 

70 

71 

72import itertools 

73import json 

74import logging 

75import os 

76import pprint 

77import re 

78import subprocess 

79from collections import defaultdict 

80from collections.abc import MutableMapping 

81from datetime import datetime, timedelta 

82from enum import IntEnum 

83from pathlib import Path 

84 

85import classad 

86import htcondor 

87import networkx 

88from packaging import version 

89 

90_LOG = logging.getLogger(__name__) 

91 

92MISSING_ID = -99999 

93 

94 

95class DagStatus(IntEnum): 

96 """HTCondor DAGMan's statuses for a DAG.""" 

97 

98 OK = 0 

99 ERROR = 1 # an error condition different than those listed here 

100 FAILED = 2 # one or more nodes in the DAG have failed 

101 ABORTED = 3 # the DAG has been aborted by an ABORT-DAG-ON specification 

102 REMOVED = 4 # the DAG has been removed by condor_rm 

103 CYCLE = 5 # a cycle was found in the DAG 

104 SUSPENDED = 6 # the DAG has been suspended (see section 2.10.8) 

105 

106 

107class JobStatus(IntEnum): 

108 """HTCondor's statuses for jobs.""" 

109 

110 UNEXPANDED = 0 # Unexpanded 

111 IDLE = 1 # Idle 

112 RUNNING = 2 # Running 

113 REMOVED = 3 # Removed 

114 COMPLETED = 4 # Completed 

115 HELD = 5 # Held 

116 TRANSFERRING_OUTPUT = 6 # Transferring_Output 

117 SUSPENDED = 7 # Suspended 

118 

119 

120class NodeStatus(IntEnum): 

121 """HTCondor's statuses for DAGman nodes.""" 

122 

123 # (STATUS_NOT_READY): At least one parent has not yet finished or the node 

124 # is a FINAL node. 

125 NOT_READY = 0 

126 

127 # (STATUS_READY): All parents have finished, but the node is not yet 

128 # running. 

129 READY = 1 

130 

131 # (STATUS_PRERUN): The node’s PRE script is running. 

132 PRERUN = 2 

133 

134 # (STATUS_SUBMITTED): The node’s HTCondor job(s) are in the queue. 

135 # StatusDetails = "not_idle" -> running. 

136 # JobProcsHeld = 1-> hold. 

137 # JobProcsQueued = 1 -> idle. 

138 SUBMITTED = 3 

139 

140 # (STATUS_POSTRUN): The node’s POST script is running. 

141 POSTRUN = 4 

142 

143 # (STATUS_DONE): The node has completed successfully. 

144 DONE = 5 

145 

146 # (STATUS_ERROR): The node has failed. StatusDetails has info (e.g., 

147 # ULOG_JOB_ABORTED for deleted job). 

148 ERROR = 6 

149 

150 

151HTC_QUOTE_KEYS = {"environment"} 

152HTC_VALID_JOB_KEYS = { 

153 "universe", 

154 "executable", 

155 "arguments", 

156 "environment", 

157 "log", 

158 "error", 

159 "output", 

160 "should_transfer_files", 

161 "when_to_transfer_output", 

162 "getenv", 

163 "notification", 

164 "notify_user", 

165 "concurrency_limit", 

166 "transfer_executable", 

167 "transfer_input_files", 

168 "transfer_output_files", 

169 "request_cpus", 

170 "request_memory", 

171 "request_disk", 

172 "priority", 

173 "category", 

174 "requirements", 

175 "on_exit_hold", 

176 "on_exit_hold_reason", 

177 "on_exit_hold_subcode", 

178 "max_retries", 

179 "periodic_release", 

180 "periodic_remove", 

181 "accounting_group", 

182 "accounting_group_user", 

183} 

184HTC_VALID_JOB_DAG_KEYS = {"vars", "pre", "post", "pre_skip", "retry", "retry_unless_exit", "abort_dag_on", "abort_exit"}

185HTC_VERSION = version.parse(htcondor.__version__) 

186 

187 

188class RestrictedDict(MutableMapping): 

189 """A dictionary that only allows certain keys. 

190 

191 Parameters 

192 ---------- 

193 valid_keys : `Container` 

194 Strings that are valid keys. 

195 init_data : `dict` or `RestrictedDict`, optional 

196 Initial data. 

197 

198 Raises 

199 ------ 

200 KeyError 

201 If invalid key(s) in init_data. 
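
Examples
--------
Editor's sketch of intended use (the keys shown are hypothetical):

>>> d = RestrictedDict({"universe", "executable"})
>>> d["universe"] = "vanilla"
>>> d["cmd"] = "/bin/true"
Traceback (most recent call last):
    ...
KeyError: 'Invalid key cmd'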

202 """ 

203 

204 def __init__(self, valid_keys, init_data=()): 

205 self.valid_keys = valid_keys 

206 self.data = {} 

207 self.update(init_data) 

208 

209 def __getitem__(self, key): 

210 """Return value for given key if exists. 

211 

212 Parameters 

213 ---------- 

214 key : `str` 

215 Identifier for value to return. 

216 

217 Returns 

218 ------- 

219 value : `~collections.abc.Any` 

220 Value associated with given key. 

221 

222 Raises 

223 ------ 

224 KeyError 

225 If key doesn't exist. 

226 """ 

227 return self.data[key] 

228 

229 def __delitem__(self, key): 

230 """Delete value for given key if exists. 

231 

232 Parameters 

233 ---------- 

234 key : `str` 

235 Identifier for value to delete. 

236 

237 Raises 

238 ------ 

239 KeyError 

240 If key doesn't exist. 

241 """ 

242 del self.data[key] 

243 

244 def __setitem__(self, key, value): 

245 """Store key,value in internal dict only if key is valid. 

246 

247 Parameters 

248 ---------- 

249 key : `str` 

250 Identifier to associate with given value. 

251 value : `~collections.abc.Any` 

252 Value to store. 

253 

254 Raises 

255 ------ 

256 KeyError 

257 If key is invalid. 

258 """ 

259 if key not in self.valid_keys: 

260 raise KeyError(f"Invalid key {key}") 

261 self.data[key] = value 

262 

263 def __iter__(self): 

264 return self.data.__iter__() 

265 

266 def __len__(self): 

267 return len(self.data) 

268 

269 def __str__(self): 

270 return str(self.data) 

271 

272 

273def htc_backup_files(wms_path, subdir=None, limit=100): 

274 """Backup select HTCondor files in the submit directory. 

275 

276 Files will be saved in separate subdirectories which will be created in 

277 the submit directory where the files are located. These subdirectories 

278 will be consecutive, zero-padded integers. Their values will correspond to 

279 the number of HTCondor rescue DAGs in the submit directory. 

280 

281 Hence, with the default settings, copies after the initial failed run will 

282 be placed in '001' subdirectory, '002' after the first restart, and so on 

283 until the limit of backups is reached. If there's no rescue DAG yet, files 

284 will be copied to '000' subdirectory. 

285 

286 Parameters 

287 ---------- 

288 wms_path : `str` or `pathlib.Path` 

289 Path to the submit directory either absolute or relative. 

290 subdir : `str` or `pathlib.Path`, optional 

291 A path, relative to the submit directory, where all subdirectories with 

292 backup files will be kept. Defaults to None which means that the backup 

293 subdirectories will be placed directly in the submit directory. 

294 limit : `int`, optional 

295 Maximal number of backups. If the number of backups reaches the limit, 

296 the last backup files will be overwritten. The default value is 100 

297 to match the default value of HTCondor's DAGMAN_MAX_RESCUE_NUM in 

298 version 8.8+. 

299 

300 Raises 

301 ------ 

302 FileNotFoundError 

303 If the submit directory or the file that needs to be backed up does not 

304 exist. 

305 OSError 

306 If the submit directory cannot be accessed or backing up a file failed 

307 either due to permission or filesystem related issues. 

308 

309 Notes 

310 ----- 

311 This is not a generic function for making backups. It is intended to be 

312 used once, just before a restart, to make snapshots of files which will be 

313 overwritten by HTCondor during the next run.

314 """ 

315 width = len(str(limit)) 

316 

317 path = Path(wms_path).resolve() 

318 if not path.is_dir(): 

319 raise FileNotFoundError(f"Directory {path} not found") 

320 

321 # Initialize the backup counter. 

322 rescue_dags = list(Path(wms_path).glob("*.rescue*")) 

323 counter = min(len(rescue_dags), limit) 

324 

325 # Create the backup directory and move select files there. 

326 dest = Path(wms_path) 

327 if subdir: 

328 # PurePath.is_relative_to() is not available before Python 3.9. Hence 

329 # we need to check if 'subdir' is in the submit directory in some other

330 # way if it is an absolute path. 

331 subdir = Path(subdir) 

332 if subdir.is_absolute(): 

333 if dest not in subdir.parents: 

334 _LOG.warning( 

335 "Invalid backup location: '%s' not in the submit directory, will use '%s' instead.", 

336 subdir, 

337 wms_path, 

338 ) 

339 else: 

340 dest /= subdir 

341 else: 

342 dest /= subdir 

343 dest /= f"{counter:0{width}}" 

344 try: 

345 dest.mkdir(parents=True, exist_ok=False if counter < limit else True) 

346 except FileExistsError: 

347 _LOG.warning("Refusing to do backups: target directory '%s' already exists", dest) 

348 else: 

349 for patt in ["*.info.*", "*.dag.metrics", "*.dag.nodes.log", "*.node_status"]: 

350 for source in path.glob(patt): 

351 if source.is_file(): 

352 target = dest / source.relative_to(path) 

353 try: 

354 source.rename(target) 

355 except OSError as exc: 

356 raise type(exc)(f"Backing up '{source}' failed: {exc.strerror}") from None 

357 else: 

358 raise FileNotFoundError(f"Backing up '{source}' failed: not a file") 

359 

360 

361def htc_escape(value): 

362 """Escape characters in given value based upon HTCondor syntax. 

363 

364 Parameters 

365 ---------- 

366 value : `~collections.abc.Any` 

367 Value that needs to have characters escaped if string. 

368 

369 Returns 

370 ------- 

371 new_value : `~collections.abc.Any` 

372 Given value with characters escaped appropriate for HTCondor if string. 
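
Examples
--------
Editor's illustration of the escaping rules:

>>> htc_escape('say "hello"')
'say ""hello""'
>>> htc_escape(42)
42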

373 """ 

374 if isinstance(value, str): 

375 newval = value.replace('"', '""').replace("'", "''").replace("&quot;", '"') 

376 else: 

377 newval = value 

378 

379 return newval 

380 

381 

382def htc_write_attribs(stream, attrs): 

383 """Write job attributes in HTCondor format to writeable stream. 

384 

385 Parameters 

386 ---------- 

387 stream : `~io.TextIOBase` 

388 Output text stream (typically an open file). 

389 attrs : `dict` 

390 HTCondor job attributes (dictionary of attribute key, value). 
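
Examples
--------
Editor's sketch using an in-memory stream:

>>> import io
>>> stream = io.StringIO()
>>> htc_write_attribs(stream, {"bps_run": "demo", "bps_njobs": 3})
>>> print(stream.getvalue(), end="")
+bps_run = "demo"
+bps_njobs = 3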

391 """ 

392 for key, value in attrs.items(): 

393 # Make sure strings are syntactically correct for HTCondor. 

394 if isinstance(value, str): 

395 pval = f'"{htc_escape(value)}"' 

396 else: 

397 pval = value 

398 

399 print(f"+{key} = {pval}", file=stream) 

400 

401 

402def htc_write_condor_file(filename, job_name, job, job_attrs): 

403 """Write an HTCondor submit file. 

404 

405 Parameters 

406 ---------- 

407 filename : `str` 

408 Filename for the HTCondor submit file. 

409 job_name : `str` 

410 Job name to use in submit file. 

411 job : `RestrictedDict` 

412 Submit script information. 

413 job_attrs : `dict` 

414 Job attributes. 

415 """ 

416 os.makedirs(os.path.dirname(filename), exist_ok=True) 

417 with open(filename, "w") as fh: 

418 for key, value in job.items(): 

419 if value is not None: 

420 if key in HTC_QUOTE_KEYS: 

421 print(f'{key}="{htc_escape(value)}"', file=fh) 

422 else: 

423 print(f"{key}={value}", file=fh) 

424 for key in ["output", "error", "log"]: 

425 if key not in job: 

426 filename = f"{job_name}.$(Cluster).${key[:3]}" 

427 print(f"{key}={filename}", file=fh) 

428 

429 if job_attrs is not None: 

430 htc_write_attribs(fh, job_attrs) 

431 print("queue", file=fh) 

432 

433 

434# To avoid doing the version check during every function call select 

435# appropriate conversion function at the import time. 

436# 

437# Make sure that *each* version specific variant of the conversion function(s) 

438# has the same signature after applying any changes! 

439if HTC_VERSION < version.parse("8.9.8"):

440 

441 def htc_tune_schedd_args(**kwargs): 

442 """Ensure that arguments for Schedd are version appropriate. 

443 

444 The old arguments: 'requirements' and 'attr_list' of 

445 'Schedd.history()', 'Schedd.query()', and 'Schedd.xquery()' were 

446 deprecated in favor of 'constraint' and 'projection', respectively, 

447 starting from version 8.9.8. The function will convert "new" keyword 

448 arguments to "old" ones. 

449 

450 Parameters 

451 ---------- 

452 **kwargs 

453 Any keyword arguments that Schedd.history(), Schedd.query(), and 

454 Schedd.xquery() accepts. 

455 

456 Returns 

457 ------- 

458 kwargs : `dict` [`str`, Any] 

459 Keywords arguments that are guaranteed to work with the Python 

460 HTCondor API. 

461 

462 Notes 

463 ----- 

464 Function doesn't validate provided keyword arguments beyond converting 

465 selected arguments to their version specific form. For example, 

466 it won't remove keywords that are not supported by the methods 

467 mentioned earlier. 

468 """ 

469 translation_table = { 

470 "constraint": "requirements", 

471 "projection": "attr_list", 

472 } 

473 for new, old in translation_table.items(): 

474 try: 

475 kwargs[old] = kwargs.pop(new) 

476 except KeyError: 

477 pass 

478 return kwargs 

479 

480else: 

481 

482 def htc_tune_schedd_args(**kwargs): 

483 """Ensure that arguments for Schedd are version appropriate. 

484 

485 This is the fallback function if no version specific alteration are 

486 necessary. Effectively, a no-op. 

487 

488 Parameters 

489 ---------- 

490 **kwargs 

491 Any keyword arguments that Schedd.history(), Schedd.query(), and 

492 Schedd.xquery() accepts. 

493 

494 Returns 

495 ------- 

496 kwargs : `dict` [`str`, Any] 

497 Keywords arguments that were passed to the function. 

498 """ 

499 return kwargs 
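
# Editor's note (illustrative, not part of the original module): under
# HTCondor < 8.9.8 the translation variant above rewrites the "new" keyword
# names to the deprecated ones, e.g.
#     htc_tune_schedd_args(constraint="JobStatus == 2", projection=["Owner"])
# returns {"requirements": "JobStatus == 2", "attr_list": ["Owner"]}; with
# newer bindings the kwargs pass through unchanged.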

500 

501 

502def htc_query_history(schedds, **kwargs): 

503 """Fetch history records from the condor_schedd daemon. 

504 

505 Parameters 

506 ---------- 

507 schedds : `dict` [`str`, `htcondor.Schedd`]

508 HTCondor schedulers to query for job information, keyed by name.

509 **kwargs 

510 Any keyword arguments that Schedd.history() accepts. 

511 

512 Yields 

513 ------ 

514 schedd_name : `str` 

515 Name of the HTCondor scheduler managing the job queue. 

516 job_ad : `dict` [`str`, Any] 

517 A dictionary representing HTCondor ClassAd describing a job. It maps 

518 job attributes names to values of the ClassAd expressions they 

519 represent. 

520 """ 

521 # If not set, provide defaults for positional arguments. 

522 kwargs.setdefault("constraint", None) 

523 kwargs.setdefault("projection", []) 

524 kwargs = htc_tune_schedd_args(**kwargs) 

525 for schedd_name, schedd in schedds.items(): 

526 for job_ad in schedd.history(**kwargs): 

527 yield schedd_name, dict(job_ad) 

528 

529 

530def htc_query_present(schedds, **kwargs): 

531 """Query the condor_schedd daemon for job ads. 

532 

533 Parameters 

534 ---------- 

535 schedds : `dict` [`str`, `htcondor.Schedd`]

536 HTCondor schedulers to query for job information, keyed by name.

537 **kwargs 

538 Any keyword arguments that Schedd.xquery() accepts. 

539 

540 Yields 

541 ------ 

542 schedd_name : `str` 

543 Name of the HTCondor scheduler managing the job queue. 

544 job_ad : `dict` [`str`, Any] 

545 A dictionary representing HTCondor ClassAd describing a job. It maps 

546 job attributes names to values of the ClassAd expressions they 

547 represent. 

548 """ 

549 kwargs = htc_tune_schedd_args(**kwargs) 

550 for schedd_name, schedd in schedds.items(): 

551 for job_ad in schedd.query(**kwargs): 

552 yield schedd_name, dict(job_ad) 
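
# Editor's sketch (requires a live pool; the scheduler name is hypothetical):
#     schedds = {"sched1.example.com": htcondor.Schedd()}
#     for name, ad in htc_query_present(schedds,
#                                       projection=["ClusterId", "JobStatus"]):
#         print(name, ad.get("ClusterId"), ad.get("JobStatus"))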

553 

554 

555def htc_version(): 

556 """Return the version given by the HTCondor API. 

557 

558 Returns 

559 ------- 

560 version : `str` 

561 HTCondor version as an easily comparable string.

562 """ 

563 return str(HTC_VERSION) 

564 

565 

566def htc_submit_dag(sub): 

567 """Submit job for execution. 

568 

569 Parameters 

570 ---------- 

571 sub : `htcondor.Submit` 

572 An object representing a job submit description. 

573 

574 Returns 

575 ------- 

576 schedd_job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

577 Information about the submitted DAGMan job where, for each

578 scheduler, local HTCondor job ids are mapped to their respective

579 classads.

580 """ 

581 coll = htcondor.Collector() 

582 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

583 schedd = htcondor.Schedd(schedd_ad) 

584 

585 # If Schedd.submit() fails, the method will raise an exception. Usually, 

586 # that implies issues with the HTCondor pool which BPS can't address. 

587 # Hence, no effort is made to handle the exception. 

588 submit_result = schedd.submit(sub) 

589 

590 # Sadly, the ClassAd from Schedd.submit() (see above) does not have 

591 # 'GlobalJobId' so we need to run a regular query to get it anyway. 

592 schedd_name = schedd_ad["Name"] 

593 schedd_dag_info = condor_q( 

594 constraint=f"ClusterId == {submit_result.cluster()}", schedds={schedd_name: schedd} 

595 ) 

596 return schedd_dag_info 
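
# Editor's sketch (requires a live schedd; the DAG filename is hypothetical):
#     sub = htc_create_submit_from_dag("demo.dag")
#     schedd_dag_info = htc_submit_dag(sub)
#     schedd_name = next(iter(schedd_dag_info))
#     dag_id = next(iter(schedd_dag_info[schedd_name]))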

597 

598 

599def htc_create_submit_from_dag(dag_filename, submit_options=None): 

600 """Create a DAGMan job submit description. 

601 

602 Parameters 

603 ---------- 

604 dag_filename : `str` 

605 Name of file containing HTCondor DAG commands. 

606 submit_options : `dict` [`str`, Any], optional 

607 Contains extra options for command line (Value of None means flag). 

608 

609 Returns 

610 ------- 

611 sub : `htcondor.Submit` 

612 An object representing a job submit description. 

613 

614 Notes 

615 ----- 

616 Use with HTCondor versions which support htcondor.Submit.from_dag(), 

617 i.e., 8.9.3 or newer. 

618 """ 

619 return htcondor.Submit.from_dag(dag_filename, submit_options) 

620 

621 

622def htc_create_submit_from_cmd(dag_filename, submit_options=None): 

623 """Create a DAGMan job submit description. 

624 

625 Create a DAGMan job submit description by calling ``condor_submit_dag`` 

626 on given DAG description file. 

627 

628 Parameters 

629 ---------- 

630 dag_filename : `str` 

631 Name of file containing HTCondor DAG commands. 

632 submit_options : `dict` [`str`, Any], optional 

633 Contains extra options for command line (Value of None means flag). 

634 

635 Returns 

636 ------- 

637 sub : `htcondor.Submit` 

638 An object representing a job submit description. 

639 

640 Notes 

641 ----- 

642 Use with HTCondor versions which do not support htcondor.Submit.from_dag(), 

643 i.e., older than 8.9.3. 

644 """ 

645 # Build the condor_submit_dag command line.

646 cmd = "condor_submit_dag -f -no_submit -notification never -autorescue 1 -UseDagDir -no_recurse " 

647 

648 if submit_options is not None: 

649 for opt, val in submit_options.items(): 

650 cmd += f" -{opt} {val or ''}" 

651 cmd += f" {dag_filename}"

652 

653 process = subprocess.Popen( 

654 cmd.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding="utf-8" 

655 ) 

656 stdout, _ = process.communicate()

657 

658 if process.returncode != 0:

659 print(f"Exit code: {process.returncode}")

660 print(stdout)

661 raise RuntimeError("Problems running condor_submit_dag") 

662 

663 return htc_create_submit_from_file(f"{dag_filename}.condor.sub") 

664 

665 

666def htc_create_submit_from_file(submit_file): 

667 """Parse a submission file. 

668 

669 Parameters 

670 ---------- 

671 submit_file : `str` 

672 Name of the HTCondor submit file. 

673 

674 Returns 

675 ------- 

676 sub : `htcondor.Submit` 

677 An object representing a job submit description. 

678 """ 

679 descriptors = {} 

680 with open(submit_file) as fh: 

681 for line in fh: 

682 line = line.strip() 

683 if line and not line.startswith("#") and line != "queue":

684 (key, val) = re.split(r"\s*=\s*", line, 1) 

685 descriptors[key] = val 

686 

687 # Avoid UserWarning: the line 'copy_to_spool = False' was 

688 # unused by Submit object. Is it a typo? 

689 try: 

690 del descriptors["copy_to_spool"] 

691 except KeyError: 

692 pass 

693 

694 return htcondor.Submit(descriptors) 

695 

696 

697def _htc_write_job_commands(stream, name, jobs): 

698 """Output the DAGMan job lines for single job in DAG. 

699 

700 Parameters 

701 ---------- 

702 stream : `~io.TextIOBase` 

703 Writeable text stream (typically an opened file). 

704 name : `str` 

705 Job name. 

706 jobs : `RestrictedDict` 

707 DAG job keys and values. 

708 """ 

709 if "pre" in jobs: 

710 print( 

711 f"SCRIPT {jobs['pre'].get('defer', '')} PRE {name}" 

712 f"{jobs['pre']['executable']} {jobs['pre'].get('arguments', '')}", 

713 file=stream, 

714 ) 

715 

716 if "post" in jobs: 

717 print( 

718 f"SCRIPT {jobs['post'].get('defer', '')} PRE {name}" 

719 f"{jobs['post']['executable']} {jobs['post'].get('arguments', '')}", 

720 file=stream, 

721 ) 

722 

723 if "vars" in jobs: 

724 for key, value in jobs["vars"].items():

725 print(f'VARS {name} {key}="{htc_escape(value)}"', file=stream) 

726 

727 if "pre_skip" in jobs: 

728 print(f"PRE_SKIP {name} {jobs['pre_skip']}", file=stream) 

729 

730 if "retry" in jobs and jobs["retry"]: 

731 print(f"RETRY {name} {jobs['retry']} ", end="", file=stream) 

732 if "retry_unless_exit" in jobs: 

733 print(f"UNLESS-EXIT {jobs['retry_unless_exit']}", end="", file=stream) 

734 print("\n", file=stream) 

735 

736 if "abort_dag_on" in jobs and jobs["abort_dag_on"]: 

737 print( 

738 f"ABORT-DAG-ON {name} {jobs['abort_dag_on']['node_exit']}" 

739 f" RETURN {jobs['abort_dag_on']['abort_exit']}", 

740 file=stream, 

741 ) 
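
# Editor's illustration: with name="jobA" and dagcmds
# {"vars": {"qgraph": "a.qgraph"}, "retry": 3}, the helper above emits
#     VARS jobA qgraph="a.qgraph"
#     RETRY jobA 3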

742 

743 

744class HTCJob: 

745 """HTCondor job for use in building DAG. 

746 

747 Parameters 

748 ---------- 

749 name : `str` 

750 Name of the job. 

751 label : `str` 

752 Label that can be used for grouping or lookup.

753 initcmds : `RestrictedDict` 

754 Initial job commands for submit file. 

755 initdagcmds : `RestrictedDict` 

756 Initial commands for job inside DAG. 

757 initattrs : `dict` 

758 Initial dictionary of job attributes. 

759 """ 

760 

761 def __init__(self, name, label=None, initcmds=(), initdagcmds=(), initattrs=None): 

762 self.name = name 

763 self.label = label 

764 self.cmds = RestrictedDict(HTC_VALID_JOB_KEYS, initcmds) 

765 self.dagcmds = RestrictedDict(HTC_VALID_JOB_DAG_KEYS, initdagcmds) 

766 self.attrs = initattrs 

767 self.subfile = None 

768 

769 def __str__(self): 

770 return self.name 

771 

772 def add_job_cmds(self, new_commands): 

773 """Add commands to Job (overwrite existing). 

774 

775 Parameters 

776 ---------- 

777 new_commands : `dict` 

778 Submit file commands to be added to Job. 

779 """ 

780 self.cmds.update(new_commands) 

781 

782 def add_dag_cmds(self, new_commands): 

783 """Add DAG commands to Job (overwrite existing). 

784 

785 Parameters 

786 ---------- 

787 new_commands : `dict` 

788 DAG file commands to be added to Job. 

789 """ 

790 self.dagcmds.update(new_commands) 

791 

792 def add_job_attrs(self, new_attrs): 

793 """Add attributes to Job (overwrite existing). 

794 

795 Parameters 

796 ---------- 

797 new_attrs : `dict` 

798 Attributes to be added to Job. 

799 """ 

800 if self.attrs is None: 

801 self.attrs = {} 

802 if new_attrs: 

803 self.attrs.update(new_attrs) 

804 

805 def write_submit_file(self, submit_path, job_subdir=""): 

806 """Write job description to submit file. 

807 

808 Parameters 

809 ---------- 

810 submit_path : `str` 

811 Prefix path for the submit file. 

812 job_subdir : `str`, optional 

813 Template for job subdir. 

814 """ 

815 if not self.subfile: 

816 self.subfile = f"{self.name}.sub" 

817 job_subdir = job_subdir.format(self=self) 

818 if job_subdir: 

819 self.subfile = os.path.join(job_subdir, self.subfile) 

820 htc_write_condor_file(os.path.join(submit_path, self.subfile), self.name, self.cmds, self.attrs) 

821 

822 def write_dag_commands(self, stream): 

823 """Write DAG commands for single job to output stream. 

824 

825 Parameters 

826 ---------- 

827 stream : `IO` or `str` 

828 Output Stream. 

829 """ 

830 print(f"JOB {self.name} {self.subfile}", file=stream) 

831 _htc_write_job_commands(stream, self.name, self.dagcmds) 

832 

833 def dump(self, fh): 

834 """Dump job information to output stream. 

835 

836 Parameters 

837 ---------- 

838 fh : `~io.TextIOBase` 

839 Output stream. 

840 """ 

841 printer = pprint.PrettyPrinter(indent=4, stream=fh) 

842 printer.pprint(self.name) 

843 printer.pprint(self.cmds) 

844 printer.pprint(self.attrs) 

845 

846 

847class HTCDag(networkx.DiGraph): 

848 """HTCondor DAG. 

849 

850 Parameters 

851 ---------- 

852 data : networkx.DiGraph.data 

853 Initial graph. 

854 name : `str` 

855 Name for DAG. 

856 """ 

857 

858 def __init__(self, data=None, name=""): 

859 super().__init__(data=data, name=name) 

860 

861 self.graph["attr"] = {} 

862 self.graph["run_id"] = None 

863 self.graph["submit_path"] = None 

864 self.graph["final_job"] = None 

865 

866 def __str__(self): 

867 """Represent basic DAG info as string. 

868 

869 Returns 

870 ------- 

871 info : `str` 

872 String containing basic DAG info. 

873 """ 

874 return f"{self.graph['name']} {len(self)}" 

875 

876 def add_attribs(self, attribs=None): 

877 """Add attributes to the DAG. 

878 

879 Parameters 

880 ---------- 

881 attribs : `dict` 

882 DAG attributes. 

883 """ 

884 if attribs is not None: 

885 self.graph["attr"].update(attribs) 

886 

887 def add_job(self, job, parent_names=None, child_names=None): 

888 """Add an HTCJob to the HTCDag. 

889 

890 Parameters 

891 ---------- 

892 job : `HTCJob` 

893 HTCJob to add to the HTCDag. 

894 parent_names : `~collections.abc.Iterable` [`str`], optional 

895 Names of parent jobs. 

896 child_names : `~collections.abc.Iterable` [`str`], optional 

897 Names of child jobs. 

898 """ 

899 assert isinstance(job, HTCJob) 

900 

901 # Add dag level attributes to each job 

902 job.add_job_attrs(self.graph["attr"]) 

903 

904 self.add_node(job.name, data=job) 

905 

906 if parent_names is not None: 

907 self.add_job_relationships(parent_names, [job.name])

908 

909 if child_names is not None: 

910 self.add_job_relationships([job.name], child_names)

911 

912 def add_job_relationships(self, parents, children): 

913 """Add DAG edge between parents and children jobs. 

914 

915 Parameters 

916 ---------- 

917 parents : `list` [`str`] 

918 Contains parent job name(s). 

919 children : `list` [`str`] 

920 Contains children job name(s). 

921 """ 

922 self.add_edges_from(itertools.product(parents, children)) 

923 

924 def add_final_job(self, job): 

925 """Add an HTCJob for the FINAL job in HTCDag. 

926 

927 Parameters 

928 ---------- 

929 job : `HTCJob` 

930 HTCJob to add to the HTCDag as a FINAL job. 

931 """ 

932 # Add dag level attributes to each job 

933 job.add_job_attrs(self.graph["attr"]) 

934 

935 self.graph["final_job"] = job 

936 

937 def del_job(self, job_name): 

938 """Delete the job from the DAG. 

939 

940 Parameters 

941 ---------- 

942 job_name : `str` 

943 Name of job in DAG to delete. 

944 """ 

945 # Reconnect edges around node to delete 

946 parents = self.predecessors(job_name) 

947 children = self.successors(job_name) 

948 self.add_edges_from(itertools.product(parents, children)) 

949 

950 # Delete job node (which deletes its edges). 

951 self.remove_node(job_name) 

952 

953 def write(self, submit_path, job_subdir=""): 

954 """Write DAG to a file. 

955 

956 Parameters 

957 ---------- 

958 submit_path : `str` 

959 Prefix path for dag filename to be combined with DAG name. 

960 job_subdir : `str`, optional 

961 Template for job subdir. 

962 """ 

963 self.graph["submit_path"] = submit_path 

964 self.graph["dag_filename"] = os.path.join(submit_path, f"{self.graph['name']}.dag") 

965 os.makedirs(submit_path, exist_ok=True) 

966 with open(self.graph["dag_filename"], "w") as fh: 

967 for _, nodeval in self.nodes().items(): 

968 job = nodeval["data"] 

969 job.write_submit_file(submit_path, job_subdir) 

970 job.write_dag_commands(fh) 

971 for edge in self.edges(): 

972 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh) 

973 print(f"DOT {self.name}.dot", file=fh) 

974 print(f"NODE_STATUS_FILE {self.name}.node_status", file=fh) 

975 

976 # Add bps attributes to dag submission 

977 for key, value in self.graph["attr"].items(): 

978 print(f'SET_JOB_ATTR {key}= "{htc_escape(value)}"', file=fh) 

979 

980 if self.graph["final_job"]: 

981 job = self.graph["final_job"] 

982 job.write_submit_file(submit_path, job_subdir) 

983 print(f"FINAL {job.name} {job.subfile}", file=fh) 

984 if "pre" in job.dagcmds: 

985 print(f"SCRIPT PRE {job.name} {job.dagcmds['pre']}", file=fh) 

986 if "post" in job.dagcmds: 

987 print(f"SCRIPT POST {job.name} {job.dagcmds['post']}", file=fh) 

988 

989 def dump(self, fh): 

990 """Dump DAG info to output stream. 

991 

992 Parameters 

993 ---------- 

994 fh : `io.IO` or `str` 

995 Where to dump DAG info as text. 

996 """ 

997 for key, value in self.graph.items():

998 print(f"{key}={value}", file=fh) 

999 for name, data in self.nodes().items(): 

1000 print(f"{name}:", file=fh) 

1001 data["data"].dump(fh)

1002 for edge in self.edges(): 

1003 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh) 

1004 if self.graph["final_job"]: 

1005 print(f'FINAL {self.graph["final_job"].name}:', file=fh) 

1006 self.graph["final_job"].dump(fh) 

1007 

1008 def write_dot(self, filename): 

1009 """Write a dot version of the DAG. 

1010 

1011 Parameters 

1012 ---------- 

1013 filename : `str` 

1014 Name of the dot file. 

1015 """ 

1016 pos = networkx.nx_agraph.graphviz_layout(self) 

1017 networkx.draw(self, pos=pos) 

1018 networkx.drawing.nx_pydot.write_dot(self, filename) 
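
# Editor's sketch (illustrative only): composing HTCJob and HTCDag to build a
# two-node DAG where job2 runs after job1.
#     dag = HTCDag(name="demo")
#     dag.add_attribs({"bps_run": "demo"})
#     job1 = HTCJob("job1", label="step1", initcmds={"executable": "/bin/true"})
#     job2 = HTCJob("job2", label="step2", initcmds={"executable": "/bin/true"})
#     dag.add_job(job1)
#     dag.add_job(job2, parent_names=["job1"])
#     dag.write("/tmp/submit", job_subdir="jobs/{self.label}")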

1019 

1020 

1021def condor_q(constraint=None, schedds=None, **kwargs): 

1022 """Get information about the jobs in the HTCondor job queue(s). 

1023 

1024 Parameters 

1025 ---------- 

1026 constraint : `str`, optional 

1027 Constraints to be passed to job query. 

1028 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

1029 HTCondor schedulers to query for job information. If None

1030 (default), the query will be run against the local scheduler only.

1031 **kwargs : `~typing.Any` 

1032 Additional keyword arguments that need to be passed to the internal 

1033 query method. 

1034 

1035 Returns 

1036 ------- 

1037 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

1038 Information about jobs satisfying the search criteria where for each 

1039 Scheduler, local HTCondor job ids are mapped to their respective 

1040 classads. 

1041 """ 

1042 return condor_query(constraint, schedds, htc_query_present, **kwargs) 

1043 

1044 

1045def condor_history(constraint=None, schedds=None, **kwargs): 

1046 """Get information about the jobs from HTCondor history records. 

1047 

1048 Parameters 

1049 ---------- 

1050 constraint : `str`, optional 

1051 Constraints to be passed to job query. 

1052 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

1053 HTCondor schedulers to query for job information. If None

1054 (default), the query will be run against the history file of 

1055 the local scheduler only. 

1056 **kwargs : `~typing.Any` 

1057 Additional keyword arguments that need to be passed to the internal 

1058 query method. 

1059 

1060 Returns 

1061 ------- 

1062 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

1063 Information about jobs satisfying the search criteria where for each 

1064 Scheduler, local HTCondor job ids are mapped to their respective 

1065 classads. 

1066 """ 

1067 return condor_query(constraint, schedds, htc_query_history, **kwargs) 

1068 

1069 

1070def condor_query(constraint=None, schedds=None, query_func=htc_query_present, **kwargs): 

1071 """Get information about HTCondor jobs. 

1072 

1073 Parameters 

1074 ---------- 

1075 constraint : `str`, optional 

1076 Constraints to be passed to job query. 

1077 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

1078 HTCondor schedulers to query for job information. If None

1079 (default), the query will be run against the local scheduler

1080 only.

1081 query_func : callable 

1082 A query function which takes the following arguments:

1083 

1084 - ``schedds``: Schedulers to query (`list` [`htcondor.Schedd`]). 

1085 - ``**kwargs``: Keyword arguments that will be passed to the query 

1086 function. 

1087 **kwargs : `~typing.Any` 

1088 Additional keyword arguments that need to be passed to the query 

1089 method. 

1090 

1091 Returns 

1092 ------- 

1093 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

1094 Information about jobs satisfying the search criteria where for each 

1095 Scheduler, local HTCondor job ids are mapped to their respective 

1096 classads. 

1097 """ 

1098 if not schedds: 

1099 coll = htcondor.Collector() 

1100 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1101 schedds = {schedd_ad["Name"]: htcondor.Schedd(schedd_ad)} 

1102 

1103 # Make sure that 'ClusterId' and 'ProcId' attributes are always included 

1104 # in the job classad. They are needed to construct the job id. 

1105 added_attrs = set() 

1106 if "projection" in kwargs and kwargs["projection"]: 

1107 requested_attrs = set(kwargs["projection"]) 

1108 required_attrs = {"ClusterId", "ProcId"} 

1109 added_attrs = required_attrs - requested_attrs 

1110 for attr in added_attrs: 

1111 kwargs["projection"].append(attr) 

1112 

1113 unwanted_attrs = {"Env", "Environment"} | added_attrs 

1114 job_info = defaultdict(dict) 

1115 for schedd_name, job_ad in query_func(schedds, constraint=constraint, **kwargs): 

1116 id_ = f"{job_ad['ClusterId']}.{job_ad['ProcId']}" 

1117 for attr in set(job_ad) & unwanted_attrs: 

1118 del job_ad[attr] 

1119 job_info[schedd_name][id_] = job_ad 

1120 _LOG.debug("query returned %d jobs", sum(len(val) for val in job_info.values())) 

1121 

1122 # Restore the list of the requested attributes to its original value 

1123 # if needed. 

1124 if added_attrs: 

1125 for attr in added_attrs: 

1126 kwargs["projection"].remove(attr) 

1127 

1128 # When returning the results filter out entries for schedulers with no jobs 

1129 # matching the search criteria. 

1130 return {key: val for key, val in job_info.items() if val} 

1131 

1132 

1133def condor_search(constraint=None, hist=None, schedds=None): 

1134 """Search for running and finished jobs satisfying given criteria. 

1135 

1136 Parameters 

1137 ---------- 

1138 constraint : `str`, optional 

1139 Constraints to be passed to job query. 

1140 hist : `float`, optional

1141 Limit history search to this many days. 

1142 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

1143 HTCondor schedulers to query for job information, keyed by name.

1144 If None (default), only the local scheduler will be queried. 

1145 

1146 Returns 

1147 ------- 

1148 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

1149 Information about jobs satisfying the search criteria where for each 

1150 Scheduler, local HTCondor job ids are mapped to their respective 

1151 classads. 

1152 """ 

1153 if not schedds: 

1154 coll = htcondor.Collector() 

1155 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1156 schedds = {schedd_ad["Name"]: htcondor.Schedd(schedd_ad)}

1157 

1158 job_info = condor_q(constraint=constraint, schedds=schedds) 

1159 if hist is not None: 

1160 epoch = (datetime.now() - timedelta(days=hist)).timestamp() 

1161 constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})" 

1162 hist_info = condor_history(constraint, schedds=schedds) 

1163 update_job_info(job_info, hist_info) 

1164 return job_info 
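
# Editor's sketch (requires a live schedd; the constraint is hypothetical):
#     job_info = condor_search(constraint='bps_run == "demo"', hist=1.0)
#     for schedd_name, jobs in job_info.items():
#         for job_id, ad in jobs.items():
#             print(schedd_name, job_id, ad.get("JobStatus"))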

1165 

1166 

1167def condor_status(constraint=None, coll=None): 

1168 """Get information about HTCondor pool. 

1169 

1170 Parameters 

1171 ---------- 

1172 constraint : `str`, optional 

1173 Constraints to be passed to the query. 

1174 coll : `htcondor.Collector`, optional 

1175 Object representing HTCondor collector daemon. 

1176 

1177 Returns 

1178 ------- 

1179 pool_info : `dict` [`str`, `dict` [`str`, Any]] 

1180 Mapping between HTCondor slot names and slot information (classAds). 

1181 """ 

1182 if coll is None: 

1183 coll = htcondor.Collector() 

1184 try: 

1185 pool_ads = coll.query(constraint=constraint) 

1186 except OSError as ex: 

1187 raise RuntimeError(f"Problem querying the Collector. (Constraint='{constraint}')") from ex 

1188 

1189 pool_info = {} 

1190 for slot in pool_ads: 

1191 pool_info[slot["name"]] = dict(slot) 

1192 _LOG.debug("condor_status returned %d ads", len(pool_info)) 

1193 return pool_info 

1194 

1195 

1196def update_job_info(job_info, other_info): 

1197 """Update results of a job query with results from another query. 

1198 

1199 Parameters 

1200 ---------- 

1201 job_info : `dict` [`str`, `dict` [`str`, Any]] 

1202 Results of the job query that needs to be updated. 

1203 other_info : `dict` [`str`, `dict` [`str`, Any]] 

1204 Results of the other job query. 

1205 

1206 Returns 

1207 ------- 

1208 job_info : `dict` [`str`, `dict` [`str`, Any]] 

1209 The updated results. 
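
Examples
--------
Editor's sketch with toy data:

>>> jobs = {"sched1": {"1.0": {"JobStatus": 2}}}
>>> hist = {"sched1": {"1.0": {"JobStatus": 4}}, "sched2": {"2.0": {}}}
>>> merged = update_job_info(jobs, hist)
>>> merged["sched1"]["1.0"]["JobStatus"]
4
>>> "sched2" in merged
True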

1210 """ 

1211 for schedd_name, others in other_info.items(): 

1212 try: 

1213 jobs = job_info[schedd_name] 

1214 except KeyError: 

1215 job_info[schedd_name] = others 

1216 else: 

1217 for id_, ad in others.items(): 

1218 jobs.setdefault(id_, {}).update(ad) 

1219 return job_info 

1220 

1221 

1222def summary_from_dag(dir_name): 

1223 """Build bps_run_summary string from dag file. 

1224 

1225 Parameters 

1226 ---------- 

1227 dir_name : `str` 

1228 Path that includes dag file for a run. 

1229 

1230 Returns 

1231 ------- 

1232 summary : `str` 

1233 Semicolon-separated list of job labels and counts.

1234 (Same format as saved in dag classad). 

1235 job_name_to_pipetask : `dict` [`str`, `str`] 

1236 Mapping of job names to job labels. 

1237 """ 

1238 dag = next(Path(dir_name).glob("*.dag")) 

1239 

1240 # Later code depends upon insertion order 

1241 counts = defaultdict(int) 

1242 job_name_to_pipetask = {} 

1243 try: 

1244 with open(dag) as fh: 

1245 for line in fh: 

1246 if line.startswith("JOB"): 

1247 m = re.match(r"JOB ([^\s]+) jobs/([^/]+)/", line) 

1248 if m: 

1249 label = m.group(2) 

1250 if label == "init": 

1251 label = "pipetaskInit" 

1252 job_name_to_pipetask[m.group(1)] = label 

1253 counts[label] += 1 

1254 else: # Check if Pegasus submission 

1255 m = re.match(r"JOB ([^\s]+) ([^\s]+)", line) 

1256 if m: 

1257 label = pegasus_name_to_label(m.group(1)) 

1258 job_name_to_pipetask[m.group(1)] = label 

1259 counts[label] += 1 

1260 else: 

1261 _LOG.warning("Parse DAG: unmatched job line: %s", line) 

1262 elif line.startswith("FINAL"): 

1263 m = re.match(r"FINAL ([^\s]+) jobs/([^/]+)/", line) 

1264 if m: 

1265 label = m.group(2) 

1266 job_name_to_pipetask[m.group(1)] = label 

1267 counts[label] += 1 

1268 

1269 except (OSError, PermissionError, StopIteration): 

1270 pass 

1271 

1272 summary = ";".join([f"{name}:{counts[name]}" for name in counts]) 

1273 _LOG.debug("summary_from_dag: %s %s", summary, job_name_to_pipetask) 

1274 return summary, job_name_to_pipetask 
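
# Editor's illustration: for a DAG file containing
#     JOB pipetaskInit jobs/pipetaskInit/pipetaskInit.sub
#     JOB calibrate_1 jobs/calibrate/calibrate_1.sub
# summary_from_dag() returns ("pipetaskInit:1;calibrate:1",
# {"pipetaskInit": "pipetaskInit", "calibrate_1": "calibrate"}).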

1275 

1276 

1277def pegasus_name_to_label(name): 

1278 """Convert pegasus job name to a label for the report. 

1279 

1280 Parameters 

1281 ---------- 

1282 name : `str` 

1283 Name of job. 

1284 

1285 Returns 

1286 ------- 

1287 label : `str` 

1288 Label for job. 
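
Examples
--------
>>> pegasus_name_to_label("pipetask_5_calibrate")
'calibrate'
>>> pegasus_name_to_label("stage_in_local_0")
'pegasus'
>>> pegasus_name_to_label("pipetask_init")
'pipetaskInit'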

1289 """ 

1290 label = "UNK" 

1291 if name.startswith("create_dir") or name.startswith("stage_in") or name.startswith("stage_out"): 

1292 label = "pegasus" 

1293 else: 

1294 m = re.match(r"pipetask_(\d+_)?([^_]+)", name) 

1295 if m: 

1296 label = m.group(2) 

1297 if label == "init": 

1298 label = "pipetaskInit" 

1299 

1300 return label 

1301 

1302 

1303def read_dag_status(wms_path): 

1304 """Read the node status file for DAG summary information. 

1305 

1306 Parameters 

1307 ---------- 

1308 wms_path : `str` 

1309 Path that includes node status file for a run. 

1310 

1311 Returns 

1312 ------- 

1313 dag_ad : `dict` [`str`, Any] 

1314 DAG summary information. 

1315 """ 

1316 dag_ad = {} 

1317 

1318 # While this is probably more up to date than the dag classad, only read

1319 # from the file if we need to.

1320 try: 

1321 try: 

1322 node_stat_file = next(Path(wms_path).glob("*.node_status")) 

1323 _LOG.debug("Reading Node Status File %s", node_stat_file) 

1324 with open(node_stat_file) as infh: 

1325 dag_ad = classad.parseNext(infh) # pylint: disable=E1101 

1326 except StopIteration: 

1327 pass 

1328 

1329 if not dag_ad: 

1330 # Pegasus check here 

1331 try: 

1332 metrics_file = next(Path(wms_path).glob("*.dag.metrics")) 

1333 with open(metrics_file) as infh: 

1334 metrics = json.load(infh) 

1335 dag_ad["NodesTotal"] = metrics.get("jobs", 0) 

1336 dag_ad["NodesFailed"] = metrics.get("jobs_failed", 0) 

1337 dag_ad["NodesDone"] = metrics.get("jobs_succeeded", 0) 

1338 dag_ad["pegasus_version"] = metrics.get("planner_version", "") 

1339 except StopIteration: 

1340 try: 

1341 metrics_file = next(Path(wms_path).glob("*.metrics")) 

1342 with open(metrics_file) as infh: 

1343 metrics = json.load(infh) 

1344 dag_ad["NodesTotal"] = metrics["wf_metrics"]["total_jobs"] 

1345 dag_ad["pegasus_version"] = metrics.get("version", "") 

1346 except StopIteration: 

1347 pass 

1348 except (OSError, PermissionError): 

1349 pass 

1350 

1351 _LOG.debug("read_dag_status: %s", dag_ad) 

1352 return dict(dag_ad) 

1353 

1354 

1355def read_node_status(wms_path): 

1356 """Read entire node status file. 

1357 

1358 Parameters 

1359 ---------- 

1360 wms_path : `str` 

1361 Path that includes node status file for a run. 

1362 

1363 Returns 

1364 ------- 

1365 jobs : `dict` [`str`, `dict` [`str`, Any]]

1366 Information about the DAG nodes mapped to HTCondor job ids.

1367 """ 

1368 # Get jobid info from other places to fill in gaps in info from node_status 

1369 _, job_name_to_pipetask = summary_from_dag(wms_path) 

1370 wms_workflow_id, loginfo = read_dag_log(wms_path) 

1371 loginfo = read_dag_nodes_log(wms_path) 

1372 _LOG.debug("loginfo = %s", loginfo) 

1373 job_name_to_id = {} 

1374 for jid, jinfo in loginfo.items(): 

1375 if "LogNotes" in jinfo: 

1376 m = re.match(r"DAG Node: ([^\s]+)", jinfo["LogNotes"]) 

1377 if m: 

1378 job_name_to_id[m.group(1)] = jid 

1379 jinfo["DAGNodeName"] = m.group(1) 

1380 

1381 try: 

1382 node_status = next(Path(wms_path).glob("*.node_status")) 

1383 except StopIteration: 

1384 return loginfo 

1385 

1386 jobs = {} 

1387 fake_id = -1.0 # For nodes that do not yet have a job id, give fake one 

1388 try: 

1389 with open(node_status) as fh: 

1390 ads = classad.parseAds(fh) 

1391 

1392 for jclassad in ads: 

1393 if jclassad["Type"] == "DagStatus": 

1394 # skip DAG summary 

1395 pass 

1396 elif "Node" not in jclassad: 

1397 if jclassad["Type"] != "StatusEnd": 

1398 _LOG.debug("Key 'Node' not in classad: %s", jclassad) 

1399 break 

1400 else: 

1401 if jclassad["Node"] in job_name_to_pipetask: 

1402 try: 

1403 label = job_name_to_pipetask[jclassad["Node"]] 

1404 except KeyError: 

1405 _LOG.error("%s not in %s", jclassad["Node"], job_name_to_pipetask.keys()) 

1406 raise 

1407 elif "_" in jclassad["Node"]: 

1408 label = jclassad["Node"].split("_")[1] 

1409 else: 

1410 label = jclassad["Node"] 

1411 

1412 # Make job info as if came from condor_q 

1413 if jclassad["Node"] in job_name_to_id: 

1414 job_id = job_name_to_id[jclassad["Node"]] 

1415 else: 

1416 job_id = str(fake_id) 

1417 fake_id -= 1 

1418 

1419 job = dict(jclassad) 

1420 job["ClusterId"] = int(float(job_id)) 

1421 job["DAGManJobID"] = wms_workflow_id 

1422 job["DAGNodeName"] = jclassad["Node"] 

1423 job["bps_job_label"] = label 

1424 

1425 jobs[str(job_id)] = job 

1426 except (OSError, PermissionError): 

1427 pass 

1428 

1429 return jobs 

1430 

1431 

1432def read_dag_log(wms_path): 

1433 """Read job information from the DAGMan log file. 

1434 

1435 Parameters 

1436 ---------- 

1437 wms_path : `str` 

1438 Path containing the DAGMan log file. 

1439 

1440 Returns 

1441 ------- 

1442 wms_workflow_id : `str` 

1443 HTCondor job id (i.e., <ClusterId>.<ProcId>) of the DAGMan job. 

1444 dag_info : `dict` [`str`, `~collections.abc.Any`] 

1445 HTCondor job information read from the log file mapped to HTCondor 

1446 job id. 

1447 

1448 Raises 

1449 ------ 

1450 FileNotFoundError 

1451 If cannot find DAGMan log in given wms_path. 

1452 """ 

1453 wms_workflow_id = 0 

1454 dag_info = {} 

1455 

1456 path = Path(wms_path) 

1457 if path.exists(): 

1458 try: 

1459 filename = next(path.glob("*.dag.dagman.log")) 

1460 except StopIteration as exc: 

1461 raise FileNotFoundError(f"DAGMan log not found in {wms_path}") from exc 

1462 _LOG.debug("dag node log filename: %s", filename) 

1463 

1464 info = {} 

1465 job_event_log = htcondor.JobEventLog(str(filename)) 

1466 for event in job_event_log.events(stop_after=0): 

1467 id_ = f"{event['Cluster']}.{event['Proc']}" 

1468 if id_ not in info: 

1469 info[id_] = {} 

1470 wms_workflow_id = id_ # taking last job id in case of restarts 

1471 info[id_].update(event) 

1472 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"] 

1473 

1474 # only save latest DAG job 

1475 dag_info = {wms_workflow_id: info[wms_workflow_id]} 

1476 for job in dag_info.values(): 

1477 _tweak_log_info(filename, job) 

1478 

1479 return wms_workflow_id, dag_info 

1480 

1481 

1482def read_dag_nodes_log(wms_path): 

1483 """Read job information from the DAGMan nodes log file. 

1484 

1485 Parameters 

1486 ---------- 

1487 wms_path : `str` 

1488 Path containing the DAGMan nodes log file. 

1489 

1490 Returns 

1491 ------- 

1492 info : `dict` [`str`, Any] 

1493 HTCondor job information read from the log file mapped to HTCondor 

1494 job id. 

1495 

1496 Raises 

1497 ------ 

1498 FileNotFoundError 

1499 If cannot find DAGMan node log in given wms_path. 

1500 """ 

1501 try: 

1502 filename = next(Path(wms_path).glob("*.dag.nodes.log")) 

1503 except StopIteration as exc: 

1504 raise FileNotFoundError(f"DAGMan node log not found in {wms_path}") from exc 

1505 _LOG.debug("dag node log filename: %s", filename) 

1506 

1507 info = {} 

1508 job_event_log = htcondor.JobEventLog(str(filename)) 

1509 for event in job_event_log.events(stop_after=0): 

1510 id_ = f"{event['Cluster']}.{event['Proc']}" 

1511 if id_ not in info: 

1512 info[id_] = {} 

1513 info[id_].update(event) 

1514 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"] 

1515 

1516 # Add more condor_q-like info to info parsed from log file. 

1517 for job in info.values(): 

1518 _tweak_log_info(filename, job) 

1519 

1520 return info 
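
# Editor's sketch (path hypothetical):
#     info = read_dag_nodes_log("/path/to/submit")
#     for job_id, ad in info.items():
#         print(job_id, ad.get("JobStatus"), ad.get("LogNotes"))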

1521 

1522 

1523def read_dag_info(wms_path): 

1524 """Read custom DAGMan job information from the file. 

1525 

1526 Parameters 

1527 ---------- 

1528 wms_path : `str` 

1529 Path containing the file with the DAGMan job info. 

1530 

1531 Returns 

1532 ------- 

1533 dag_info : `dict` [`str`, `dict` [`str`, Any]] 

1534 HTCondor job information. 

1535 

1536 Raises 

1537 ------ 

1538 FileNotFoundError 

1539 If cannot find DAGMan job info file in the given location. 

1540 """ 

1541 try: 

1542 filename = next(Path(wms_path).glob("*.info.json")) 

1543 except StopIteration as exc: 

1544 raise FileNotFoundError(f"File with DAGMan job information not found in {wms_path}") from exc 

1545 _LOG.debug("DAGMan job information filename: %s", filename) 

1546 try: 

1547 with open(filename) as fh: 

1548 dag_info = json.load(fh) 

1549 except (OSError, PermissionError) as exc: 

1550 _LOG.debug("Retrieving DAGMan job information failed: %s", exc) 

1551 dag_info = {} 

1552 return dag_info 

1553 

1554 

1555def write_dag_info(filename, dag_info): 

1556 """Write custom job information about DAGMan job. 

1557 

1558 Parameters 

1559 ---------- 

1560 filename : `str` 

1561 Name of the file where the information will be stored. 

1562 dag_info : `dict` [`str` `dict` [`str`, Any]] 

1563 Information about the DAGMan job. 

1564 """ 

1565 schedd_name = next(iter(dag_info)) 

1566 dag_id = next(iter(dag_info[schedd_name])) 

1567 dag_ad = dag_info[schedd_name][dag_id] 

1568 try: 

1569 with open(filename, "w") as fh: 

1570 info = { 

1571 schedd_name: { 

1572 dag_id: {"ClusterId": dag_ad["ClusterId"], "GlobalJobId": dag_ad["GlobalJobId"]} 

1573 } 

1574 } 

1575 json.dump(info, fh) 

1576 except (KeyError, OSError, PermissionError) as exc: 

1577 _LOG.debug("Persisting DAGMan job information failed: %s", exc) 

1578 

1579 

1580def _tweak_log_info(filename, job): 

1581 """Massage the given job info has same structure as if came from condor_q. 

1582 

1583 Parameters 

1584 ---------- 

1585 filename : `pathlib.Path` 

1586 Name of the DAGMan log. 

1587 job : `dict` [ `str`, Any ] 

1588 Information about a single HTCondor job read from the log.

1590 """ 

1591 _LOG.debug("_tweak_log_info: %s %s", filename, job) 

1592 try: 

1593 job["ClusterId"] = job["Cluster"] 

1594 job["ProcId"] = job["Proc"] 

1595 job["Iwd"] = str(filename.parent) 

1596 job["Owner"] = filename.owner() 

1597 if job["MyType"] == "ExecuteEvent": 

1598 job["JobStatus"] = JobStatus.RUNNING 

1599 elif job["MyType"] == "JobTerminatedEvent" or job["MyType"] == "PostScriptTerminatedEvent": 

1600 job["JobStatus"] = JobStatus.COMPLETED 

1601 try: 

1602 if not job["TerminatedNormally"]: 

1603 if "ReturnValue" in job: 

1604 job["ExitCode"] = job["ReturnValue"] 

1605 job["ExitBySignal"] = False 

1606 elif "TerminatedBySignal" in job: 

1607 job["ExitBySignal"] = True 

1608 job["ExitSignal"] = job["TerminatedBySignal"] 

1609 else: 

1610 _LOG.warning("Could not determine exit status for completed job: %s", job) 

1611 except KeyError as ex: 

1612 _LOG.error("Could not determine exit status for job (missing %s): %s", str(ex), job) 

1613 elif job["MyType"] == "SubmitEvent": 

1614 job["JobStatus"] = JobStatus.IDLE 

1615 elif job["MyType"] == "JobAbortedEvent": 

1616 job["JobStatus"] = JobStatus.REMOVED 

1617 else: 

1618 _LOG.debug("Unknown log event type: %s", job["MyType"]) 

1619 except KeyError: 

1620 _LOG.error("Missing key in job: %s", job) 

1621 raise 

1622 

1623 

1624def htc_check_dagman_output(wms_path): 

1625 """Check the DAGMan output for error messages. 

1626 

1627 Parameters 

1628 ---------- 

1629 wms_path : `str` 

1630 Directory containing the DAGman output file. 

1631 

1632 Returns 

1633 ------- 

1634 message : `str` 

1635 Message containing error messages from the DAGMan output. Empty 

1636 string if no messages. 

1637 

1638 Raises 

1639 ------ 

1640 FileNotFoundError 

1641 If cannot find DAGMan standard output file in given wms_path. 

1642 """ 

1643 try: 

1644 filename = next(Path(wms_path).glob("*.dag.dagman.out")) 

1645 except StopIteration as exc: 

1646 raise FileNotFoundError(f"DAGMan standard output file not found in {wms_path}") from exc 

1647 _LOG.debug("dag output filename: %s", filename) 

1648 

1649 message = "" 

1650 try: 

1651 with open(filename) as fh: 

1652 last_submit_failed = "" 

1653 for line in fh: 

1654 m = re.match(r"(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) Job submit try \d+/\d+ failed", line) 

1655 if m: 

1656 last_submit_failed = m.group(1) 

1657 if last_submit_failed: 

1658 message = f"Warn: Job submission issues (last: {last_submit_failed})" 

1659 except (OSError, PermissionError): 

1660 message = f"Warn: Could not read dagman output file from {wms_path}." 

1661 return message