Coverage for python/lsst/ctrl/bps/htcondor/lssthtc.py: 13%

605 statements  

coverage.py v7.5.1, created at 2024-05-11 03:48 -0700

1# This file is part of ctrl_bps_htcondor. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <https://www.gnu.org/licenses/>. 

27 

28"""Placeholder HTCondor DAGMan API. 

29 

30There is new work on a Python DAGMan API from HTCondor. However, at this

31time, it tries to make things easier by assuming the DAG can easily be broken

32into levels where there are 1-1 or all-to-all relationships to nodes in the

33next level. LSST workflows are more complicated.

34""" 

35 

36__all__ = [ 

37 "DagStatus", 

38 "JobStatus", 

39 "NodeStatus", 

40 "RestrictedDict", 

41 "HTCJob", 

42 "HTCDag", 

43 "htc_backup_files", 

44 "htc_check_dagman_output", 

45 "htc_create_submit_from_cmd", 

46 "htc_create_submit_from_dag", 

47 "htc_create_submit_from_file", 

48 "htc_escape", 

49 "htc_write_attribs", 

50 "htc_write_condor_file", 

51 "htc_query_history", 

52 "htc_query_present", 

53 "htc_version", 

54 "htc_submit_dag", 

55 "condor_history", 

56 "condor_q", 

57 "condor_search", 

58 "condor_status", 

59 "update_job_info", 

60 "MISSING_ID", 

61 "summary_from_dag", 

62 "read_dag_info", 

63 "read_dag_log", 

64 "read_dag_nodes_log", 

65 "read_dag_status", 

66 "read_node_status", 

67 "write_dag_info", 

68 "pegasus_name_to_label", 

69] 

70 

71 

72import itertools 

73import json 

74import logging 

75import os 

76import pprint 

77import re 

78import subprocess 

79from collections import defaultdict 

80from collections.abc import MutableMapping 

81from datetime import datetime, timedelta 

82from enum import IntEnum 

83from pathlib import Path 

84 

85import classad 

86import htcondor 

87import networkx 

88from packaging import version 

89 

90from .handlers import HTC_JOB_AD_HANDLERS 

91 

92_LOG = logging.getLogger(__name__) 

93 

94MISSING_ID = -99999 

95 

96 

97class DagStatus(IntEnum): 

98 """HTCondor DAGMan's statuses for a DAG.""" 

99 

100 OK = 0 

101 ERROR = 1 # an error condition different than those listed here 

102 FAILED = 2 # one or more nodes in the DAG have failed 

103 ABORTED = 3 # the DAG has been aborted by an ABORT-DAG-ON specification 

104 REMOVED = 4 # the DAG has been removed by condor_rm 

105 CYCLE = 5 # a cycle was found in the DAG 

106 SUSPENDED = 6 # the DAG has been suspended (see section 2.10.8) 

107 

108 

109class JobStatus(IntEnum): 

110 """HTCondor's statuses for jobs.""" 

111 

112 UNEXPANDED = 0 # Unexpanded 

113 IDLE = 1 # Idle 

114 RUNNING = 2 # Running 

115 REMOVED = 3 # Removed 

116 COMPLETED = 4 # Completed 

117 HELD = 5 # Held 

118 TRANSFERRING_OUTPUT = 6 # Transferring_Output 

119 SUSPENDED = 7 # Suspended 

120 

121 

122class NodeStatus(IntEnum): 

123 """HTCondor's statuses for DAGman nodes.""" 

124 

125 # (STATUS_NOT_READY): At least one parent has not yet finished or the node 

126 # is a FINAL node. 

127 NOT_READY = 0 

128 

129 # (STATUS_READY): All parents have finished, but the node is not yet 

130 # running. 

131 READY = 1 

132 

133 # (STATUS_PRERUN): The node’s PRE script is running. 

134 PRERUN = 2 

135 

136 # (STATUS_SUBMITTED): The node’s HTCondor job(s) are in the queue. 

137 # StatusDetails = "not_idle" -> running. 

138 # JobProcsHeld = 1-> hold. 

139 # JobProcsQueued = 1 -> idle. 

140 SUBMITTED = 3 

141 

142 # (STATUS_POSTRUN): The node’s POST script is running. 

143 POSTRUN = 4 

144 

145 # (STATUS_DONE): The node has completed successfully. 

146 DONE = 5 

147 

148 # (STATUS_ERROR): The node has failed. StatusDetails has info (e.g., 

149 # ULOG_JOB_ABORTED for deleted job). 

150 ERROR = 6 

151 

152 # (STATUS_FUTILE): The node will never run because ancestor node failed. 

153 FUTILE = 7 

154 

155 

156HTC_QUOTE_KEYS = {"environment"} 

157HTC_VALID_JOB_KEYS = { 

158 "universe", 

159 "executable", 

160 "arguments", 

161 "environment", 

162 "log", 

163 "error", 

164 "output", 

165 "should_transfer_files", 

166 "when_to_transfer_output", 

167 "getenv", 

168 "notification", 

169 "notify_user", 

170 "concurrency_limit", 

171 "transfer_executable", 

172 "transfer_input_files", 

173 "transfer_output_files", 

174 "request_cpus", 

175 "request_memory", 

176 "request_disk", 

177 "priority", 

178 "category", 

179 "requirements", 

180 "on_exit_hold", 

181 "on_exit_hold_reason", 

182 "on_exit_hold_subcode", 

183 "max_retries", 

184 "periodic_release", 

185 "periodic_remove", 

186 "accounting_group", 

187 "accounting_group_user", 

188} 

189HTC_VALID_JOB_DAG_KEYS = {"vars", "pre", "post", "retry", "retry_unless_exit", "abort_dag_on", "abort_exit"} 

190HTC_VERSION = version.parse(htcondor.__version__) 

191 

192 

193class RestrictedDict(MutableMapping): 

194 """A dictionary that only allows certain keys. 

195 

196 Parameters 

197 ---------- 

198 valid_keys : `Container` 

199 Strings that are valid keys. 

200 init_data : `dict` or `RestrictedDict`, optional 

201 Initial data. 

202 

203 Raises 

204 ------ 

205 KeyError 

206 If invalid key(s) in init_data. 

207 """ 

208 

209 def __init__(self, valid_keys, init_data=()): 

210 self.valid_keys = valid_keys 

211 self.data = {} 

212 self.update(init_data) 

213 

214 def __getitem__(self, key): 

215 """Return value for given key if exists. 

216 

217 Parameters 

218 ---------- 

219 key : `str` 

220 Identifier for value to return. 

221 

222 Returns 

223 ------- 

224 value : `~collections.abc.Any` 

225 Value associated with given key. 

226 

227 Raises 

228 ------ 

229 KeyError 

230 If key doesn't exist. 

231 """ 

232 return self.data[key] 

233 

234 def __delitem__(self, key): 

235 """Delete value for given key if exists. 

236 

237 Parameters 

238 ---------- 

239 key : `str` 

240 Identifier for value to delete. 

241 

242 Raises 

243 ------ 

244 KeyError 

245 If key doesn't exist. 

246 """ 

247 del self.data[key] 

248 

249 def __setitem__(self, key, value): 

250 """Store key,value in internal dict only if key is valid. 

251 

252 Parameters 

253 ---------- 

254 key : `str` 

255 Identifier to associate with given value. 

256 value : `~collections.abc.Any` 

257 Value to store. 

258 

259 Raises 

260 ------ 

261 KeyError 

262 If key is invalid. 

263 """ 

264 if key not in self.valid_keys: 

265 raise KeyError(f"Invalid key {key}") 

266 self.data[key] = value 

267 

268 def __iter__(self): 

269 return self.data.__iter__() 

270 

271 def __len__(self): 

272 return len(self.data) 

273 

274 def __str__(self): 

275 return str(self.data) 

276 

277 
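# Illustrative sketch (not part of the original module): how RestrictedDict
# behaves with the key sets defined above. Only keys listed in 'valid_keys'
# may be set; anything else raises KeyError.
#
#     >>> cmds = RestrictedDict(HTC_VALID_JOB_KEYS, {"universe": "vanilla"})
#     >>> cmds["executable"] = "/usr/bin/env"    # allowed key
#     >>> cmds["not_a_submit_key"] = 1           # raises KeyError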

278def htc_backup_files(wms_path, subdir=None, limit=100): 

279 """Backup select HTCondor files in the submit directory. 

280 

281 Files will be saved in separate subdirectories which will be created in 

282 the submit directory where the files are located. These subdirectories 

283 will be consecutive, zero-padded integers. Their values will correspond to 

284 the number of HTCondor rescue DAGs in the submit directory. 

285 

286 Hence, with the default settings, copies after the initial failed run will 

287 be placed in '001' subdirectory, '002' after the first restart, and so on 

288 until the limit of backups is reached. If there's no rescue DAG yet, files 

289 will be copied to '000' subdirectory. 

290 

291 Parameters 

292 ---------- 

293 wms_path : `str` or `pathlib.Path` 

294 Path to the submit directory either absolute or relative. 

295 subdir : `str` or `pathlib.Path`, optional 

296 A path, relative to the submit directory, where all subdirectories with 

297 backup files will be kept. Defaults to None which means that the backup 

298 subdirectories will be placed directly in the submit directory. 

299 limit : `int`, optional 

300 Maximal number of backups. If the number of backups reaches the limit, 

301 the last backup files will be overwritten. The default value is 100 

302 to match the default value of HTCondor's DAGMAN_MAX_RESCUE_NUM in 

303 version 8.8+. 

304 

305 Raises 

306 ------ 

307 FileNotFoundError 

308 If the submit directory or the file that needs to be backed up does not 

309 exist. 

310 OSError 

311 If the submit directory cannot be accessed or backing up a file failed 

312 either due to permission or filesystem related issues. 

313 

314 Notes 

315 ----- 

316 This is not a generic function for making backups. It is intended to be 

317 used once, just before a restart, to make snapshots of files which will be 

318 # overwritten by HTCondor during the next run.

319 """ 

320 width = len(str(limit)) 

321 

322 path = Path(wms_path).resolve() 

323 if not path.is_dir(): 

324 raise FileNotFoundError(f"Directory {path} not found") 

325 

326 # Initialize the backup counter. 

327 rescue_dags = list(Path(wms_path).glob("*.rescue*")) 

328 counter = min(len(rescue_dags), limit) 

329 

330 # Create the backup directory and move select files there. 

331 dest = Path(wms_path) 

332 if subdir: 

333 # PurePath.is_relative_to() is not available before Python 3.9. Hence 

334 # we need to check if 'subdir' is in the submit directory in some other

335 # way if it is an absolute path. 

336 subdir = Path(subdir) 

337 if subdir.is_absolute(): 

338 if dest not in subdir.parents: 

339 _LOG.warning( 

340 "Invalid backup location: '%s' not in the submit directory, will use '%s' instead.", 

341 subdir, 

342 wms_path, 

343 ) 

344 else: 

345 dest /= subdir 

346 else: 

347 dest /= subdir 

348 dest /= f"{counter:0{width}}" 

349 try: 

350 dest.mkdir(parents=True, exist_ok=False if counter < limit else True) 

351 except FileExistsError: 

352 _LOG.warning("Refusing to do backups: target directory '%s' already exists", dest) 

353 else: 

354 for patt in ["*.info.*", "*.dag.metrics", "*.dag.nodes.log", "*.node_status"]: 

355 for source in path.glob(patt): 

356 if source.is_file(): 

357 target = dest / source.relative_to(path) 

358 try: 

359 source.rename(target) 

360 except OSError as exc: 

361 raise type(exc)(f"Backing up '{source}' failed: {exc.strerror}") from None 

362 else: 

363 raise FileNotFoundError(f"Backing up '{source}' failed: not a file") 

364 

365 

366def htc_escape(value): 

367 """Escape characters in given value based upon HTCondor syntax. 

368 

369 Parameters 

370 ---------- 

371 value : `~collections.abc.Any` 

372 Value that needs to have characters escaped if string. 

373 

374 Returns 

375 ------- 

376 new_value : `~collections.abc.Any` 

377 Given value with characters escaped appropriate for HTCondor if string. 

378 """ 

379 if isinstance(value, str): 

380 newval = value.replace('"', '""').replace("'", "''").replace("&quot;", '"') 

381 else: 

382 newval = value 

383 

384 return newval 

385 

386 
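# Illustrative sketch (not part of the original module): htc_escape doubles
# quote characters as required by HTCondor submit-file syntax and passes
# non-string values through unchanged.
#
#     >>> htc_escape('say "hello"')
#     'say ""hello""'
#     >>> htc_escape(42)
#     42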

387def htc_write_attribs(stream, attrs): 

388 """Write job attributes in HTCondor format to writeable stream. 

389 

390 Parameters 

391 ---------- 

392 stream : `~io.TextIOBase` 

393 Output text stream (typically an open file). 

394 attrs : `dict` 

395 HTCondor job attributes (dictionary of attribute key, value). 

396 """ 

397 for key, value in attrs.items(): 

398 # Make sure strings are syntactically correct for HTCondor. 

399 if isinstance(value, str): 

400 pval = f'"{htc_escape(value)}"' 

401 else: 

402 pval = value 

403 

404 print(f"+{key} = {pval}", file=stream) 

405 

406 
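# Illustrative sketch (not part of the original module): the lines
# htc_write_attribs emits for a small attribute dictionary (the attribute
# names below are arbitrary examples). String values are quoted and escaped;
# other values are written verbatim.
#
#     >>> import sys
#     >>> htc_write_attribs(sys.stdout, {"bps_run": "my_run", "bps_job_count": 3})
#     +bps_run = "my_run"
#     +bps_job_count = 3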

407def htc_write_condor_file(filename, job_name, job, job_attrs): 

408 """Write an HTCondor submit file. 

409 

410 Parameters 

411 ---------- 

412 filename : `str` 

413 Filename for the HTCondor submit file. 

414 job_name : `str` 

415 Job name to use in submit file. 

416 job : `RestrictedDict` 

417 Submit script information. 

418 job_attrs : `dict` 

419 Job attributes. 

420 """ 

421 os.makedirs(os.path.dirname(filename), exist_ok=True) 

422 with open(filename, "w") as fh: 

423 for key, value in job.items(): 

424 if value is not None: 

425 if key in HTC_QUOTE_KEYS: 

426 print(f'{key}="{htc_escape(value)}"', file=fh) 

427 else: 

428 print(f"{key}={value}", file=fh) 

429 for key in ["output", "error", "log"]: 

430 if key not in job: 

431 filename = f"{job_name}.$(Cluster).${key[:3]}" 

432 print(f"{key}={filename}", file=fh) 

433 

434 if job_attrs is not None: 

435 htc_write_attribs(fh, job_attrs) 

436 print("queue", file=fh) 

437 

438 

439# To avoid doing the version check during every function call select 

440# appropriate conversion function at the import time. 

441# 

442# Make sure that *each* version specific variant of the conversion function(s) 

443# has the same signature after applying any changes! 

444if HTC_VERSION < version.parse("8.9.8"):

445 

446 def htc_tune_schedd_args(**kwargs): 

447 """Ensure that arguments for Schedd are version appropriate. 

448 

449 The old arguments: 'requirements' and 'attr_list' of 

450 'Schedd.history()', 'Schedd.query()', and 'Schedd.xquery()' were 

451 deprecated in favor of 'constraint' and 'projection', respectively, 

452 starting from version 8.9.8. The function will convert "new" keyword 

453 arguments to "old" ones. 

454 

455 Parameters 

456 ---------- 

457 **kwargs 

458 Any keyword arguments that Schedd.history(), Schedd.query(), and 

459 Schedd.xquery() accepts. 

460 

461 Returns 

462 ------- 

463 kwargs : `dict` [`str`, Any] 

464 Keywords arguments that are guaranteed to work with the Python 

465 HTCondor API. 

466 

467 Notes 

468 ----- 

469 Function doesn't validate provided keyword arguments beyond converting 

470 selected arguments to their version specific form. For example, 

471 it won't remove keywords that are not supported by the methods 

472 mentioned earlier. 

473 """ 

474 translation_table = { 

475 "constraint": "requirements", 

476 "projection": "attr_list", 

477 } 

478 for new, old in translation_table.items(): 

479 try: 

480 kwargs[old] = kwargs.pop(new) 

481 except KeyError: 

482 pass 

483 return kwargs 

484 

485else: 

486 

487 def htc_tune_schedd_args(**kwargs): 

488 """Ensure that arguments for Schedd are version appropriate. 

489 

490 This is the fallback function if no version specific alterations are

491 necessary. Effectively, a no-op. 

492 

493 Parameters 

494 ---------- 

495 **kwargs 

496 Any keyword arguments that Schedd.history(), Schedd.query(), and 

497 Schedd.xquery() accepts. 

498 

499 Returns 

500 ------- 

501 kwargs : `dict` [`str`, Any] 

502 Keywords arguments that were passed to the function. 

503 """ 

504 return kwargs 

505 

506 
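# Illustrative sketch (not part of the original module): what the selected
# htc_tune_schedd_args variant does with the "new" Schedd keyword arguments.
#
#     >>> htc_tune_schedd_args(constraint="JobStatus == 2", projection=["ClusterId"])
#     # pre-8.9.8:  {'requirements': 'JobStatus == 2', 'attr_list': ['ClusterId']}
#     # 8.9.8+:     {'constraint': 'JobStatus == 2', 'projection': ['ClusterId']}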

507def htc_query_history(schedds, **kwargs): 

508 """Fetch history records from the condor_schedd daemon. 

509 

510 Parameters 

511 ---------- 

512 schedds : `dict` [`str`, `htcondor.Schedd`]

513 HTCondor schedulers to query for job information.

514 **kwargs 

515 Any keyword arguments that Schedd.history() accepts. 

516 

517 Yields 

518 ------ 

519 schedd_name : `str` 

520 Name of the HTCondor scheduler managing the job queue. 

521 job_ad : `dict` [`str`, Any] 

522 A dictionary representing HTCondor ClassAd describing a job. It maps 

523 job attribute names to values of the ClassAd expressions they

524 represent. 

525 """ 

526 # If not set, provide defaults for positional arguments. 

527 kwargs.setdefault("constraint", None) 

528 kwargs.setdefault("projection", []) 

529 kwargs = htc_tune_schedd_args(**kwargs) 

530 for schedd_name, schedd in schedds.items(): 

531 for job_ad in schedd.history(**kwargs): 

532 yield schedd_name, dict(job_ad) 

533 

534 

535def htc_query_present(schedds, **kwargs): 

536 """Query the condor_schedd daemon for job ads. 

537 

538 Parameters 

539 ---------- 

540 schedds : `dict` [`str`, `htcondor.Schedd`]

541 HTCondor schedulers to query for job information.

542 **kwargs 

543 Any keyword arguments that Schedd.xquery() accepts. 

544 

545 Yields 

546 ------ 

547 schedd_name : `str` 

548 Name of the HTCondor scheduler managing the job queue. 

549 job_ad : `dict` [`str`, Any] 

550 A dictionary representing HTCondor ClassAd describing a job. It maps 

551 job attribute names to values of the ClassAd expressions they

552 represent. 

553 """ 

554 kwargs = htc_tune_schedd_args(**kwargs) 

555 for schedd_name, schedd in schedds.items(): 

556 for job_ad in schedd.query(**kwargs): 

557 yield schedd_name, dict(job_ad) 

558 

559 

560def htc_version(): 

561 """Return the version given by the HTCondor API. 

562 

563 Returns 

564 ------- 

565 version : `str` 

566 HTCondor version as easily comparable string. 

567 """ 

568 return str(HTC_VERSION) 

569 

570 

571def htc_submit_dag(sub): 

572 """Submit job for execution. 

573 

574 Parameters 

575 ---------- 

576 sub : `htcondor.Submit` 

577 An object representing a job submit description. 

578 

579 Returns 

580 ------- 

581 schedd_job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

582 Information about jobs satisfying the search criteria where for each 

583 Scheduler, local HTCondor job ids are mapped to their respective 

584 classads. 

585 """ 

586 coll = htcondor.Collector() 

587 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

588 schedd = htcondor.Schedd(schedd_ad) 

589 

590 # If Schedd.submit() fails, the method will raise an exception. Usually, 

591 # that implies issues with the HTCondor pool which BPS can't address. 

592 # Hence, no effort is made to handle the exception. 

593 submit_result = schedd.submit(sub) 

594 

595 # Sadly, the ClassAd from Schedd.submit() (see above) does not have 

596 # 'GlobalJobId' so we need to run a regular query to get it anyway. 

597 schedd_name = schedd_ad["Name"] 

598 schedd_dag_info = condor_q( 

599 constraint=f"ClusterId == {submit_result.cluster()}", schedds={schedd_name: schedd} 

600 ) 

601 return schedd_dag_info 

602 

603 

604def htc_create_submit_from_dag(dag_filename, submit_options=None): 

605 """Create a DAGMan job submit description. 

606 

607 Parameters 

608 ---------- 

609 dag_filename : `str` 

610 Name of file containing HTCondor DAG commands. 

611 submit_options : `dict` [`str`, Any], optional 

612 Contains extra options for command line (Value of None means flag). 

613 

614 Returns 

615 ------- 

616 sub : `htcondor.Submit` 

617 An object representing a job submit description. 

618 

619 Notes 

620 ----- 

621 Use with HTCondor versions which support htcondor.Submit.from_dag(), 

622 i.e., 8.9.3 or newer. 

623 """ 

624 return htcondor.Submit.from_dag(dag_filename, submit_options) 

625 

626 
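# Illustrative sketch (not part of the original module): creating a DAGMan
# submit description from an existing DAG file and handing it to
# htc_submit_dag(). The file name is a made-up example; the submit options
# follow what condor_submit_dag/Submit.from_dag() accept.
#
#     >>> sub = htc_create_submit_from_dag("u_user_my_run.dag", {"maxjobs": 1000})
#     >>> schedd_dag_info = htc_submit_dag(sub)
#     >>> # {schedd_name: {"<ClusterId>.<ProcId>": {...DAGMan job classad...}}}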

627def htc_create_submit_from_cmd(dag_filename, submit_options=None): 

628 """Create a DAGMan job submit description. 

629 

630 Create a DAGMan job submit description by calling ``condor_submit_dag`` 

631 on given DAG description file. 

632 

633 Parameters 

634 ---------- 

635 dag_filename : `str` 

636 Name of file containing HTCondor DAG commands. 

637 submit_options : `dict` [`str`, Any], optional 

638 Contains extra options for command line (Value of None means flag). 

639 

640 Returns 

641 ------- 

642 sub : `htcondor.Submit` 

643 An object representing a job submit description. 

644 

645 Notes 

646 ----- 

647 Use with HTCondor versions which do not support htcondor.Submit.from_dag(), 

648 i.e., older than 8.9.3. 

649 """ 

650 # Run command line condor_submit_dag command. 

651 cmd = "condor_submit_dag -f -no_submit -notification never -autorescue 1 -UseDagDir -no_recurse " 

652 

653 if submit_options is not None: 

654 for opt, val in submit_options.items(): 

655 cmd += f" -{opt} {val or ''}" 

656 cmd += f" {dag_filename}"

657 

658 process = subprocess.Popen( 

659 cmd.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding="utf-8" 

660 ) 

661 process.wait() 

662 

663 if process.returncode != 0: 

664 print(f"Exit code: {process.returncode}") 

665 print(process.communicate()[0]) 

666 raise RuntimeError("Problems running condor_submit_dag") 

667 

668 return htc_create_submit_from_file(f"{dag_filename}.condor.sub") 

669 

670 

671def htc_create_submit_from_file(submit_file): 

672 """Parse a submission file. 

673 

674 Parameters 

675 ---------- 

676 submit_file : `str` 

677 Name of the HTCondor submit file. 

678 

679 Returns 

680 ------- 

681 sub : `htcondor.Submit` 

682 An object representing a job submit description. 

683 """ 

684 descriptors = {} 

685 with open(submit_file) as fh: 

686 for line in fh: 

687 line = line.strip() 

688 if line and not line.startswith("#") and line != "queue":

689 (key, val) = re.split(r"\s*=\s*", line, 1) 

690 descriptors[key] = val 

691 

692 # Avoid UserWarning: the line 'copy_to_spool = False' was 

693 # unused by Submit object. Is it a typo? 

694 try: 

695 del descriptors["copy_to_spool"] 

696 except KeyError: 

697 pass 

698 

699 return htcondor.Submit(descriptors) 

700 

701 

702def _htc_write_job_commands(stream, name, jobs): 

703 """Output the DAGMan job lines for single job in DAG. 

704 

705 Parameters 

706 ---------- 

707 stream : `~io.TextIOBase` 

708 Writeable text stream (typically an opened file). 

709 name : `str` 

710 Job name. 

711 jobs : `RestrictedDict` 

712 DAG job keys and values. 

713 """ 

714 if "pre" in jobs: 

715 print( 

716 f"SCRIPT {jobs['pre'].get('defer', '')} PRE {name}" 

717 f"{jobs['pre']['executable']} {jobs['pre'].get('arguments', '')}", 

718 file=stream, 

719 ) 

720 

721 if "post" in jobs: 

722 print( 

723 f"SCRIPT {jobs['post'].get('defer', '')} PRE {name}" 

724 f"{jobs['post']['executable']} {jobs['post'].get('arguments', '')}", 

725 file=stream, 

726 ) 

727 

728 if "vars" in jobs: 

729 for key, value in jobs["vars"]: 

730 print(f'VARS {name} {key}="{htc_escape(value)}"', file=stream) 

731 

732 if "pre_skip" in jobs: 

733 print(f"PRE_SKIP {name} {jobs['pre_skip']}", file=stream) 

734 

735 if "retry" in jobs and jobs["retry"]: 

736 print(f"RETRY {name} {jobs['retry']} ", end="", file=stream) 

737 if "retry_unless_exit" in jobs: 

738 print(f"UNLESS-EXIT {jobs['retry_unless_exit']}", end="", file=stream) 

739 print("\n", file=stream) 

740 

741 if "abort_dag_on" in jobs and jobs["abort_dag_on"]: 

742 print( 

743 f"ABORT-DAG-ON {name} {jobs['abort_dag_on']['node_exit']}" 

744 f" RETURN {jobs['abort_dag_on']['abort_exit']}", 

745 file=stream, 

746 ) 

747 

748 

749class HTCJob: 

750 """HTCondor job for use in building DAG. 

751 

752 Parameters 

753 ---------- 

754 name : `str` 

755 Name of the job. 

756 label : `str` 

757 Label that can be used for grouping or lookup.

758 initcmds : `RestrictedDict` 

759 Initial job commands for submit file. 

760 initdagcmds : `RestrictedDict` 

761 Initial commands for job inside DAG. 

762 initattrs : `dict` 

763 Initial dictionary of job attributes. 

764 """ 

765 

766 def __init__(self, name, label=None, initcmds=(), initdagcmds=(), initattrs=None): 

767 self.name = name 

768 self.label = label 

769 self.cmds = RestrictedDict(HTC_VALID_JOB_KEYS, initcmds) 

770 self.dagcmds = RestrictedDict(HTC_VALID_JOB_DAG_KEYS, initdagcmds) 

771 self.attrs = initattrs 

772 self.subfile = None 

773 

774 def __str__(self): 

775 return self.name 

776 

777 def add_job_cmds(self, new_commands): 

778 """Add commands to Job (overwrite existing). 

779 

780 Parameters 

781 ---------- 

782 new_commands : `dict` 

783 Submit file commands to be added to Job. 

784 """ 

785 self.cmds.update(new_commands) 

786 

787 def add_dag_cmds(self, new_commands): 

788 """Add DAG commands to Job (overwrite existing). 

789 

790 Parameters 

791 ---------- 

792 new_commands : `dict` 

793 DAG file commands to be added to Job. 

794 """ 

795 self.dagcmds.update(new_commands) 

796 

797 def add_job_attrs(self, new_attrs): 

798 """Add attributes to Job (overwrite existing). 

799 

800 Parameters 

801 ---------- 

802 new_attrs : `dict` 

803 Attributes to be added to Job. 

804 """ 

805 if self.attrs is None: 

806 self.attrs = {} 

807 if new_attrs: 

808 self.attrs.update(new_attrs) 

809 

810 def write_submit_file(self, submit_path, job_subdir=""): 

811 """Write job description to submit file. 

812 

813 Parameters 

814 ---------- 

815 submit_path : `str` 

816 Prefix path for the submit file. 

817 job_subdir : `str`, optional 

818 Template for job subdir. 

819 """ 

820 if not self.subfile: 

821 self.subfile = f"{self.name}.sub" 

822 job_subdir = job_subdir.format(self=self) 

823 if job_subdir: 

824 self.subfile = os.path.join(job_subdir, self.subfile) 

825 htc_write_condor_file(os.path.join(submit_path, self.subfile), self.name, self.cmds, self.attrs) 

826 

827 def write_dag_commands(self, stream): 

828 """Write DAG commands for single job to output stream. 

829 

830 Parameters 

831 ---------- 

832 stream : `IO` or `str` 

833 Output Stream. 

834 """ 

835 print(f"JOB {self.name} {self.subfile}", file=stream) 

836 _htc_write_job_commands(stream, self.name, self.dagcmds) 

837 

838 def dump(self, fh): 

839 """Dump job information to output stream. 

840 

841 Parameters 

842 ---------- 

843 fh : `~io.TextIOBase` 

844 Output stream. 

845 """ 

846 printer = pprint.PrettyPrinter(indent=4, stream=fh) 

847 printer.pprint(self.name) 

848 printer.pprint(self.cmds) 

849 printer.pprint(self.attrs) 

850 

851 
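# Illustrative sketch (not part of the original module): assembling a single
# HTCJob. The values are arbitrary examples, but the keys must appear in
# HTC_VALID_JOB_KEYS / HTC_VALID_JOB_DAG_KEYS or RestrictedDict raises KeyError.
#
#     >>> job = HTCJob("job1", label="calibrate")
#     >>> job.add_job_cmds({"universe": "vanilla", "executable": "run.sh",
#     ...                   "request_memory": "2048"})
#     >>> job.add_dag_cmds({"retry": 3})
#     >>> job.add_job_attrs({"bps_job_label": "calibrate"})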

852class HTCDag(networkx.DiGraph): 

853 """HTCondor DAG. 

854 

855 Parameters 

856 ---------- 

857 data : networkx.DiGraph.data 

858 Initial graph. 

859 name : `str` 

860 Name for DAG. 

861 """ 

862 

863 def __init__(self, data=None, name=""): 

864 super().__init__(data=data, name=name) 

865 

866 self.graph["attr"] = {} 

867 self.graph["run_id"] = None 

868 self.graph["submit_path"] = None 

869 self.graph["final_job"] = None 

870 

871 def __str__(self): 

872 """Represent basic DAG info as string. 

873 

874 Returns 

875 ------- 

876 info : `str` 

877 String containing basic DAG info. 

878 """ 

879 return f"{self.graph['name']} {len(self)}" 

880 

881 def add_attribs(self, attribs=None): 

882 """Add attributes to the DAG. 

883 

884 Parameters 

885 ---------- 

886 attribs : `dict` 

887 DAG attributes. 

888 """ 

889 if attribs is not None: 

890 self.graph["attr"].update(attribs) 

891 

892 def add_job(self, job, parent_names=None, child_names=None): 

893 """Add an HTCJob to the HTCDag. 

894 

895 Parameters 

896 ---------- 

897 job : `HTCJob` 

898 HTCJob to add to the HTCDag. 

899 parent_names : `~collections.abc.Iterable` [`str`], optional 

900 Names of parent jobs. 

901 child_names : `~collections.abc.Iterable` [`str`], optional 

902 Names of child jobs. 

903 """ 

904 assert isinstance(job, HTCJob) 

905 

906 # Add dag level attributes to each job 

907 job.add_job_attrs(self.graph["attr"]) 

908 

909 self.add_node(job.name, data=job) 

910 

911 if parent_names is not None: 

912 self.add_job_relationships(parent_names, [job.name])

913 

914 if child_names is not None: 

915 self.add_job_relationships([job.name], child_names)

916 

917 def add_job_relationships(self, parents, children): 

918 """Add DAG edge between parents and children jobs. 

919 

920 Parameters 

921 ---------- 

922 parents : `list` [`str`] 

923 Contains parent job name(s). 

924 children : `list` [`str`] 

925 Contains children job name(s). 

926 """ 

927 self.add_edges_from(itertools.product(parents, children)) 

928 

929 def add_final_job(self, job): 

930 """Add an HTCJob for the FINAL job in HTCDag. 

931 

932 Parameters 

933 ---------- 

934 job : `HTCJob` 

935 HTCJob to add to the HTCDag as a FINAL job. 

936 """ 

937 # Add dag level attributes to each job 

938 job.add_job_attrs(self.graph["attr"]) 

939 

940 self.graph["final_job"] = job 

941 

942 def del_job(self, job_name): 

943 """Delete the job from the DAG. 

944 

945 Parameters 

946 ---------- 

947 job_name : `str` 

948 Name of job in DAG to delete. 

949 """ 

950 # Reconnect edges around node to delete 

951 parents = self.predecessors(job_name) 

952 children = self.successors(job_name) 

953 self.add_edges_from(itertools.product(parents, children)) 

954 

955 # Delete job node (which deletes its edges). 

956 self.remove_node(job_name) 

957 

958 def write(self, submit_path, job_subdir=""): 

959 """Write DAG to a file. 

960 

961 Parameters 

962 ---------- 

963 submit_path : `str` 

964 Prefix path for dag filename to be combined with DAG name. 

965 job_subdir : `str`, optional 

966 Template for job subdir. 

967 """ 

968 self.graph["submit_path"] = submit_path 

969 self.graph["dag_filename"] = os.path.join(submit_path, f"{self.graph['name']}.dag") 

970 os.makedirs(submit_path, exist_ok=True) 

971 with open(self.graph["dag_filename"], "w") as fh: 

972 for _, nodeval in self.nodes().items(): 

973 job = nodeval["data"] 

974 job.write_submit_file(submit_path, job_subdir) 

975 job.write_dag_commands(fh) 

976 for edge in self.edges(): 

977 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh) 

978 print(f"DOT {self.name}.dot", file=fh) 

979 print(f"NODE_STATUS_FILE {self.name}.node_status", file=fh) 

980 

981 # Add bps attributes to dag submission 

982 for key, value in self.graph["attr"].items(): 

983 print(f'SET_JOB_ATTR {key}= "{htc_escape(value)}"', file=fh) 

984 

985 if self.graph["final_job"]: 

986 job = self.graph["final_job"] 

987 job.write_submit_file(submit_path, job_subdir) 

988 print(f"FINAL {job.name} {job.subfile}", file=fh) 

989 if "pre" in job.dagcmds: 

990 print(f"SCRIPT PRE {job.name} {job.dagcmds['pre']}", file=fh) 

991 if "post" in job.dagcmds: 

992 print(f"SCRIPT POST {job.name} {job.dagcmds['post']}", file=fh) 

993 

994 def dump(self, fh): 

995 """Dump DAG info to output stream. 

996 

997 Parameters 

998 ---------- 

999 fh : `io.IO` or `str` 

1000 Where to dump DAG info as text. 

1001 """ 

1002 for key, value in self.graph.items():

1003 print(f"{key}={value}", file=fh)

1004 for name, nodeval in self.nodes().items():

1005 print(f"{name}:", file=fh)

1006 nodeval["data"].dump(fh)

1007 for edge in self.edges(): 

1008 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh) 

1009 if self.graph["final_job"]: 

1010 print(f'FINAL {self.graph["final_job"].name}:', file=fh) 

1011 self.graph["final_job"].dump(fh) 

1012 

1013 def write_dot(self, filename): 

1014 """Write a dot version of the DAG. 

1015 

1016 Parameters 

1017 ---------- 

1018 filename : `str` 

1019 Name of the dot file. 

1020 """ 

1021 pos = networkx.nx_agraph.graphviz_layout(self) 

1022 networkx.draw(self, pos=pos) 

1023 networkx.drawing.nx_pydot.write_dot(self, filename) 

1024 

1025 
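# Illustrative sketch (not part of the original module): building a two-node
# DAG from HTCJob objects (e.g., 'job1' and 'job2' from the HTCJob example
# above) and writing the *.dag plus per-job *.sub files to a hypothetical
# submit directory.
#
#     >>> dag = HTCDag(name="u_user_my_run")
#     >>> dag.add_attribs({"bps_run": "my_run"})
#     >>> dag.add_job(job1)
#     >>> dag.add_job(job2)
#     >>> dag.add_job_relationships([job1.name], [job2.name])
#     >>> dag.write("/path/to/submit", "jobs/{self.label}")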

1026def condor_q(constraint=None, schedds=None, **kwargs): 

1027 """Get information about the jobs in the HTCondor job queue(s). 

1028 

1029 Parameters 

1030 ---------- 

1031 constraint : `str`, optional 

1032 Constraints to be passed to job query. 

1033 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

1034 HTCondor schedulers to query for job information. If None

1035 (default), the query will be run against the local scheduler only.

1036 **kwargs : `~typing.Any` 

1037 Additional keyword arguments that need to be passed to the internal 

1038 query method. 

1039 

1040 Returns 

1041 ------- 

1042 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

1043 Information about jobs satisfying the search criteria where for each 

1044 Scheduler, local HTCondor job ids are mapped to their respective 

1045 classads. 

1046 """ 

1047 return condor_query(constraint, schedds, htc_query_present, **kwargs) 

1048 

1049 

1050def condor_history(constraint=None, schedds=None, **kwargs): 

1051 """Get information about the jobs from HTCondor history records. 

1052 

1053 Parameters 

1054 ---------- 

1055 constraint : `str`, optional 

1056 Constraints to be passed to job query. 

1057 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

1058 HTCondor schedulers which to query for job information. If None 

1059 (default), the query will be run against the history file of 

1060 the local scheduler only. 

1061 **kwargs : `~typing.Any` 

1062 Additional keyword arguments that need to be passed to the internal 

1063 query method. 

1064 

1065 Returns 

1066 ------- 

1067 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

1068 Information about jobs satisfying the search criteria where for each 

1069 Scheduler, local HTCondor job ids are mapped to their respective 

1070 classads. 

1071 """ 

1072 return condor_query(constraint, schedds, htc_query_history, **kwargs) 

1073 

1074 

1075def condor_query(constraint=None, schedds=None, query_func=htc_query_present, **kwargs): 

1076 """Get information about HTCondor jobs. 

1077 

1078 Parameters 

1079 ---------- 

1080 constraint : `str`, optional 

1081 Constraints to be passed to job query. 

1082 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

1083 HTCondor schedulers which to query for job information. If None 

1084 (default), the query will be run against the history file of 

1085 the local scheduler only. 

1086 query_func : callable 

1087 A query function which takes the following arguments:

1088

1089 - ``schedds``: Schedulers to query (`dict` [`str`, `htcondor.Schedd`]).

1090 - ``**kwargs``: Keyword arguments that will be passed to the query 

1091 function. 

1092 **kwargs : `~typing.Any` 

1093 Additional keyword arguments that need to be passed to the query 

1094 method. 

1095 

1096 Returns 

1097 ------- 

1098 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

1099 Information about jobs satisfying the search criteria where for each 

1100 Scheduler, local HTCondor job ids are mapped to their respective 

1101 classads. 

1102 """ 

1103 if not schedds: 

1104 coll = htcondor.Collector() 

1105 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1106 schedds = {schedd_ad["Name"]: htcondor.Schedd(schedd_ad)} 

1107 

1108 # Make sure that 'ClusterId' and 'ProcId' attributes are always included 

1109 # in the job classad. They are needed to construct the job id. 

1110 added_attrs = set() 

1111 if "projection" in kwargs and kwargs["projection"]: 

1112 requested_attrs = set(kwargs["projection"]) 

1113 required_attrs = {"ClusterId", "ProcId"} 

1114 added_attrs = required_attrs - requested_attrs 

1115 for attr in added_attrs: 

1116 kwargs["projection"].append(attr) 

1117 

1118 unwanted_attrs = {"Env", "Environment"} | added_attrs 

1119 job_info = defaultdict(dict) 

1120 for schedd_name, job_ad in query_func(schedds, constraint=constraint, **kwargs): 

1121 id_ = f"{job_ad['ClusterId']}.{job_ad['ProcId']}" 

1122 for attr in set(job_ad) & unwanted_attrs: 

1123 del job_ad[attr] 

1124 job_info[schedd_name][id_] = job_ad 

1125 _LOG.debug("query returned %d jobs", sum(len(val) for val in job_info.values())) 

1126 

1127 # Restore the list of the requested attributes to its original value 

1128 # if needed. 

1129 if added_attrs: 

1130 for attr in added_attrs: 

1131 kwargs["projection"].remove(attr) 

1132 

1133 # When returning the results filter out entries for schedulers with no jobs 

1134 # matching the search criteria. 

1135 return {key: val for key, val in job_info.items() if val} 

1136 

1137 
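# Illustrative sketch (not part of the original module): querying the local
# scheduler for the jobs of one workflow. The constraint and projection are
# ordinary HTCondor ClassAd attribute names, chosen here purely as examples.
#
#     >>> jobs = condor_q(constraint="DAGManJobId == 1234",
#     ...                 projection=["JobStatus", "HoldReason"])
#     >>> # {"schedd.example.org": {"1235.0": {"ClusterId": 1235, "ProcId": 0,
#     >>> #                                    "JobStatus": 2, ...}, ...}}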

1138def condor_search(constraint=None, hist=None, schedds=None): 

1139 """Search for running and finished jobs satisfying given criteria. 

1140 

1141 Parameters 

1142 ---------- 

1143 constraint : `str`, optional 

1144 Constraints to be passed to job query. 

1145 hist : `float` 

1146 Limit history search to this many days. 

1147 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

1148 HTCondor schedulers to query for job information.

1149 If None (default), only the local scheduler will be queried. 

1150 

1151 Returns 

1152 ------- 

1153 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

1154 Information about jobs satisfying the search criteria where for each 

1155 Scheduler, local HTCondor job ids are mapped to their respective 

1156 classads. 

1157 """ 

1158 if not schedds: 

1159 coll = htcondor.Collector() 

1160 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1161 schedds = {schedd_ad["Name"]: htcondor.Schedd(locate_ad=schedd_ad)} 

1162 

1163 job_info = condor_q(constraint=constraint, schedds=schedds) 

1164 if hist is not None: 

1165 epoch = (datetime.now() - timedelta(days=hist)).timestamp() 

1166 constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})" 

1167 hist_info = condor_history(constraint, schedds=schedds) 

1168 update_job_info(job_info, hist_info) 

1169 return job_info 

1170 

1171 

1172def condor_status(constraint=None, coll=None): 

1173 """Get information about HTCondor pool. 

1174 

1175 Parameters 

1176 ---------- 

1177 constraint : `str`, optional 

1178 Constraints to be passed to the query. 

1179 coll : `htcondor.Collector`, optional 

1180 Object representing HTCondor collector daemon. 

1181 

1182 Returns 

1183 ------- 

1184 pool_info : `dict` [`str`, `dict` [`str`, Any]] 

1185 Mapping between HTCondor slot names and slot information (classAds). 

1186 """ 

1187 if coll is None: 

1188 coll = htcondor.Collector() 

1189 try: 

1190 pool_ads = coll.query(constraint=constraint) 

1191 except OSError as ex: 

1192 raise RuntimeError(f"Problem querying the Collector. (Constraint='{constraint}')") from ex 

1193 

1194 pool_info = {} 

1195 for slot in pool_ads: 

1196 pool_info[slot["name"]] = dict(slot) 

1197 _LOG.debug("condor_status returned %d ads", len(pool_info)) 

1198 return pool_info 

1199 

1200 

1201def update_job_info(job_info, other_info): 

1202 """Update results of a job query with results from another query. 

1203 

1204 Parameters 

1205 ---------- 

1206 job_info : `dict` [`str`, `dict` [`str`, Any]] 

1207 Results of the job query that needs to be updated. 

1208 other_info : `dict` [`str`, `dict` [`str`, Any]] 

1209 Results of the other job query. 

1210 

1211 Returns 

1212 ------- 

1213 job_info : `dict` [`str`, `dict` [`str`, Any]] 

1214 The updated results. 

1215 """ 

1216 for schedd_name, others in other_info.items(): 

1217 try: 

1218 jobs = job_info[schedd_name] 

1219 except KeyError: 

1220 job_info[schedd_name] = others 

1221 else: 

1222 for id_, ad in others.items(): 

1223 jobs.setdefault(id_, {}).update(ad) 

1224 return job_info 

1225 
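# Illustrative sketch (not part of the original module): merging queue and
# history query results for the same workflow, as condor_search() does above.
# Ads present in both results are updated in place; history-only ads are added.
#
#     >>> queue_info = condor_q(constraint="DAGManJobId == 1234")
#     >>> hist_info = condor_history(constraint="DAGManJobId == 1234")
#     >>> all_info = update_job_info(queue_info, hist_info)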

1226 

1227def summary_from_dag(dir_name): 

1228 """Build bps_run_summary string from dag file. 

1229 

1230 Parameters 

1231 ---------- 

1232 dir_name : `str` 

1233 Path that includes dag file for a run. 

1234 

1235 Returns 

1236 ------- 

1237 summary : `str` 

1238 Semi-colon separated list of job labels and counts. 

1239 (Same format as saved in dag classad). 

1240 job_name_to_pipetask : `dict` [`str`, `str`] 

1241 Mapping of job names to job labels. 

1242 """ 

1243 dag = next(Path(dir_name).glob("*.dag")) 

1244 

1245 # Later code depends upon insertion order 

1246 counts = defaultdict(int) 

1247 job_name_to_pipetask = {} 

1248 try: 

1249 with open(dag) as fh: 

1250 for line in fh: 

1251 if line.startswith("JOB"): 

1252 m = re.match(r"JOB ([^\s]+) jobs/([^/]+)/", line) 

1253 if m: 

1254 label = m.group(2) 

1255 if label == "init": 

1256 label = "pipetaskInit" 

1257 job_name_to_pipetask[m.group(1)] = label 

1258 counts[label] += 1 

1259 else: # Check if Pegasus submission 

1260 m = re.match(r"JOB ([^\s]+) ([^\s]+)", line) 

1261 if m: 

1262 label = pegasus_name_to_label(m.group(1)) 

1263 job_name_to_pipetask[m.group(1)] = label 

1264 counts[label] += 1 

1265 else: 

1266 _LOG.warning("Parse DAG: unmatched job line: %s", line) 

1267 elif line.startswith("FINAL"): 

1268 m = re.match(r"FINAL ([^\s]+) jobs/([^/]+)/", line) 

1269 if m: 

1270 label = m.group(2) 

1271 job_name_to_pipetask[m.group(1)] = label 

1272 counts[label] += 1 

1273 

1274 except (OSError, PermissionError, StopIteration): 

1275 pass 

1276 

1277 summary = ";".join([f"{name}:{counts[name]}" for name in counts]) 

1278 _LOG.debug("summary_from_dag: %s %s", summary, job_name_to_pipetask) 

1279 return summary, job_name_to_pipetask 

1280 

1281 

1282def pegasus_name_to_label(name): 

1283 """Convert pegasus job name to a label for the report. 

1284 

1285 Parameters 

1286 ---------- 

1287 name : `str` 

1288 Name of job. 

1289 

1290 Returns 

1291 ------- 

1292 label : `str` 

1293 Label for job. 

1294 """ 

1295 label = "UNK" 

1296 if name.startswith("create_dir") or name.startswith("stage_in") or name.startswith("stage_out"): 

1297 label = "pegasus" 

1298 else: 

1299 m = re.match(r"pipetask_(\d+_)?([^_]+)", name) 

1300 if m: 

1301 label = m.group(2) 

1302 if label == "init": 

1303 label = "pipetaskInit" 

1304 

1305 return label 

1306 
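# Illustrative sketch (not part of the original module): labels produced by
# pegasus_name_to_label for a few representative Pegasus job names.
#
#     >>> pegasus_name_to_label("pipetask_5_calibrate")
#     'calibrate'
#     >>> pegasus_name_to_label("pipetask_init")
#     'pipetaskInit'
#     >>> pegasus_name_to_label("stage_in_local_local_0_0")
#     'pegasus'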

1307 

1308def read_dag_status(wms_path): 

1309 """Read the node status file for DAG summary information. 

1310 

1311 Parameters 

1312 ---------- 

1313 wms_path : `str` 

1314 Path that includes node status file for a run. 

1315 

1316 Returns 

1317 ------- 

1318 dag_ad : `dict` [`str`, Any] 

1319 DAG summary information. 

1320 """ 

1321 dag_ad = {} 

1322 

1323 # While this is probably more up to date than dag classad, only read from 

1324 # file if need to. 

1325 try: 

1326 try: 

1327 node_stat_file = next(Path(wms_path).glob("*.node_status")) 

1328 _LOG.debug("Reading Node Status File %s", node_stat_file) 

1329 with open(node_stat_file) as infh: 

1330 dag_ad = classad.parseNext(infh) # pylint: disable=E1101 

1331 except StopIteration: 

1332 pass 

1333 

1334 if not dag_ad: 

1335 # Pegasus check here 

1336 try: 

1337 metrics_file = next(Path(wms_path).glob("*.dag.metrics")) 

1338 with open(metrics_file) as infh: 

1339 metrics = json.load(infh) 

1340 dag_ad["NodesTotal"] = metrics.get("jobs", 0) 

1341 dag_ad["NodesFailed"] = metrics.get("jobs_failed", 0) 

1342 dag_ad["NodesDone"] = metrics.get("jobs_succeeded", 0) 

1343 dag_ad["pegasus_version"] = metrics.get("planner_version", "") 

1344 except StopIteration: 

1345 try: 

1346 metrics_file = next(Path(wms_path).glob("*.metrics")) 

1347 with open(metrics_file) as infh: 

1348 metrics = json.load(infh) 

1349 dag_ad["NodesTotal"] = metrics["wf_metrics"]["total_jobs"] 

1350 dag_ad["pegasus_version"] = metrics.get("version", "") 

1351 except StopIteration: 

1352 pass 

1353 except (OSError, PermissionError): 

1354 pass 

1355 

1356 _LOG.debug("read_dag_status: %s", dag_ad) 

1357 return dict(dag_ad) 

1358 

1359 

1360def read_node_status(wms_path): 

1361 """Read entire node status file. 

1362 

1363 Parameters 

1364 ---------- 

1365 wms_path : `str` 

1366 Path that includes node status file for a run. 

1367 

1368 Returns 

1369 ------- 

1370 jobs : `dict` [`str`, Any] 

1371 DAG summary information compiled from the node status file combined 

1372 with the information found in the node event log. 

1373 

1374 Currently, if the same job attribute is found in both files, its value 

1375 from the event log takes precedence over the value from the node status 

1376 file. 

1377 """ 

1378 # Get jobid info from other places to fill in gaps in info from node_status 

1379 _, job_name_to_pipetask = summary_from_dag(wms_path) 

1380 wms_workflow_id, loginfo = read_dag_log(wms_path) 

1381 loginfo = read_dag_nodes_log(wms_path) 

1382 _LOG.debug("loginfo = %s", loginfo) 

1383 job_name_to_id = {} 

1384 for jid, jinfo in loginfo.items(): 

1385 if "LogNotes" in jinfo: 

1386 m = re.match(r"DAG Node: ([^\s]+)", jinfo["LogNotes"]) 

1387 if m: 

1388 job_name_to_id[m.group(1)] = jid 

1389 jinfo["DAGNodeName"] = m.group(1) 

1390 

1391 try: 

1392 node_status = next(Path(wms_path).glob("*.node_status")) 

1393 except StopIteration: 

1394 return loginfo 

1395 

1396 jobs = {} 

1397 fake_id = -1.0 # For nodes that do not yet have a job id, give fake one 

1398 try: 

1399 with open(node_status) as fh: 

1400 ads = classad.parseAds(fh) 

1401 

1402 for jclassad in ads: 

1403 if jclassad["Type"] == "DagStatus": 

1404 # skip DAG summary 

1405 pass 

1406 elif "Node" not in jclassad: 

1407 if jclassad["Type"] != "StatusEnd": 

1408 _LOG.debug("Key 'Node' not in classad: %s", jclassad) 

1409 break 

1410 else: 

1411 if jclassad["Node"] in job_name_to_pipetask: 

1412 try: 

1413 label = job_name_to_pipetask[jclassad["Node"]] 

1414 except KeyError: 

1415 _LOG.error("%s not in %s", jclassad["Node"], job_name_to_pipetask.keys()) 

1416 raise 

1417 elif "_" in jclassad["Node"]: 

1418 label = jclassad["Node"].split("_")[1] 

1419 else: 

1420 label = jclassad["Node"] 

1421 

1422 # Make job info look as if it came from condor_q

1423 if jclassad["Node"] in job_name_to_id: 

1424 job_id = str(job_name_to_id[jclassad["Node"]]) 

1425 else: 

1426 job_id = str(fake_id) 

1427 fake_id -= 1 

1428 

1429 job = dict(jclassad) 

1430 job["ClusterId"] = int(float(job_id)) 

1431 job["DAGManJobID"] = wms_workflow_id 

1432 job["DAGNodeName"] = jclassad["Node"] 

1433 job["bps_job_label"] = label 

1434 

1435 jobs[job_id] = job 

1436 try: 

1437 jobs[job_id] |= loginfo[job_id] 

1438 except KeyError: 

1439 pass 

1440 except (OSError, PermissionError): 

1441 pass 

1442 

1443 return jobs 

1444 

1445 

1446def read_dag_log(wms_path): 

1447 """Read job information from the DAGMan log file. 

1448 

1449 Parameters 

1450 ---------- 

1451 wms_path : `str` 

1452 Path containing the DAGMan log file. 

1453 

1454 Returns 

1455 ------- 

1456 wms_workflow_id : `str` 

1457 HTCondor job id (i.e., <ClusterId>.<ProcId>) of the DAGMan job. 

1458 dag_info : `dict` [`str`, `~collections.abc.Any`] 

1459 HTCondor job information read from the log file mapped to HTCondor 

1460 job id. 

1461 

1462 Raises 

1463 ------ 

1464 FileNotFoundError 

1465 If cannot find DAGMan log in given wms_path. 

1466 """ 

1467 wms_workflow_id = 0 

1468 dag_info = {} 

1469 

1470 path = Path(wms_path) 

1471 if path.exists(): 

1472 try: 

1473 filename = next(path.glob("*.dag.dagman.log")) 

1474 except StopIteration as exc: 

1475 raise FileNotFoundError(f"DAGMan log not found in {wms_path}") from exc 

1476 _LOG.debug("dag node log filename: %s", filename) 

1477 

1478 info = {} 

1479 job_event_log = htcondor.JobEventLog(str(filename)) 

1480 for event in job_event_log.events(stop_after=0): 

1481 id_ = f"{event['Cluster']}.{event['Proc']}" 

1482 if id_ not in info: 

1483 info[id_] = {} 

1484 wms_workflow_id = id_ # taking last job id in case of restarts 

1485 info[id_].update(event) 

1486 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"] 

1487 

1488 # only save latest DAG job 

1489 dag_info = {wms_workflow_id: info[wms_workflow_id]} 

1490 for job in dag_info.values(): 

1491 _tweak_log_info(filename, job) 

1492 

1493 return wms_workflow_id, dag_info 

1494 

1495 

1496def read_dag_nodes_log(wms_path): 

1497 """Read job information from the DAGMan nodes log file. 

1498 

1499 Parameters 

1500 ---------- 

1501 wms_path : `str` 

1502 Path containing the DAGMan nodes log file. 

1503 

1504 Returns 

1505 ------- 

1506 info : `dict` [`str`, Any] 

1507 HTCondor job information read from the log file mapped to HTCondor 

1508 job id. 

1509 

1510 Raises 

1511 ------ 

1512 FileNotFoundError 

1513 If cannot find DAGMan node log in given wms_path. 

1514 """ 

1515 try: 

1516 filename = next(Path(wms_path).glob("*.dag.nodes.log")) 

1517 except StopIteration as exc: 

1518 raise FileNotFoundError(f"DAGMan node log not found in {wms_path}") from exc 

1519 _LOG.debug("dag node log filename: %s", filename) 

1520 

1521 info = {} 

1522 job_event_log = htcondor.JobEventLog(str(filename)) 

1523 for event in job_event_log.events(stop_after=0): 

1524 id_ = f"{event['Cluster']}.{event['Proc']}" 

1525 if id_ not in info: 

1526 info[id_] = {} 

1527 info[id_].update(event) 

1528 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"] 

1529 

1530 # Add more condor_q-like info to info parsed from log file. 

1531 for job in info.values(): 

1532 _tweak_log_info(filename, job) 

1533 

1534 return info 

1535 

1536 

1537def read_dag_info(wms_path): 

1538 """Read custom DAGMan job information from the file. 

1539 

1540 Parameters 

1541 ---------- 

1542 wms_path : `str` 

1543 Path containing the file with the DAGMan job info. 

1544 

1545 Returns 

1546 ------- 

1547 dag_info : `dict` [`str`, `dict` [`str`, Any]] 

1548 HTCondor job information. 

1549 

1550 Raises 

1551 ------ 

1552 FileNotFoundError 

1553 If cannot find DAGMan job info file in the given location. 

1554 """ 

1555 try: 

1556 filename = next(Path(wms_path).glob("*.info.json")) 

1557 except StopIteration as exc: 

1558 raise FileNotFoundError(f"File with DAGMan job information not found in {wms_path}") from exc 

1559 _LOG.debug("DAGMan job information filename: %s", filename) 

1560 try: 

1561 with open(filename) as fh: 

1562 dag_info = json.load(fh) 

1563 except (OSError, PermissionError) as exc: 

1564 _LOG.debug("Retrieving DAGMan job information failed: %s", exc) 

1565 dag_info = {} 

1566 return dag_info 

1567 

1568 

1569def write_dag_info(filename, dag_info): 

1570 """Write custom job information about DAGMan job. 

1571 

1572 Parameters 

1573 ---------- 

1574 filename : `str` 

1575 Name of the file where the information will be stored. 

1576 dag_info : `dict` [`str` `dict` [`str`, Any]] 

1577 Information about the DAGMan job. 

1578 """ 

1579 schedd_name = next(iter(dag_info)) 

1580 dag_id = next(iter(dag_info[schedd_name])) 

1581 dag_ad = dag_info[schedd_name][dag_id] 

1582 try: 

1583 with open(filename, "w") as fh: 

1584 info = { 

1585 schedd_name: { 

1586 dag_id: {"ClusterId": dag_ad["ClusterId"], "GlobalJobId": dag_ad["GlobalJobId"]} 

1587 } 

1588 } 

1589 json.dump(info, fh) 

1590 except (KeyError, OSError, PermissionError) as exc: 

1591 _LOG.debug("Persisting DAGMan job information failed: %s", exc) 

1592 

1593 

1594def _tweak_log_info(filename, job): 

1595 """Massage the given job info has same structure as if came from condor_q. 

1596 

1597 Parameters 

1598 ---------- 

1599 filename : `pathlib.Path` 

1600 Name of the DAGMan log. 

1601 job : `dict` [`str`, Any]

1602 Information about a single HTCondor job read from

1603 the log.

1604 """ 

1605 _LOG.debug("_tweak_log_info: %s %s", filename, job) 

1606 

1607 try: 

1608 job["ClusterId"] = job["Cluster"] 

1609 job["ProcId"] = job["Proc"] 

1610 job["Iwd"] = str(filename.parent) 

1611 job["Owner"] = filename.owner() 

1612 

1613 match job["MyType"]: 

1614 case "ExecuteEvent": 

1615 job["JobStatus"] = JobStatus.RUNNING 

1616 case "JobTerminatedEvent" | "PostScriptTerminatedEvent": 

1617 job["JobStatus"] = JobStatus.COMPLETED 

1618 case "SubmitEvent": 

1619 job["JobStatus"] = JobStatus.IDLE 

1620 case "JobAbortedEvent": 

1621 job["JobStatus"] = JobStatus.REMOVED 

1622 case "JobHeldEvent": 

1623 job["JobStatus"] = JobStatus.HELD 

1624 case _: 

1625 _LOG.debug("Unknown log event type: %s", job["MyType"]) 

1626 job["JobStatus"] = JobStatus.UNEXPANDED 

1627 

1628 if job["JobStatus"] in {JobStatus.COMPLETED, JobStatus.HELD}: 

1629 new_job = HTC_JOB_AD_HANDLERS.handle(job) 

1630 if new_job is not None: 

1631 job = new_job 

1632 else: 

1633 _LOG.error("Could not determine exit status for job '%s.%s'", job["ClusterId"], job["ProcId"]) 

1634 

1635 except KeyError as e: 

1636 _LOG.error("Missing key %s in job: %s", str(e), job) 

1637 raise 

1638 

1639 

1640def htc_check_dagman_output(wms_path): 

1641 """Check the DAGMan output for error messages. 

1642 

1643 Parameters 

1644 ---------- 

1645 wms_path : `str` 

1646 Directory containing the DAGman output file. 

1647 

1648 Returns 

1649 ------- 

1650 message : `str` 

1651 Message containing error messages from the DAGMan output. Empty 

1652 string if no messages. 

1653 

1654 Raises 

1655 ------ 

1656 FileNotFoundError 

1657 If cannot find DAGMan standard output file in given wms_path. 

1658 """ 

1659 try: 

1660 filename = next(Path(wms_path).glob("*.dag.dagman.out")) 

1661 except StopIteration as exc: 

1662 raise FileNotFoundError(f"DAGMan standard output file not found in {wms_path}") from exc 

1663 _LOG.debug("dag output filename: %s", filename) 

1664 

1665 message = "" 

1666 try: 

1667 with open(filename) as fh: 

1668 last_submit_failed = "" 

1669 for line in fh: 

1670 m = re.match(r"(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) Job submit try \d+/\d+ failed", line) 

1671 if m: 

1672 last_submit_failed = m.group(1) 

1673 if last_submit_failed: 

1674 message = f"Warn: Job submission issues (last: {last_submit_failed})" 

1675 except (OSError, PermissionError): 

1676 message = f"Warn: Could not read dagman output file from {wms_path}." 

1677 return message