Coverage for python/lsst/ctrl/bps/htcondor/lssthtc.py: 13%

604 statements  

coverage.py v7.4.4, created at 2024-04-10 03:42 -0700

1# This file is part of ctrl_bps_htcondor. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <https://www.gnu.org/licenses/>. 

27 

28"""Placeholder HTCondor DAGMan API. 

29 

30There is new work on a python DAGMan API from HTCondor. However, at this

31time, it tries to make things easier by assuming the DAG can be broken into

32levels with 1-1 or all-to-all relationships between nodes in the next

33level. LSST workflows are more complicated.

34""" 

35 

36__all__ = [ 

37 "DagStatus", 

38 "JobStatus", 

39 "NodeStatus", 

40 "RestrictedDict", 

41 "HTCJob", 

42 "HTCDag", 

43 "htc_backup_files", 

44 "htc_check_dagman_output", 

45 "htc_create_submit_from_cmd", 

46 "htc_create_submit_from_dag", 

47 "htc_create_submit_from_file", 

48 "htc_escape", 

49 "htc_write_attribs", 

50 "htc_write_condor_file", 

51 "htc_query_history", 

52 "htc_query_present", 

53 "htc_version", 

54 "htc_submit_dag", 

55 "condor_history", 

56 "condor_q", 

57 "condor_search", 

58 "condor_status", 

59 "update_job_info", 

60 "MISSING_ID", 

61 "summary_from_dag", 

62 "read_dag_info", 

63 "read_dag_log", 

64 "read_dag_nodes_log", 

65 "read_dag_status", 

66 "read_node_status", 

67 "write_dag_info", 

68 "pegasus_name_to_label", 

69] 

70 

71 

72import itertools 

73import json 

74import logging 

75import os 

76import pprint 

77import re 

78import subprocess 

79from collections import defaultdict 

80from collections.abc import MutableMapping 

81from datetime import datetime, timedelta 

82from enum import IntEnum 

83from pathlib import Path 

84 

85import classad 

86import htcondor 

87import networkx 

88from packaging import version 

89 

90from .handlers import HTC_JOB_AD_HANDLERS 

91 

92_LOG = logging.getLogger(__name__) 

93 

94MISSING_ID = -99999 

95 

96 

97class DagStatus(IntEnum): 

98 """HTCondor DAGMan's statuses for a DAG.""" 

99 

100 OK = 0 

101 ERROR = 1 # an error condition different than those listed here 

102 FAILED = 2 # one or more nodes in the DAG have failed 

103 ABORTED = 3 # the DAG has been aborted by an ABORT-DAG-ON specification 

104 REMOVED = 4 # the DAG has been removed by condor_rm 

105 CYCLE = 5 # a cycle was found in the DAG 

106 SUSPENDED = 6 # the DAG has been suspended (see section 2.10.8) 

107 

108 

109class JobStatus(IntEnum): 

110 """HTCondor's statuses for jobs.""" 

111 

112 UNEXPANDED = 0 # Unexpanded 

113 IDLE = 1 # Idle 

114 RUNNING = 2 # Running 

115 REMOVED = 3 # Removed 

116 COMPLETED = 4 # Completed 

117 HELD = 5 # Held 

118 TRANSFERRING_OUTPUT = 6 # Transferring_Output 

119 SUSPENDED = 7 # Suspended 

120 

121 

122class NodeStatus(IntEnum): 

123 """HTCondor's statuses for DAGman nodes.""" 

124 

125 # (STATUS_NOT_READY): At least one parent has not yet finished or the node 

126 # is a FINAL node. 

127 NOT_READY = 0 

128 

129 # (STATUS_READY): All parents have finished, but the node is not yet 

130 # running. 

131 READY = 1 

132 

133 # (STATUS_PRERUN): The node’s PRE script is running. 

134 PRERUN = 2 

135 

136 # (STATUS_SUBMITTED): The node’s HTCondor job(s) are in the queue. 

137 # StatusDetails = "not_idle" -> running. 

138 # JobProcsHeld = 1-> hold. 

139 # JobProcsQueued = 1 -> idle. 

140 SUBMITTED = 3 

141 

142 # (STATUS_POSTRUN): The node’s POST script is running. 

143 POSTRUN = 4 

144 

145 # (STATUS_DONE): The node has completed successfully. 

146 DONE = 5 

147 

148 # (STATUS_ERROR): The node has failed. StatusDetails has info (e.g., 

149 # ULOG_JOB_ABORTED for deleted job). 

150 ERROR = 6 

151 

152 

153HTC_QUOTE_KEYS = {"environment"} 

154HTC_VALID_JOB_KEYS = { 

155 "universe", 

156 "executable", 

157 "arguments", 

158 "environment", 

159 "log", 

160 "error", 

161 "output", 

162 "should_transfer_files", 

163 "when_to_transfer_output", 

164 "getenv", 

165 "notification", 

166 "notify_user", 

167 "concurrency_limit", 

168 "transfer_executable", 

169 "transfer_input_files", 

170 "transfer_output_files", 

171 "request_cpus", 

172 "request_memory", 

173 "request_disk", 

174 "priority", 

175 "category", 

176 "requirements", 

177 "on_exit_hold", 

178 "on_exit_hold_reason", 

179 "on_exit_hold_subcode", 

180 "max_retries", 

181 "periodic_release", 

182 "periodic_remove", 

183 "accounting_group", 

184 "accounting_group_user", 

185} 

186HTC_VALID_JOB_DAG_KEYS = {"vars", "pre", "post", "retry", "retry_unless_exit", "abort_dag_on", "abort_exit"} 

187HTC_VERSION = version.parse(htcondor.__version__) 

188 

189 

190class RestrictedDict(MutableMapping): 

191 """A dictionary that only allows certain keys. 

192 

193 Parameters 

194 ---------- 

195 valid_keys : `Container` 

196 Strings that are valid keys. 

197 init_data : `dict` or `RestrictedDict`, optional 

198 Initial data. 

199 

200 Raises 

201 ------ 

202 KeyError 

203 If invalid key(s) in init_data. 

204 """ 

205 

206 def __init__(self, valid_keys, init_data=()): 

207 self.valid_keys = valid_keys 

208 self.data = {} 

209 self.update(init_data) 

210 

211 def __getitem__(self, key): 

212 """Return value for given key if exists. 

213 

214 Parameters 

215 ---------- 

216 key : `str` 

217 Identifier for value to return. 

218 

219 Returns 

220 ------- 

221 value : `~collections.abc.Any` 

222 Value associated with given key. 

223 

224 Raises 

225 ------ 

226 KeyError 

227 If key doesn't exist. 

228 """ 

229 return self.data[key] 

230 

231 def __delitem__(self, key): 

232 """Delete value for given key if exists. 

233 

234 Parameters 

235 ---------- 

236 key : `str` 

237 Identifier for value to delete. 

238 

239 Raises 

240 ------ 

241 KeyError 

242 If key doesn't exist. 

243 """ 

244 del self.data[key] 

245 

246 def __setitem__(self, key, value): 

247 """Store key,value in internal dict only if key is valid. 

248 

249 Parameters 

250 ---------- 

251 key : `str` 

252 Identifier to associate with given value. 

253 value : `~collections.abc.Any` 

254 Value to store. 

255 

256 Raises 

257 ------ 

258 KeyError 

259 If key is invalid. 

260 """ 

261 if key not in self.valid_keys: 

262 raise KeyError(f"Invalid key {key}") 

263 self.data[key] = value 

264 

265 def __iter__(self): 

266 return self.data.__iter__() 

267 

268 def __len__(self): 

269 return len(self.data) 

270 

271 def __str__(self): 

272 return str(self.data) 

273 

274 
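# Illustrative usage sketch (not part of the original module): RestrictedDict
# behaves like a plain dict but rejects keys outside the allowed set.
#
#   cmds = RestrictedDict(HTC_VALID_JOB_KEYS, {"universe": "vanilla"})
#   cmds["request_memory"] = 2048    # "request_memory" is a valid key
#   cmds["not_a_key"] = 1            # raises KeyError: Invalid key not_a_key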

275def htc_backup_files(wms_path, subdir=None, limit=100): 

276 """Backup select HTCondor files in the submit directory. 

277 

278 Files will be saved in separate subdirectories which will be created in 

279 the submit directory where the files are located. These subdirectories 

280 will be consecutive, zero-padded integers. Their values will correspond to 

281 the number of HTCondor rescue DAGs in the submit directory. 

282 

283 Hence, with the default settings, copies after the initial failed run will 

284 be placed in '001' subdirectory, '002' after the first restart, and so on 

285 until the limit of backups is reached. If there's no rescue DAG yet, files 

286 will be copied to '000' subdirectory. 

287 

288 Parameters 

289 ---------- 

290 wms_path : `str` or `pathlib.Path` 

291 Path to the submit directory either absolute or relative. 

292 subdir : `str` or `pathlib.Path`, optional 

293 A path, relative to the submit directory, where all subdirectories with 

294 backup files will be kept. Defaults to None which means that the backup 

295 subdirectories will be placed directly in the submit directory. 

296 limit : `int`, optional 

297 Maximum number of backups. If the number of backups reaches the limit,

298 the last backup files will be overwritten. The default value is 100 

299 to match the default value of HTCondor's DAGMAN_MAX_RESCUE_NUM in 

300 version 8.8+. 

301 

302 Raises 

303 ------ 

304 FileNotFoundError 

305 If the submit directory or the file that needs to be backed up does not 

306 exist. 

307 OSError 

308 If the submit directory cannot be accessed or backing up a file failed 

309 either due to permission or filesystem related issues. 

310 

311 Notes 

312 ----- 

313 This is not a generic function for making backups. It is intended to be 

314 used once, just before a restart, to make snapshots of files which will be 

315 overwritten by HTCondor during the next run.

316 """ 

317 width = len(str(limit)) 

318 

319 path = Path(wms_path).resolve() 

320 if not path.is_dir(): 

321 raise FileNotFoundError(f"Directory {path} not found") 

322 

323 # Initialize the backup counter. 

324 rescue_dags = list(Path(wms_path).glob("*.rescue*")) 

325 counter = min(len(rescue_dags), limit) 

326 

327 # Create the backup directory and move select files there. 

328 dest = Path(wms_path) 

329 if subdir: 

330 # PurePath.is_relative_to() is not available before Python 3.9. Hence 

331 we need to check if 'subdir' is in the submit directory in some other

332 # way if it is an absolute path. 

333 subdir = Path(subdir) 

334 if subdir.is_absolute(): 

335 if dest not in subdir.parents: 

336 _LOG.warning( 

337 "Invalid backup location: '%s' not in the submit directory, will use '%s' instead.", 

338 subdir, 

339 wms_path, 

340 ) 

341 else: 

342 dest /= subdir 

343 else: 

344 dest /= subdir 

345 dest /= f"{counter:0{width}}" 

346 try: 

347 dest.mkdir(parents=True, exist_ok=False if counter < limit else True) 

348 except FileExistsError: 

349 _LOG.warning("Refusing to do backups: target directory '%s' already exists", dest) 

350 else: 

351 for patt in ["*.info.*", "*.dag.metrics", "*.dag.nodes.log", "*.node_status"]: 

352 for source in path.glob(patt): 

353 if source.is_file(): 

354 target = dest / source.relative_to(path) 

355 try: 

356 source.rename(target) 

357 except OSError as exc: 

358 raise type(exc)(f"Backing up '{source}' failed: {exc.strerror}") from None 

359 else: 

360 raise FileNotFoundError(f"Backing up '{source}' failed: not a file") 

361 
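# Illustrative sketch of the backup layout (paths are hypothetical): with no
# rescue DAG present the selected files are moved into '000'; after the first
# failed run (one rescue DAG) they go to '001', and so on up to 'limit'.
#
#   htc_backup_files("/path/to/submit/run", subdir="backups")
#   # e.g. /path/to/submit/run/backups/000/<name>.node_status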

362 

363def htc_escape(value): 

364 """Escape characters in given value based upon HTCondor syntax. 

365 

366 Parameters 

367 ---------- 

368 value : `~collections.abc.Any` 

369 Value that needs to have characters escaped if string. 

370 

371 Returns 

372 ------- 

373 new_value : `~collections.abc.Any` 

374 Given value with characters escaped appropriately for HTCondor if string.

375 """ 

376 if isinstance(value, str): 

377 newval = value.replace('"', '""').replace("'", "''").replace("&quot;", '"') 

378 else: 

379 newval = value 

380 

381 return newval 

382 
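# Illustrative sketch (assumed inputs): quotes are doubled per HTCondor
# quoting rules and non-string values pass through unchanged.
#
#   htc_escape('say "hi"')   # -> 'say ""hi""'
#   htc_escape("it's")       # -> "it''s"
#   htc_escape(42)           # -> 42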

383 

384def htc_write_attribs(stream, attrs): 

385 """Write job attributes in HTCondor format to writeable stream. 

386 

387 Parameters 

388 ---------- 

389 stream : `~io.TextIOBase` 

390 Output text stream (typically an open file). 

391 attrs : `dict` 

392 HTCondor job attributes (dictionary of attribute key, value). 

393 """ 

394 for key, value in attrs.items(): 

395 # Make sure strings are syntactically correct for HTCondor. 

396 if isinstance(value, str): 

397 pval = f'"{htc_escape(value)}"' 

398 else: 

399 pval = value 

400 

401 print(f"+{key} = {pval}", file=stream) 

402 
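# Illustrative sketch (hypothetical attributes, assuming an open text stream
# such as sys.stdout): each attribute becomes a '+key = value' line, with
# string values quoted and escaped.
#
#   htc_write_attribs(sys.stdout, {"bps_run": "u/me/test", "bps_njobs": 5})
#   # +bps_run = "u/me/test"
#   # +bps_njobs = 5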

403 

404def htc_write_condor_file(filename, job_name, job, job_attrs): 

405 """Write an HTCondor submit file. 

406 

407 Parameters 

408 ---------- 

409 filename : `str` 

410 Filename for the HTCondor submit file. 

411 job_name : `str` 

412 Job name to use in submit file. 

413 job : `RestrictedDict` 

414 Submit script information. 

415 job_attrs : `dict` 

416 Job attributes. 

417 """ 

418 os.makedirs(os.path.dirname(filename), exist_ok=True) 

419 with open(filename, "w") as fh: 

420 for key, value in job.items(): 

421 if value is not None: 

422 if key in HTC_QUOTE_KEYS: 

423 print(f'{key}="{htc_escape(value)}"', file=fh) 

424 else: 

425 print(f"{key}={value}", file=fh) 

426 for key in ["output", "error", "log"]: 

427 if key not in job: 

428 filename = f"{job_name}.$(Cluster).${key[:3]}" 

429 print(f"{key}={filename}", file=fh) 

430 

431 if job_attrs is not None: 

432 htc_write_attribs(fh, job_attrs) 

433 print("queue", file=fh) 

434 

435 

436# To avoid doing the version check during every function call select 

437# appropriate conversion function at the import time. 

438# 

439# Make sure that *each* version specific variant of the conversion function(s) 

440# has the same signature after applying any changes! 

441if HTC_VERSION < version.parse("8.9.8"):

442 

443 def htc_tune_schedd_args(**kwargs): 

444 """Ensure that arguments for Schedd are version appropriate. 

445 

446 The old arguments: 'requirements' and 'attr_list' of 

447 'Schedd.history()', 'Schedd.query()', and 'Schedd.xquery()' were 

448 deprecated in favor of 'constraint' and 'projection', respectively, 

449 starting from version 8.9.8. The function will convert "new" keyword 

450 arguments to "old" ones. 

451 

452 Parameters 

453 ---------- 

454 **kwargs 

455 Any keyword arguments that Schedd.history(), Schedd.query(), and 

456 Schedd.xquery() accepts. 

457 

458 Returns 

459 ------- 

460 kwargs : `dict` [`str`, Any] 

461 Keyword arguments that are guaranteed to work with the Python

462 HTCondor API. 

463 

464 Notes 

465 ----- 

466 Function doesn't validate provided keyword arguments beyond converting 

467 selected arguments to their version specific form. For example, 

468 it won't remove keywords that are not supported by the methods 

469 mentioned earlier. 

470 """ 

471 translation_table = { 

472 "constraint": "requirements", 

473 "projection": "attr_list", 

474 } 

475 for new, old in translation_table.items(): 

476 try: 

477 kwargs[old] = kwargs.pop(new) 

478 except KeyError: 

479 pass 

480 return kwargs 

481 

482else: 

483 

484 def htc_tune_schedd_args(**kwargs): 

485 """Ensure that arguments for Schedd are version appropriate. 

486 

487 This is the fallback function if no version specific alterations are

488 necessary. Effectively, a no-op. 

489 

490 Parameters 

491 ---------- 

492 **kwargs 

493 Any keyword arguments that Schedd.history(), Schedd.query(), and 

494 Schedd.xquery() accepts. 

495 

496 Returns 

497 ------- 

498 kwargs : `dict` [`str`, Any] 

499 Keyword arguments that were passed to the function.

500 """ 

501 return kwargs 

502 
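# Illustrative sketch: on HTCondor older than 8.9.8 the "new" keywords are
# rewritten to their legacy names; on newer versions they pass through as-is.
#
#   kwargs = htc_tune_schedd_args(constraint="JobStatus == 2", projection=["ClusterId"])
#   # < 8.9.8:  {"requirements": "JobStatus == 2", "attr_list": ["ClusterId"]}
#   # >= 8.9.8: {"constraint": "JobStatus == 2", "projection": ["ClusterId"]}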

503 

504def htc_query_history(schedds, **kwargs): 

505 """Fetch history records from the condor_schedd daemon. 

506 

507 Parameters 

508 ---------- 

509 schedds : `dict` [`str`, `htcondor.Schedd`]

510 HTCondor schedulers to query for job information.

511 **kwargs 

512 Any keyword arguments that Schedd.history() accepts. 

513 

514 Yields 

515 ------ 

516 schedd_name : `str` 

517 Name of the HTCondor scheduler managing the job queue. 

518 job_ad : `dict` [`str`, Any] 

519 A dictionary representing HTCondor ClassAd describing a job. It maps 

520 job attribute names to values of the ClassAd expressions they

521 represent. 

522 """ 

523 # If not set, provide defaults for positional arguments. 

524 kwargs.setdefault("constraint", None) 

525 kwargs.setdefault("projection", []) 

526 kwargs = htc_tune_schedd_args(**kwargs) 

527 for schedd_name, schedd in schedds.items(): 

528 for job_ad in schedd.history(**kwargs): 

529 yield schedd_name, dict(job_ad) 

530 

531 

532def htc_query_present(schedds, **kwargs): 

533 """Query the condor_schedd daemon for job ads. 

534 

535 Parameters 

536 ---------- 

537 schedds : `dict` [`str`, `htcondor.Schedd`]

538 HTCondor schedulers to query for job information.

539 **kwargs 

540 Any keyword arguments that Schedd.xquery() accepts. 

541 

542 Yields 

543 ------ 

544 schedd_name : `str` 

545 Name of the HTCondor scheduler managing the job queue. 

546 job_ad : `dict` [`str`, Any] 

547 A dictionary representing HTCondor ClassAd describing a job. It maps 

548 job attribute names to values of the ClassAd expressions they

549 represent. 

550 """ 

551 kwargs = htc_tune_schedd_args(**kwargs) 

552 for schedd_name, schedd in schedds.items(): 

553 for job_ad in schedd.query(**kwargs): 

554 yield schedd_name, dict(job_ad) 

555 

556 

557def htc_version(): 

558 """Return the version given by the HTCondor API. 

559 

560 Returns 

561 ------- 

562 version : `str` 

563 HTCondor version as easily comparable string. 

564 """ 

565 return str(HTC_VERSION) 

566 

567 

568def htc_submit_dag(sub): 

569 """Submit job for execution. 

570 

571 Parameters 

572 ---------- 

573 sub : `htcondor.Submit` 

574 An object representing a job submit description. 

575 

576 Returns 

577 ------- 

578 schedd_job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

579 Information about jobs satisfying the search criteria where for each 

580 Scheduler, local HTCondor job ids are mapped to their respective 

581 classads. 

582 """ 

583 coll = htcondor.Collector() 

584 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

585 schedd = htcondor.Schedd(schedd_ad) 

586 

587 # If Schedd.submit() fails, the method will raise an exception. Usually, 

588 # that implies issues with the HTCondor pool which BPS can't address. 

589 # Hence, no effort is made to handle the exception. 

590 submit_result = schedd.submit(sub) 

591 

592 # Sadly, the ClassAd from Schedd.submit() (see above) does not have 

593 # 'GlobalJobId' so we need to run a regular query to get it anyway. 

594 schedd_name = schedd_ad["Name"] 

595 schedd_dag_info = condor_q( 

596 constraint=f"ClusterId == {submit_result.cluster()}", schedds={schedd_name: schedd} 

597 ) 

598 return schedd_dag_info 

599 

600 

601def htc_create_submit_from_dag(dag_filename, submit_options=None): 

602 """Create a DAGMan job submit description. 

603 

604 Parameters 

605 ---------- 

606 dag_filename : `str` 

607 Name of file containing HTCondor DAG commands. 

608 submit_options : `dict` [`str`, Any], optional 

609 Contains extra options for command line (Value of None means flag). 

610 

611 Returns 

612 ------- 

613 sub : `htcondor.Submit` 

614 An object representing a job submit description. 

615 

616 Notes 

617 ----- 

618 Use with HTCondor versions which support htcondor.Submit.from_dag(), 

619 i.e., 8.9.3 or newer. 

620 """ 

621 return htcondor.Submit.from_dag(dag_filename, submit_options) 

622 
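# Illustrative sketch (file name and option are hypothetical): build a DAGMan
# submit description and hand it to htc_submit_dag() defined above.
#
#   sub = htc_create_submit_from_dag("pipeline.dag", {"maxidle": 1000})
#   schedd_dag_info = htc_submit_dag(sub)
#   # {"<schedd name>": {"<ClusterId.ProcId>": {...DAGMan job classad...}}}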

623 

624def htc_create_submit_from_cmd(dag_filename, submit_options=None): 

625 """Create a DAGMan job submit description. 

626 

627 Create a DAGMan job submit description by calling ``condor_submit_dag`` 

628 on given DAG description file. 

629 

630 Parameters 

631 ---------- 

632 dag_filename : `str` 

633 Name of file containing HTCondor DAG commands. 

634 submit_options : `dict` [`str`, Any], optional 

635 Contains extra options for command line (Value of None means flag). 

636 

637 Returns 

638 ------- 

639 sub : `htcondor.Submit` 

640 An object representing a job submit description. 

641 

642 Notes 

643 ----- 

644 Use with HTCondor versions which do not support htcondor.Submit.from_dag(), 

645 i.e., older than 8.9.3. 

646 """ 

647 # Run command line condor_submit_dag command. 

648 cmd = "condor_submit_dag -f -no_submit -notification never -autorescue 1 -UseDagDir -no_recurse " 

649 

650 if submit_options is not None: 

651 for opt, val in submit_options.items(): 

652 cmd += f" -{opt} {val or ''}" 

653 cmd += f" {dag_filename}"

654 

655 process = subprocess.Popen( 

656 cmd.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding="utf-8" 

657 ) 

658 process.wait() 

659 

660 if process.returncode != 0: 

661 print(f"Exit code: {process.returncode}") 

662 print(process.communicate()[0]) 

663 raise RuntimeError("Problems running condor_submit_dag") 

664 

665 return htc_create_submit_from_file(f"{dag_filename}.condor.sub") 

666 

667 

668def htc_create_submit_from_file(submit_file): 

669 """Parse a submission file. 

670 

671 Parameters 

672 ---------- 

673 submit_file : `str` 

674 Name of the HTCondor submit file. 

675 

676 Returns 

677 ------- 

678 sub : `htcondor.Submit` 

679 An object representing a job submit description. 

680 """ 

681 descriptors = {} 

682 with open(submit_file) as fh: 

683 for line in fh: 

684 line = line.strip() 

685 if line and not line.startswith("#") and line != "queue":

686 (key, val) = re.split(r"\s*=\s*", line, 1) 

687 descriptors[key] = val 

688 

689 # Avoid UserWarning: the line 'copy_to_spool = False' was 

690 # unused by Submit object. Is it a typo? 

691 try: 

692 del descriptors["copy_to_spool"] 

693 except KeyError: 

694 pass 

695 

696 return htcondor.Submit(descriptors) 

697 

698 

699def _htc_write_job_commands(stream, name, jobs): 

700 """Output the DAGMan job lines for a single job in the DAG.

701 

702 Parameters 

703 ---------- 

704 stream : `~io.TextIOBase` 

705 Writeable text stream (typically an opened file). 

706 name : `str` 

707 Job name. 

708 jobs : `RestrictedDict` 

709 DAG job keys and values. 

710 """ 

711 if "pre" in jobs: 

712 print( 

713 f"SCRIPT {jobs['pre'].get('defer', '')} PRE {name} "

714 f"{jobs['pre']['executable']} {jobs['pre'].get('arguments', '')}", 

715 file=stream, 

716 ) 

717 

718 if "post" in jobs: 

719 print( 

720 f"SCRIPT {jobs['post'].get('defer', '')} POST {name} "

721 f"{jobs['post']['executable']} {jobs['post'].get('arguments', '')}", 

722 file=stream, 

723 ) 

724 

725 if "vars" in jobs: 

726 for key, value in jobs["vars"].items():

727 print(f'VARS {name} {key}="{htc_escape(value)}"', file=stream) 

728 

729 if "pre_skip" in jobs: 

730 print(f"PRE_SKIP {name} {jobs['pre_skip']}", file=stream) 

731 

732 if "retry" in jobs and jobs["retry"]: 

733 print(f"RETRY {name} {jobs['retry']} ", end="", file=stream) 

734 if "retry_unless_exit" in jobs: 

735 print(f"UNLESS-EXIT {jobs['retry_unless_exit']}", end="", file=stream) 

736 print("\n", file=stream) 

737 

738 if "abort_dag_on" in jobs and jobs["abort_dag_on"]: 

739 print( 

740 f"ABORT-DAG-ON {name} {jobs['abort_dag_on']['node_exit']}" 

741 f" RETURN {jobs['abort_dag_on']['abort_exit']}", 

742 file=stream, 

743 ) 

744 
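# Illustrative sketch of the DAG lines this helper appends after a job's JOB
# entry, for a job named "jobA" with hypothetical dagcmds (assuming the 'vars'
# mapping holds plain strings):
#
#   VARS jobA qgraph_file="jobA.qgraph"
#   RETRY jobA 3 UNLESS-EXIT 2
#   ABORT-DAG-ON jobA 100 RETURN 1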

745 

746class HTCJob: 

747 """HTCondor job for use in building DAG. 

748 

749 Parameters 

750 ---------- 

751 name : `str` 

752 Name of the job. 

753 label : `str` 

754 Label that can be used for grouping or lookup.

755 initcmds : `RestrictedDict` 

756 Initial job commands for submit file. 

757 initdagcmds : `RestrictedDict` 

758 Initial commands for job inside DAG. 

759 initattrs : `dict` 

760 Initial dictionary of job attributes. 

761 """ 

762 

763 def __init__(self, name, label=None, initcmds=(), initdagcmds=(), initattrs=None): 

764 self.name = name 

765 self.label = label 

766 self.cmds = RestrictedDict(HTC_VALID_JOB_KEYS, initcmds) 

767 self.dagcmds = RestrictedDict(HTC_VALID_JOB_DAG_KEYS, initdagcmds) 

768 self.attrs = initattrs 

769 self.subfile = None 

770 

771 def __str__(self): 

772 return self.name 

773 

774 def add_job_cmds(self, new_commands): 

775 """Add commands to Job (overwrite existing). 

776 

777 Parameters 

778 ---------- 

779 new_commands : `dict` 

780 Submit file commands to be added to Job. 

781 """ 

782 self.cmds.update(new_commands) 

783 

784 def add_dag_cmds(self, new_commands): 

785 """Add DAG commands to Job (overwrite existing). 

786 

787 Parameters 

788 ---------- 

789 new_commands : `dict` 

790 DAG file commands to be added to Job. 

791 """ 

792 self.dagcmds.update(new_commands) 

793 

794 def add_job_attrs(self, new_attrs): 

795 """Add attributes to Job (overwrite existing). 

796 

797 Parameters 

798 ---------- 

799 new_attrs : `dict` 

800 Attributes to be added to Job. 

801 """ 

802 if self.attrs is None: 

803 self.attrs = {} 

804 if new_attrs: 

805 self.attrs.update(new_attrs) 

806 

807 def write_submit_file(self, submit_path, job_subdir=""): 

808 """Write job description to submit file. 

809 

810 Parameters 

811 ---------- 

812 submit_path : `str` 

813 Prefix path for the submit file. 

814 job_subdir : `str`, optional 

815 Template for job subdir. 

816 """ 

817 if not self.subfile: 

818 self.subfile = f"{self.name}.sub" 

819 job_subdir = job_subdir.format(self=self) 

820 if job_subdir: 

821 self.subfile = os.path.join(job_subdir, self.subfile) 

822 htc_write_condor_file(os.path.join(submit_path, self.subfile), self.name, self.cmds, self.attrs) 

823 

824 def write_dag_commands(self, stream): 

825 """Write DAG commands for a single job to the output stream.

826 

827 Parameters 

828 ---------- 

829 stream : `IO` or `str` 

830 Output Stream. 

831 """ 

832 print(f"JOB {self.name} {self.subfile}", file=stream) 

833 _htc_write_job_commands(stream, self.name, self.dagcmds) 

834 

835 def dump(self, fh): 

836 """Dump job information to output stream. 

837 

838 Parameters 

839 ---------- 

840 fh : `~io.TextIOBase` 

841 Output stream. 

842 """ 

843 printer = pprint.PrettyPrinter(indent=4, stream=fh) 

844 printer.pprint(self.name) 

845 printer.pprint(self.cmds) 

846 printer.pprint(self.attrs) 

847 

848 

849class HTCDag(networkx.DiGraph): 

850 """HTCondor DAG. 

851 

852 Parameters 

853 ---------- 

854 data : networkx.DiGraph.data 

855 Initial graph. 

856 name : `str` 

857 Name for DAG. 

858 """ 

859 

860 def __init__(self, data=None, name=""): 

861 super().__init__(data=data, name=name) 

862 

863 self.graph["attr"] = {} 

864 self.graph["run_id"] = None 

865 self.graph["submit_path"] = None 

866 self.graph["final_job"] = None 

867 

868 def __str__(self): 

869 """Represent basic DAG info as string. 

870 

871 Returns 

872 ------- 

873 info : `str` 

874 String containing basic DAG info. 

875 """ 

876 return f"{self.graph['name']} {len(self)}" 

877 

878 def add_attribs(self, attribs=None): 

879 """Add attributes to the DAG. 

880 

881 Parameters 

882 ---------- 

883 attribs : `dict` 

884 DAG attributes. 

885 """ 

886 if attribs is not None: 

887 self.graph["attr"].update(attribs) 

888 

889 def add_job(self, job, parent_names=None, child_names=None): 

890 """Add an HTCJob to the HTCDag. 

891 

892 Parameters 

893 ---------- 

894 job : `HTCJob` 

895 HTCJob to add to the HTCDag. 

896 parent_names : `~collections.abc.Iterable` [`str`], optional 

897 Names of parent jobs. 

898 child_names : `~collections.abc.Iterable` [`str`], optional 

899 Names of child jobs. 

900 """ 

901 assert isinstance(job, HTCJob) 

902 

903 # Add dag level attributes to each job 

904 job.add_job_attrs(self.graph["attr"]) 

905 

906 self.add_node(job.name, data=job) 

907 

908 if parent_names is not None: 

909 self.add_job_relationships(parent_names, [job.name])

910 

911 if child_names is not None: 

912 self.add_job_relationships([job.name], child_names)

913 

914 def add_job_relationships(self, parents, children): 

915 """Add DAG edge between parents and children jobs. 

916 

917 Parameters 

918 ---------- 

919 parents : `list` [`str`] 

920 Contains parent job name(s). 

921 children : `list` [`str`] 

922 Contains children job name(s). 

923 """ 

924 self.add_edges_from(itertools.product(parents, children)) 

925 

926 def add_final_job(self, job): 

927 """Add an HTCJob for the FINAL job in HTCDag. 

928 

929 Parameters 

930 ---------- 

931 job : `HTCJob` 

932 HTCJob to add to the HTCDag as a FINAL job. 

933 """ 

934 # Add dag level attributes to each job 

935 job.add_job_attrs(self.graph["attr"]) 

936 

937 self.graph["final_job"] = job 

938 

939 def del_job(self, job_name): 

940 """Delete the job from the DAG. 

941 

942 Parameters 

943 ---------- 

944 job_name : `str` 

945 Name of job in DAG to delete. 

946 """ 

947 # Reconnect edges around node to delete 

948 parents = self.predecessors(job_name) 

949 children = self.successors(job_name) 

950 self.add_edges_from(itertools.product(parents, children)) 

951 

952 # Delete job node (which deletes its edges). 

953 self.remove_node(job_name) 

954 

955 def write(self, submit_path, job_subdir=""): 

956 """Write DAG to a file. 

957 

958 Parameters 

959 ---------- 

960 submit_path : `str` 

961 Prefix path for dag filename to be combined with DAG name. 

962 job_subdir : `str`, optional 

963 Template for job subdir. 

964 """ 

965 self.graph["submit_path"] = submit_path 

966 self.graph["dag_filename"] = os.path.join(submit_path, f"{self.graph['name']}.dag") 

967 os.makedirs(submit_path, exist_ok=True) 

968 with open(self.graph["dag_filename"], "w") as fh: 

969 for _, nodeval in self.nodes().items(): 

970 job = nodeval["data"] 

971 job.write_submit_file(submit_path, job_subdir) 

972 job.write_dag_commands(fh) 

973 for edge in self.edges(): 

974 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh) 

975 print(f"DOT {self.name}.dot", file=fh) 

976 print(f"NODE_STATUS_FILE {self.name}.node_status", file=fh) 

977 

978 # Add bps attributes to dag submission 

979 for key, value in self.graph["attr"].items(): 

980 print(f'SET_JOB_ATTR {key}= "{htc_escape(value)}"', file=fh) 

981 

982 if self.graph["final_job"]: 

983 job = self.graph["final_job"] 

984 job.write_submit_file(submit_path, job_subdir) 

985 print(f"FINAL {job.name} {job.subfile}", file=fh) 

986 if "pre" in job.dagcmds: 

987 print(f"SCRIPT PRE {job.name} {job.dagcmds['pre']}", file=fh) 

988 if "post" in job.dagcmds: 

989 print(f"SCRIPT POST {job.name} {job.dagcmds['post']}", file=fh) 

990 

991 def dump(self, fh): 

992 """Dump DAG info to output stream. 

993 

994 Parameters 

995 ---------- 

996 fh : `io.IO` or `str` 

997 Where to dump DAG info as text. 

998 """ 

999 for key, value in self.graph.items():

1000 print(f"{key}={value}", file=fh) 

1001 for name, data in self.nodes().items(): 

1002 print(f"{name}:", file=fh) 

1003 data["data"].dump(fh)

1004 for edge in self.edges(): 

1005 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh) 

1006 if self.graph["final_job"]: 

1007 print(f'FINAL {self.graph["final_job"].name}:', file=fh) 

1008 self.graph["final_job"].dump(fh) 

1009 

1010 def write_dot(self, filename): 

1011 """Write a dot version of the DAG. 

1012 

1013 Parameters 

1014 ---------- 

1015 filename : `str` 

1016 Name of the dot file. 

1017 """ 

1018 pos = networkx.nx_agraph.graphviz_layout(self) 

1019 networkx.draw(self, pos=pos) 

1020 networkx.drawing.nx_pydot.write_dot(self, filename) 

1021 
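# Illustrative sketch (names, commands, and paths are hypothetical): build a
# two-job DAG and write the per-job submit files plus the .dag file.
#
#   dag = HTCDag(name="my_run")
#   dag.add_attribs({"bps_run": "my_run"})
#   job1 = HTCJob("pipetaskInit", label="pipetaskInit")
#   job1.add_job_cmds({"executable": "run_init.sh", "request_memory": 2048})
#   job2 = HTCJob("calibrate_1", label="calibrate")
#   job2.add_job_cmds({"executable": "run_calibrate.sh"})
#   dag.add_job(job1)
#   dag.add_job(job2, parent_names=["pipetaskInit"])
#   dag.write("/path/to/submit", job_subdir="jobs/{self.label}")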

1022 

1023def condor_q(constraint=None, schedds=None, **kwargs): 

1024 """Get information about the jobs in the HTCondor job queue(s). 

1025 

1026 Parameters 

1027 ---------- 

1028 constraint : `str`, optional 

1029 Constraints to be passed to job query. 

1030 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

1031 HTCondor schedulers to query for job information. If None

1032 (default), the query will be run against the local scheduler only.

1033 **kwargs : `~typing.Any` 

1034 Additional keyword arguments that need to be passed to the internal 

1035 query method. 

1036 

1037 Returns 

1038 ------- 

1039 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

1040 Information about jobs satisfying the search criteria where for each 

1041 Scheduler, local HTCondor job ids are mapped to their respective 

1042 classads. 

1043 """ 

1044 return condor_query(constraint, schedds, htc_query_present, **kwargs) 

1045 

1046 

1047def condor_history(constraint=None, schedds=None, **kwargs): 

1048 """Get information about the jobs from HTCondor history records. 

1049 

1050 Parameters 

1051 ---------- 

1052 constraint : `str`, optional 

1053 Constraints to be passed to job query. 

1054 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

1055 HTCondor schedulers to query for job information. If None

1056 (default), the query will be run against the history file of 

1057 the local scheduler only. 

1058 **kwargs : `~typing.Any` 

1059 Additional keyword arguments that need to be passed to the internal 

1060 query method. 

1061 

1062 Returns 

1063 ------- 

1064 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

1065 Information about jobs satisfying the search criteria where for each 

1066 Scheduler, local HTCondor job ids are mapped to their respective 

1067 classads. 

1068 """ 

1069 return condor_query(constraint, schedds, htc_query_history, **kwargs) 

1070 

1071 

1072def condor_query(constraint=None, schedds=None, query_func=htc_query_present, **kwargs): 

1073 """Get information about HTCondor jobs. 

1074 

1075 Parameters 

1076 ---------- 

1077 constraint : `str`, optional 

1078 Constraints to be passed to job query. 

1079 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

1080 HTCondor schedulers to query for job information. If None

1081 (default), the query will be run against the local

1082 scheduler only.

1083 query_func : callable 

1084 A query function which takes the following arguments:

1085 

1086 - ``schedds``: Schedulers to query (`list` [`htcondor.Schedd`]). 

1087 - ``**kwargs``: Keyword arguments that will be passed to the query 

1088 function. 

1089 **kwargs : `~typing.Any` 

1090 Additional keyword arguments that need to be passed to the query 

1091 method. 

1092 

1093 Returns 

1094 ------- 

1095 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

1096 Information about jobs satisfying the search criteria where for each 

1097 Scheduler, local HTCondor job ids are mapped to their respective 

1098 classads. 

1099 """ 

1100 if not schedds: 

1101 coll = htcondor.Collector() 

1102 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1103 schedds = {schedd_ad["Name"]: htcondor.Schedd(schedd_ad)} 

1104 

1105 # Make sure that 'ClusterId' and 'ProcId' attributes are always included 

1106 # in the job classad. They are needed to construct the job id. 

1107 added_attrs = set() 

1108 if "projection" in kwargs and kwargs["projection"]: 

1109 requested_attrs = set(kwargs["projection"]) 

1110 required_attrs = {"ClusterId", "ProcId"} 

1111 added_attrs = required_attrs - requested_attrs 

1112 for attr in added_attrs: 

1113 kwargs["projection"].append(attr) 

1114 

1115 unwanted_attrs = {"Env", "Environment"} | added_attrs 

1116 job_info = defaultdict(dict) 

1117 for schedd_name, job_ad in query_func(schedds, constraint=constraint, **kwargs): 

1118 id_ = f"{job_ad['ClusterId']}.{job_ad['ProcId']}" 

1119 for attr in set(job_ad) & unwanted_attrs: 

1120 del job_ad[attr] 

1121 job_info[schedd_name][id_] = job_ad 

1122 _LOG.debug("query returned %d jobs", sum(len(val) for val in job_info.values())) 

1123 

1124 # Restore the list of the requested attributes to its original value 

1125 # if needed. 

1126 if added_attrs: 

1127 for attr in added_attrs: 

1128 kwargs["projection"].remove(attr) 

1129 

1130 # When returning the results filter out entries for schedulers with no jobs 

1131 # matching the search criteria. 

1132 return {key: val for key, val in job_info.items() if val} 

1133 

1134 

1135def condor_search(constraint=None, hist=None, schedds=None): 

1136 """Search for running and finished jobs satisfying given criteria. 

1137 

1138 Parameters 

1139 ---------- 

1140 constraint : `str`, optional 

1141 Constraints to be passed to job query. 

1142 hist : `float` 

1143 Limit history search to this many days. 

1144 schedds : `dict` [`str`, `htcondor.Schedd`], optional 

1145 The HTCondor schedulers to query for job information.

1146 If None (default), only the local scheduler will be queried. 

1147 

1148 Returns 

1149 ------- 

1150 job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]

1151 Information about jobs satisfying the search criteria where for each 

1152 Scheduler, local HTCondor job ids are mapped to their respective 

1153 classads. 

1154 """ 

1155 if not schedds: 

1156 coll = htcondor.Collector() 

1157 schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) 

1158 schedds = {schedd_ad["Name"]: htcondor.Schedd(locate_ad=schedd_ad)} 

1159 

1160 job_info = condor_q(constraint=constraint, schedds=schedds) 

1161 if hist is not None: 

1162 epoch = (datetime.now() - timedelta(days=hist)).timestamp() 

1163 constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})" 

1164 hist_info = condor_history(constraint, schedds=schedds) 

1165 update_job_info(job_info, hist_info) 

1166 return job_info 

1167 
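# Illustrative sketch (constraint is hypothetical): look for matching jobs in
# the queue and, because hist is given, also in the last day of history.
#
#   job_info = condor_search(constraint='bps_run == "my_run"', hist=1.0)
#   # {"<schedd name>": {"1234.0": {...job classad...}, ...}}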

1168 

1169def condor_status(constraint=None, coll=None): 

1170 """Get information about HTCondor pool. 

1171 

1172 Parameters 

1173 ---------- 

1174 constraint : `str`, optional 

1175 Constraints to be passed to the query. 

1176 coll : `htcondor.Collector`, optional 

1177 Object representing HTCondor collector daemon. 

1178 

1179 Returns 

1180 ------- 

1181 pool_info : `dict` [`str`, `dict` [`str`, Any]] 

1182 Mapping between HTCondor slot names and slot information (classAds). 

1183 """ 

1184 if coll is None: 

1185 coll = htcondor.Collector() 

1186 try: 

1187 pool_ads = coll.query(constraint=constraint) 

1188 except OSError as ex: 

1189 raise RuntimeError(f"Problem querying the Collector. (Constraint='{constraint}')") from ex 

1190 

1191 pool_info = {} 

1192 for slot in pool_ads: 

1193 pool_info[slot["name"]] = dict(slot) 

1194 _LOG.debug("condor_status returned %d ads", len(pool_info)) 

1195 return pool_info 

1196 

1197 

1198def update_job_info(job_info, other_info): 

1199 """Update results of a job query with results from another query. 

1200 

1201 Parameters 

1202 ---------- 

1203 job_info : `dict` [`str`, `dict` [`str`, Any]] 

1204 Results of the job query that needs to be updated. 

1205 other_info : `dict` [`str`, `dict` [`str`, Any]] 

1206 Results of the other job query. 

1207 

1208 Returns 

1209 ------- 

1210 job_info : `dict` [`str`, `dict` [`str`, Any]] 

1211 The updated results. 

1212 """ 

1213 for schedd_name, others in other_info.items(): 

1214 try: 

1215 jobs = job_info[schedd_name] 

1216 except KeyError: 

1217 job_info[schedd_name] = others 

1218 else: 

1219 for id_, ad in others.items(): 

1220 jobs.setdefault(id_, {}).update(ad) 

1221 return job_info 

1222 
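# Illustrative sketch with toy data: classads from the second query update or
# extend those already collected from the first.
#
#   queue = {"schedd1": {"1.0": {"JobStatus": 2}}}
#   hist = {"schedd1": {"1.0": {"JobStatus": 4}}, "schedd2": {"7.0": {"JobStatus": 4}}}
#   update_job_info(queue, hist)
#   # {"schedd1": {"1.0": {"JobStatus": 4}}, "schedd2": {"7.0": {"JobStatus": 4}}}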

1223 

1224def summary_from_dag(dir_name): 

1225 """Build bps_run_summary string from dag file. 

1226 

1227 Parameters 

1228 ---------- 

1229 dir_name : `str` 

1230 Path that includes dag file for a run. 

1231 

1232 Returns 

1233 ------- 

1234 summary : `str` 

1235 Semi-colon separated list of job labels and counts. 

1236 (Same format as saved in dag classad). 

1237 job_name_to_pipetask : `dict` [`str`, `str`] 

1238 Mapping of job names to job labels. 

1239 """ 

1240 dag = next(Path(dir_name).glob("*.dag")) 

1241 

1242 # Later code depends upon insertion order 

1243 counts = defaultdict(int) 

1244 job_name_to_pipetask = {} 

1245 try: 

1246 with open(dag) as fh: 

1247 for line in fh: 

1248 if line.startswith("JOB"): 

1249 m = re.match(r"JOB ([^\s]+) jobs/([^/]+)/", line) 

1250 if m: 

1251 label = m.group(2) 

1252 if label == "init": 

1253 label = "pipetaskInit" 

1254 job_name_to_pipetask[m.group(1)] = label 

1255 counts[label] += 1 

1256 else: # Check if Pegasus submission 

1257 m = re.match(r"JOB ([^\s]+) ([^\s]+)", line) 

1258 if m: 

1259 label = pegasus_name_to_label(m.group(1)) 

1260 job_name_to_pipetask[m.group(1)] = label 

1261 counts[label] += 1 

1262 else: 

1263 _LOG.warning("Parse DAG: unmatched job line: %s", line) 

1264 elif line.startswith("FINAL"): 

1265 m = re.match(r"FINAL ([^\s]+) jobs/([^/]+)/", line) 

1266 if m: 

1267 label = m.group(2) 

1268 job_name_to_pipetask[m.group(1)] = label 

1269 counts[label] += 1 

1270 

1271 except (OSError, PermissionError, StopIteration): 

1272 pass 

1273 

1274 summary = ";".join([f"{name}:{counts[name]}" for name in counts]) 

1275 _LOG.debug("summary_from_dag: %s %s", summary, job_name_to_pipetask) 

1276 return summary, job_name_to_pipetask 

1277 
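# Illustrative sketch of the values returned for a hypothetical run:
#
#   summary, job_name_to_pipetask = summary_from_dag("/path/to/submit/run")
#   # summary == "pipetaskInit:1;calibrate:10;finalJob:1"
#   # job_name_to_pipetask == {"pipetaskInit_0": "pipetaskInit", ...}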

1278 

1279def pegasus_name_to_label(name): 

1280 """Convert pegasus job name to a label for the report. 

1281 

1282 Parameters 

1283 ---------- 

1284 name : `str` 

1285 Name of job. 

1286 

1287 Returns 

1288 ------- 

1289 label : `str` 

1290 Label for job. 

1291 """ 

1292 label = "UNK" 

1293 if name.startswith("create_dir") or name.startswith("stage_in") or name.startswith("stage_out"): 

1294 label = "pegasus" 

1295 else: 

1296 m = re.match(r"pipetask_(\d+_)?([^_]+)", name) 

1297 if m: 

1298 label = m.group(2) 

1299 if label == "init": 

1300 label = "pipetaskInit" 

1301 

1302 return label 

1303 
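# Illustrative examples of the mapping (job names are hypothetical):
#
#   pegasus_name_to_label("stage_in_local_remote_0")  # -> "pegasus"
#   pegasus_name_to_label("pipetask_3_calibrate")     # -> "calibrate"
#   pegasus_name_to_label("pipetask_init")            # -> "pipetaskInit"
#   pegasus_name_to_label("unrecognized_job")         # -> "UNK"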

1304 

1305def read_dag_status(wms_path): 

1306 """Read the node status file for DAG summary information. 

1307 

1308 Parameters 

1309 ---------- 

1310 wms_path : `str` 

1311 Path that includes node status file for a run. 

1312 

1313 Returns 

1314 ------- 

1315 dag_ad : `dict` [`str`, Any] 

1316 DAG summary information. 

1317 """ 

1318 dag_ad = {} 

1319 

1320 # While this is probably more up to date than dag classad, only read from 

1321 file if needed.

1322 try: 

1323 try: 

1324 node_stat_file = next(Path(wms_path).glob("*.node_status")) 

1325 _LOG.debug("Reading Node Status File %s", node_stat_file) 

1326 with open(node_stat_file) as infh: 

1327 dag_ad = classad.parseNext(infh) # pylint: disable=E1101 

1328 except StopIteration: 

1329 pass 

1330 

1331 if not dag_ad: 

1332 # Pegasus check here 

1333 try: 

1334 metrics_file = next(Path(wms_path).glob("*.dag.metrics")) 

1335 with open(metrics_file) as infh: 

1336 metrics = json.load(infh) 

1337 dag_ad["NodesTotal"] = metrics.get("jobs", 0) 

1338 dag_ad["NodesFailed"] = metrics.get("jobs_failed", 0) 

1339 dag_ad["NodesDone"] = metrics.get("jobs_succeeded", 0) 

1340 dag_ad["pegasus_version"] = metrics.get("planner_version", "") 

1341 except StopIteration: 

1342 try: 

1343 metrics_file = next(Path(wms_path).glob("*.metrics")) 

1344 with open(metrics_file) as infh: 

1345 metrics = json.load(infh) 

1346 dag_ad["NodesTotal"] = metrics["wf_metrics"]["total_jobs"] 

1347 dag_ad["pegasus_version"] = metrics.get("version", "") 

1348 except StopIteration: 

1349 pass 

1350 except (OSError, PermissionError): 

1351 pass 

1352 

1353 _LOG.debug("read_dag_status: %s", dag_ad) 

1354 return dict(dag_ad) 

1355 

1356 

1357def read_node_status(wms_path): 

1358 """Read entire node status file. 

1359 

1360 Parameters 

1361 ---------- 

1362 wms_path : `str` 

1363 Path that includes node status file for a run. 

1364 

1365 Returns 

1366 ------- 

1367 jobs : `dict` [`str`, Any] 

1368 DAG summary information compiled from the node status file combined 

1369 with the information found in the node event log. 

1370 

1371 Currently, if the same job attribute is found in both files, its value 

1372 from the event log takes precedence over the value from the node status 

1373 file. 

1374 """ 

1375 # Get jobid info from other places to fill in gaps in info from node_status 

1376 _, job_name_to_pipetask = summary_from_dag(wms_path) 

1377 wms_workflow_id, loginfo = read_dag_log(wms_path) 

1378 loginfo = read_dag_nodes_log(wms_path) 

1379 _LOG.debug("loginfo = %s", loginfo) 

1380 job_name_to_id = {} 

1381 for jid, jinfo in loginfo.items(): 

1382 if "LogNotes" in jinfo: 

1383 m = re.match(r"DAG Node: ([^\s]+)", jinfo["LogNotes"]) 

1384 if m: 

1385 job_name_to_id[m.group(1)] = jid 

1386 jinfo["DAGNodeName"] = m.group(1) 

1387 

1388 try: 

1389 node_status = next(Path(wms_path).glob("*.node_status")) 

1390 except StopIteration: 

1391 return loginfo 

1392 

1393 jobs = {} 

1394 fake_id = -1.0 # For nodes that do not yet have a job id, give fake one 

1395 try: 

1396 with open(node_status) as fh: 

1397 ads = classad.parseAds(fh) 

1398 

1399 for jclassad in ads: 

1400 if jclassad["Type"] == "DagStatus": 

1401 # skip DAG summary 

1402 pass 

1403 elif "Node" not in jclassad: 

1404 if jclassad["Type"] != "StatusEnd": 

1405 _LOG.debug("Key 'Node' not in classad: %s", jclassad) 

1406 break 

1407 else: 

1408 if jclassad["Node"] in job_name_to_pipetask: 

1409 try: 

1410 label = job_name_to_pipetask[jclassad["Node"]] 

1411 except KeyError: 

1412 _LOG.error("%s not in %s", jclassad["Node"], job_name_to_pipetask.keys()) 

1413 raise 

1414 elif "_" in jclassad["Node"]: 

1415 label = jclassad["Node"].split("_")[1] 

1416 else: 

1417 label = jclassad["Node"] 

1418 

1419 # Make job info as if came from condor_q 

1420 if jclassad["Node"] in job_name_to_id: 

1421 job_id = str(job_name_to_id[jclassad["Node"]]) 

1422 else: 

1423 job_id = str(fake_id) 

1424 fake_id -= 1 

1425 

1426 job = dict(jclassad) 

1427 job["ClusterId"] = int(float(job_id)) 

1428 job["DAGManJobID"] = wms_workflow_id 

1429 job["DAGNodeName"] = jclassad["Node"] 

1430 job["bps_job_label"] = label 

1431 

1432 jobs[job_id] = job 

1433 try: 

1434 jobs[job_id] |= loginfo[job_id] 

1435 except KeyError: 

1436 pass 

1437 except (OSError, PermissionError): 

1438 pass 

1439 

1440 return jobs 

1441 

1442 

1443def read_dag_log(wms_path): 

1444 """Read job information from the DAGMan log file. 

1445 

1446 Parameters 

1447 ---------- 

1448 wms_path : `str` 

1449 Path containing the DAGMan log file. 

1450 

1451 Returns 

1452 ------- 

1453 wms_workflow_id : `str` 

1454 HTCondor job id (i.e., <ClusterId>.<ProcId>) of the DAGMan job. 

1455 dag_info : `dict` [`str`, `~collections.abc.Any`] 

1456 HTCondor job information read from the log file mapped to HTCondor 

1457 job id. 

1458 

1459 Raises 

1460 ------ 

1461 FileNotFoundError 

1462 If cannot find DAGMan log in given wms_path. 

1463 """ 

1464 wms_workflow_id = 0 

1465 dag_info = {} 

1466 

1467 path = Path(wms_path) 

1468 if path.exists(): 

1469 try: 

1470 filename = next(path.glob("*.dag.dagman.log")) 

1471 except StopIteration as exc: 

1472 raise FileNotFoundError(f"DAGMan log not found in {wms_path}") from exc 

1473 _LOG.debug("dag node log filename: %s", filename) 

1474 

1475 info = {} 

1476 job_event_log = htcondor.JobEventLog(str(filename)) 

1477 for event in job_event_log.events(stop_after=0): 

1478 id_ = f"{event['Cluster']}.{event['Proc']}" 

1479 if id_ not in info: 

1480 info[id_] = {} 

1481 wms_workflow_id = id_ # taking last job id in case of restarts 

1482 info[id_].update(event) 

1483 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"] 

1484 

1485 # only save latest DAG job 

1486 dag_info = {wms_workflow_id: info[wms_workflow_id]} 

1487 for job in dag_info.values(): 

1488 _tweak_log_info(filename, job) 

1489 

1490 return wms_workflow_id, dag_info 

1491 

1492 

1493def read_dag_nodes_log(wms_path): 

1494 """Read job information from the DAGMan nodes log file. 

1495 

1496 Parameters 

1497 ---------- 

1498 wms_path : `str` 

1499 Path containing the DAGMan nodes log file. 

1500 

1501 Returns 

1502 ------- 

1503 info : `dict` [`str`, Any] 

1504 HTCondor job information read from the log file mapped to HTCondor 

1505 job id. 

1506 

1507 Raises 

1508 ------ 

1509 FileNotFoundError 

1510 If cannot find DAGMan node log in given wms_path. 

1511 """ 

1512 try: 

1513 filename = next(Path(wms_path).glob("*.dag.nodes.log")) 

1514 except StopIteration as exc: 

1515 raise FileNotFoundError(f"DAGMan node log not found in {wms_path}") from exc 

1516 _LOG.debug("dag node log filename: %s", filename) 

1517 

1518 info = {} 

1519 job_event_log = htcondor.JobEventLog(str(filename)) 

1520 for event in job_event_log.events(stop_after=0): 

1521 id_ = f"{event['Cluster']}.{event['Proc']}" 

1522 if id_ not in info: 

1523 info[id_] = {} 

1524 info[id_].update(event) 

1525 info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"] 

1526 

1527 # Add more condor_q-like info to info parsed from log file. 

1528 for job in info.values(): 

1529 _tweak_log_info(filename, job) 

1530 

1531 return info 

1532 

1533 

1534def read_dag_info(wms_path): 

1535 """Read custom DAGMan job information from the file. 

1536 

1537 Parameters 

1538 ---------- 

1539 wms_path : `str` 

1540 Path containing the file with the DAGMan job info. 

1541 

1542 Returns 

1543 ------- 

1544 dag_info : `dict` [`str`, `dict` [`str`, Any]] 

1545 HTCondor job information. 

1546 

1547 Raises 

1548 ------ 

1549 FileNotFoundError 

1550 If cannot find DAGMan job info file in the given location. 

1551 """ 

1552 try: 

1553 filename = next(Path(wms_path).glob("*.info.json")) 

1554 except StopIteration as exc: 

1555 raise FileNotFoundError(f"File with DAGMan job information not found in {wms_path}") from exc 

1556 _LOG.debug("DAGMan job information filename: %s", filename) 

1557 try: 

1558 with open(filename) as fh: 

1559 dag_info = json.load(fh) 

1560 except (OSError, PermissionError) as exc: 

1561 _LOG.debug("Retrieving DAGMan job information failed: %s", exc) 

1562 dag_info = {} 

1563 return dag_info 

1564 

1565 

1566def write_dag_info(filename, dag_info): 

1567 """Write custom job information about DAGMan job. 

1568 

1569 Parameters 

1570 ---------- 

1571 filename : `str` 

1572 Name of the file where the information will be stored. 

1573 dag_info : `dict` [`str` `dict` [`str`, Any]] 

1574 Information about the DAGMan job. 

1575 """ 

1576 schedd_name = next(iter(dag_info)) 

1577 dag_id = next(iter(dag_info[schedd_name])) 

1578 dag_ad = dag_info[schedd_name][dag_id] 

1579 try: 

1580 with open(filename, "w") as fh: 

1581 info = { 

1582 schedd_name: { 

1583 dag_id: {"ClusterId": dag_ad["ClusterId"], "GlobalJobId": dag_ad["GlobalJobId"]} 

1584 } 

1585 } 

1586 json.dump(info, fh) 

1587 except (KeyError, OSError, PermissionError) as exc: 

1588 _LOG.debug("Persisting DAGMan job information failed: %s", exc) 

1589 

1590 

1591def _tweak_log_info(filename, job): 

1592 """Massage the given job info so it matches the structure of condor_q output.

1593 

1594 Parameters 

1595 ---------- 

1596 filename : `pathlib.Path` 

1597 Name of the DAGMan log. 

1598 job : `dict` [ `str`, Any ] 

1599 Information about a single HTCondor job read from

1600 the log.

1601 """ 

1602 _LOG.debug("_tweak_log_info: %s %s", filename, job) 

1603 

1604 try: 

1605 job["ClusterId"] = job["Cluster"] 

1606 job["ProcId"] = job["Proc"] 

1607 job["Iwd"] = str(filename.parent) 

1608 job["Owner"] = filename.owner() 

1609 

1610 match job["MyType"]: 

1611 case "ExecuteEvent": 

1612 job["JobStatus"] = JobStatus.RUNNING 

1613 case "JobTerminatedEvent" | "PostScriptTerminatedEvent": 

1614 job["JobStatus"] = JobStatus.COMPLETED 

1615 case "SubmitEvent": 

1616 job["JobStatus"] = JobStatus.IDLE 

1617 case "JobAbortedEvent": 

1618 job["JobStatus"] = JobStatus.REMOVED 

1619 case "JobHeldEvent": 

1620 job["JobStatus"] = JobStatus.HELD 

1621 case _: 

1622 _LOG.debug("Unknown log event type: %s", job["MyType"]) 

1623 job["JobStatus"] = JobStatus.UNEXPANDED 

1624 

1625 if job["JobStatus"] in {JobStatus.COMPLETED, JobStatus.HELD}: 

1626 new_job = HTC_JOB_AD_HANDLERS.handle(job) 

1627 if new_job is not None: 

1628 job = new_job 

1629 else: 

1630 _LOG.error("Could not determine exit status for job '%s.%s'", job["ClusterId"], job["ProcId"]) 

1631 

1632 except KeyError as e: 

1633 _LOG.error("Missing key %s in job: %s", str(e), job) 

1634 raise 

1635 

1636 

1637def htc_check_dagman_output(wms_path): 

1638 """Check the DAGMan output for error messages. 

1639 

1640 Parameters 

1641 ---------- 

1642 wms_path : `str` 

1643 Directory containing the DAGman output file. 

1644 

1645 Returns 

1646 ------- 

1647 message : `str` 

1648 Message containing error messages from the DAGMan output. Empty 

1649 string if no messages. 

1650 

1651 Raises 

1652 ------ 

1653 FileNotFoundError 

1654 If cannot find DAGMan standard output file in given wms_path. 

1655 """ 

1656 try: 

1657 filename = next(Path(wms_path).glob("*.dag.dagman.out")) 

1658 except StopIteration as exc: 

1659 raise FileNotFoundError(f"DAGMan standard output file not found in {wms_path}") from exc 

1660 _LOG.debug("dag output filename: %s", filename) 

1661 

1662 message = "" 

1663 try: 

1664 with open(filename) as fh: 

1665 last_submit_failed = "" 

1666 for line in fh: 

1667 m = re.match(r"(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) Job submit try \d+/\d+ failed", line) 

1668 if m: 

1669 last_submit_failed = m.group(1) 

1670 if last_submit_failed: 

1671 message = f"Warn: Job submission issues (last: {last_submit_failed})" 

1672 except (OSError, PermissionError): 

1673 message = f"Warn: Could not read dagman output file from {wms_path}." 

1674 return message