Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Class definitions for a Generic Workflow Graph. 

23""" 

24 

25__all__ = ["GenericWorkflow", "GenericWorkflowFile", "GenericWorkflowJob", "GenericWorkflowExec"] 

26 

27 

28import dataclasses 

29import itertools 

30import logging 

31from typing import Optional 

32 

33import networkx as nx 

34 

35from lsst.daf.butler.core.utils import iterable 

36from .bps_draw import draw_networkx_dot 

37 

38_LOG = logging.getLogger(__name__) 

39 

40 

@dataclasses.dataclass
class GenericWorkflowFile:
    """Information about a file that may be needed by various workflow
    management services.
    """
    name: str
    """Lookup key (logical file name) of file/directory. Must be unique
    within run.
    """

    # Bug fix: the annotation was ``str or None``, which is evaluated at
    # class-creation time and collapses to just ``str`` (a class object is
    # truthy), contradicting the ``None`` default.  Use Optional[str].
    src_uri: Optional[str]  # don't know that need ButlerURI
    """Original location of file/directory.
    """

    wms_transfer: bool
    """Whether the WMS should ignore file or not.  Default is False.
    """

    job_access_remote: bool
    """Whether the job can remotely access file (using separately specified
    file access protocols).  Default is False.
    """

    job_shared: bool
    """Whether job requires its own copy of this file.  Default is False.
    """

    # As of python 3.7.8, can't use __slots__ + dataclass if give default
    # values, so writing own __init__.
    def __init__(self, name: str, src_uri: Optional[str] = None, wms_transfer: bool = False,
                 job_access_remote: bool = False, job_shared: bool = False):
        self.name = name
        self.src_uri = src_uri
        self.wms_transfer = wms_transfer
        self.job_access_remote = job_access_remote
        self.job_shared = job_shared

    __slots__ = ("name", "src_uri", "wms_transfer", "job_access_remote", "job_shared")

    def __hash__(self):
        # Files are keyed throughout the workflow by logical file name, so
        # identity follows the name alone.
        return hash(self.name)

82 

83 

@dataclasses.dataclass
class GenericWorkflowExec:
    """Information about an executable that may be needed by various workflow
    management services.
    """
    name: str
    """Lookup key (logical file name) of executable. Must be unique
    within run.
    """

    # Bug fix: the annotation was ``str or None``, which is evaluated at
    # class-creation time and collapses to just ``str``; use Optional[str]
    # to match the ``None`` default.
    src_uri: Optional[str]  # don't know that need ButlerURI
    """Original location of executable.
    """

    transfer_executable: bool
    """Whether the WMS/plugin is responsible for staging executable to
    location usable by job.
    """

    # As of python 3.7.8, can't use __slots__ + dataclass if give default
    # values, so writing own __init__.
    def __init__(self, name: str, src_uri: Optional[str] = None, transfer_executable: bool = False):
        self.name = name
        self.src_uri = src_uri
        self.transfer_executable = transfer_executable

    __slots__ = ("name", "src_uri", "transfer_executable")

    def __hash__(self):
        # Executables are keyed by name, so identity follows the name alone.
        return hash(self.name)

114 

115 

@dataclasses.dataclass
class GenericWorkflowJob:
    """Information about a job that may be needed by various workflow
    management services.
    """
    name: str
    """Name of job.  Must be unique within workflow.
    """

    label: Optional[str]
    """Primary user-facing label for job.  Does not need to be unique
    and may be used for summary reports.
    """

    tags: Optional[dict]
    """Other key/value pairs for job that user may want to use as a filter.
    """

    executable: Optional[GenericWorkflowExec]
    """Executable for job.
    """

    arguments: Optional[str]
    """Command line arguments for job.
    """

    cmdvals: Optional[dict]
    """Values for variables in cmdline when using lazy command line creation.
    """

    request_memory: Optional[int]  # MB
    """Max memory (in MB) that the job is expected to need.
    """

    request_cpus: Optional[int]  # cores
    """Max number of cpus that the job is expected to need.
    """

    request_disk: Optional[int]  # MB
    """Max amount of job scratch disk (in MB) that the job is expected to need.
    """

    request_walltime: Optional[str]  # minutes
    """Max amount of time that the job is expected to need.

    NOTE(review): the field comment says minutes but this docstring
    previously said seconds -- confirm the actual unit against the WMS
    plugins that consume this value.
    """

    compute_site: Optional[str]
    """Key to look up site-specific information for running the job.
    """

    mail_to: Optional[str]
    """Comma separated list of email addresses for emailing job status.
    """

    when_to_mail: Optional[str]
    """WMS-specific terminology for when to email job status.
    """

    number_of_retries: Optional[int]
    """Number of times to automatically retry a failed job.
    """

    retry_unless_exit: Optional[int]
    """Exit code for job that means to not automatically retry.
    """

    abort_on_value: Optional[int]
    """Job exit value for signals to abort the entire workflow.
    """

    abort_return_value: Optional[int]
    """Exit value to use when aborting the entire workflow.
    """

    priority: Optional[str]
    """Initial priority of job in WMS-format.
    """

    category: Optional[str]
    """WMS-facing label of job within single workflow (e.g., can be used for
    throttling jobs within a single workflow).
    """

    concurrency_limit: Optional[list]
    """Names of concurrency limits that the WMS plugin can appropriately
    translate to limit the number of this job across all running workflows.
    """

    queue: Optional[str]
    """Name of queue to use.  Different WMS can translate
    this concept differently.
    """

    pre_cmdline: Optional[str]
    """Command line to be executed prior to executing job.
    """

    post_cmdline: Optional[str]
    """Command line to be executed after job executes.

    Should be executed regardless of exit status.
    """

    profile: Optional[dict]
    """Nested dictionary of WMS-specific key/value pairs with primary key being
    WMS key (e.g., pegasus, condor, panda).
    """

    attrs: Optional[dict]
    """Key/value pairs of job attributes (for WMS that have attributes in
    addition to commands).
    """

    environment: Optional[dict]
    """Environment variable names and values to be explicitly set inside job.
    """

    # As of python 3.7.8, can't use __slots__ if give default values, so
    # writing own __init__.
    def __init__(self, name: str):
        # Only the job name is required; every other attribute starts as
        # None or an empty container and is filled in by the caller.
        self.name = name
        self.label = None
        self.tags = {}
        self.executable = None
        self.arguments = None
        self.cmdvals = {}
        self.request_memory = None
        self.request_cpus = None
        self.request_disk = None
        self.request_walltime = None
        self.compute_site = None
        self.mail_to = None
        self.when_to_mail = None
        self.number_of_retries = None
        self.retry_unless_exit = None
        self.abort_on_value = None
        self.abort_return_value = None
        self.priority = None
        self.category = None
        self.concurrency_limit = []
        self.queue = None
        self.pre_cmdline = None
        self.post_cmdline = None
        self.profile = {}
        self.attrs = {}
        self.environment = {}

    __slots__ = ("name", "label", "tags", "mail_to", "when_to_mail",
                 "executable", "arguments", "cmdvals",
                 "request_memory", "request_cpus", "request_disk", "request_walltime",
                 "number_of_retries", "retry_unless_exit", "abort_on_value", "abort_return_value",
                 "compute_site", "environment", "priority", "category", "concurrency_limit",
                 "queue", "pre_cmdline", "post_cmdline", "profile", "attrs")

    def __hash__(self):
        # Jobs are stored in the workflow graph keyed by name, so identity
        # follows the name alone.
        return hash(self.name)

272 

273 

class GenericWorkflow(nx.DiGraph):
    """A generic representation of a workflow used to submit to specific
    workflow management systems.

    Parameters
    ----------
    name : `str`
        Name of generic workflow.
    incoming_graph_data : `Any`, optional
        Data used to initialized graph that is passed through to nx.DiGraph
        constructor.  Can be any type supported by networkx.DiGraph.
    attr : `dict`
        Keyword arguments passed through to nx.DiGraph constructor.
    """
    def __init__(self, name, incoming_graph_data=None, **attr):
        super().__init__(incoming_graph_data, **attr)
        self._name = name
        self.run_attrs = {}
        # Central stores keyed by name; jobs only hold references.
        self._files = {}
        self._executables = {}
        self._inputs = {}  # mapping job.names to list of GenericWorkflowFile
        self._outputs = {}  # mapping job.names to list of GenericWorkflowFile
        self.run_id = None
        self._final = None

    @property
    def name(self):
        """Retrieve name of generic workflow.

        Returns
        -------
        name : `str`
            Name of generic workflow.
        """
        return self._name

    def get_files(self, data=False, transfer_only=True):
        """Retrieve files from generic workflow.

        Need API in case change way files are stored (e.g., make
        workflow a bipartite graph with jobs and files nodes).

        Parameters
        ----------
        data : `bool`, optional
            Whether to return the file data as well as the file object name.
            (The defaults is False.)
        transfer_only : `bool`, optional
            Whether to only return files for which a workflow management
            system would be responsible for transferring.

        Returns
        -------
        files : `list` [`lsst.ctrl.bps.GenericWorkflowFile`] or `list` [`str`]
            File names or objects from generic workflow meeting
            specifications.
        """
        files = []
        for filename, file in self._files.items():
            if not transfer_only or file.wms_transfer:
                if not data:
                    files.append(filename)
                else:
                    files.append(file)
        return files

    def add_job(self, job, parent_names=None, child_names=None):
        """Add job to generic workflow.

        Parameters
        ----------
        job : `lsst.ctrl.bps.GenericWorkflowJob`
            Job to add to the generic workflow.
        parent_names : `list` [`str`], optional
            Names of jobs that are parents of given job.
        child_names : `list` [`str`], optional
            Names of jobs that are children of given job.

        Raises
        ------
        RuntimeError
            If the given job is not a GenericWorkflowJob or a job with the
            same name already exists in the workflow.
        """
        if not isinstance(job, GenericWorkflowJob):
            raise RuntimeError(f"Invalid type for job to be added to GenericWorkflowGraph ({type(job)}).")
        if self.has_node(job.name):
            raise RuntimeError(f"Job {job.name} already exists in GenericWorkflowGraph.")
        super().add_node(job.name, job=job)
        self.add_job_relationships(parent_names, job.name)
        self.add_job_relationships(job.name, child_names)
        self.add_executable(job.executable)

    def add_node(self, node_for_adding, **attr):
        """Override networkx function to call more specific add_job function.

        Parameters
        ----------
        node_for_adding : `lsst.ctrl.bps.GenericWorkflowJob`
            Job to be added to generic workflow.
        attr :
            Needed to match original networkx function, but not used.
        """
        self.add_job(node_for_adding)

    def add_job_relationships(self, parents, children):
        """Add dependencies between parent and child jobs.  All parents will
        be connected to all children.

        Parameters
        ----------
        parents : `list` [`str`]
            Parent job names.
        children : `list` [`str`]
            Children job names.
        """
        if parents is not None and children is not None:
            # iterable() lets callers pass a single name or a sequence.
            self.add_edges_from(itertools.product(iterable(parents), iterable(children)))

    def add_edges_from(self, ebunch_to_add, **attr):
        """Add several edges between jobs in the generic workflow.

        Parameters
        ----------
        ebunch_to_add : Iterable [`tuple`]
            Iterable of job name pairs between which a dependency should be
            saved.
        attr : keyword arguments, optional
            Data can be assigned using keyword arguments (not currently used).
        """
        for edge_to_add in ebunch_to_add:
            # Route through add_edge so its existence checks apply.
            self.add_edge(edge_to_add[0], edge_to_add[1], **attr)

    def add_edge(self, u_of_edge: str, v_of_edge: str, **attr):
        """Add edge connecting jobs in workflow.

        Parameters
        ----------
        u_of_edge : `str`
            Name of parent job.
        v_of_edge : `str`
            Name of child job.
        attr : keyword arguments, optional
            Attributes to save with edge.

        Raises
        ------
        RuntimeError
            If either endpoint is not already a job in the workflow.
        """
        if u_of_edge not in self:
            raise RuntimeError(f"{u_of_edge} not in GenericWorkflow")
        if v_of_edge not in self:
            raise RuntimeError(f"{v_of_edge} not in GenericWorkflow")
        super().add_edge(u_of_edge, v_of_edge, **attr)

    def get_job(self, job_name: str):
        """Retrieve job by name from workflow.

        Parameters
        ----------
        job_name : `str`
            Name of job to retrieve.

        Returns
        -------
        job : `lsst.ctrl.bps.GenericWorkflowJob`
            Job matching given job_name.
        """
        return self.nodes[job_name]["job"]

    def del_job(self, job_name: str):
        """Delete job from generic workflow leaving connected graph.

        Parameters
        ----------
        job_name : `str`
            Name of job to delete from workflow.
        """
        # Connect all parent jobs to all children jobs.
        parents = self.predecessors(job_name)
        children = self.successors(job_name)
        self.add_job_relationships(parents, children)

        # Delete job node (which deletes edges).
        self.remove_node(job_name)

    def add_job_inputs(self, job_name, files):
        """Add files as inputs to specified job.

        Parameters
        ----------
        job_name : `str`
            Name of job to which inputs should be added.
        files : `lsst.ctrl.bps.GenericWorkflowFile` or \
                `list` [`lsst.ctrl.bps.GenericWorkflowFile`]
            File object(s) to be added as inputs to the specified job.
        """
        self._inputs.setdefault(job_name, [])
        for file in iterable(files):
            # Save the central copy.
            if file.name not in self._files:
                self._files[file.name] = file

            # Save the job reference to the file.
            self._inputs[job_name].append(file)

    def get_file(self, name):
        """Retrieve a file object by name.

        Parameters
        ----------
        name : `str`
            Name of file object.

        Returns
        -------
        gwfile : `lsst.ctrl.bps.GenericWorkflowFile`
            File matching given name.
        """
        return self._files[name]

    def add_file(self, gwfile):
        """Add file object.

        Parameters
        ----------
        gwfile : `lsst.ctrl.bps.GenericWorkflowFile`
            File object to add to workflow.
        """
        if gwfile.name not in self._files:
            self._files[gwfile.name] = gwfile
        else:
            _LOG.debug("Skipped add_file for existing file %s", gwfile.name)

    def get_job_inputs(self, job_name, data=True, transfer_only=False):
        """Return the input files for the given job.

        Parameters
        ----------
        job_name : `str`
            Name of the job.
        data : `bool`, optional
            Whether to return the file data as well as the file object name.
        transfer_only : `bool`, optional
            Whether to only return files for which a workflow management
            system would be responsible for transferring.

        Returns
        -------
        inputs : `list` [`lsst.ctrl.bps.GenericWorkflowFile`]
            Input files for the given job.  If no input files for the job,
            returns an empty list.
        """
        inputs = []
        if job_name in self._inputs:
            for gwfile in self._inputs[job_name]:
                if not transfer_only or gwfile.wms_transfer:
                    if not data:
                        inputs.append(gwfile.name)
                    else:
                        inputs.append(gwfile)
        return inputs

    def add_job_outputs(self, job_name, files):
        """Add output files to a job.

        Parameters
        ----------
        job_name : `str`
            Name of job to which the files should be added as outputs.
        files : `list` [`lsst.ctrl.bps.GenericWorkflowFile`]
            File objects to be added as outputs for specified job.
        """
        self._outputs.setdefault(job_name, [])

        for file_ in iterable(files):
            # Save the central copy.
            if file_.name not in self._files:
                self._files[file_.name] = file_

            # Save the job reference to the file.
            self._outputs[job_name].append(file_)

    def get_job_outputs(self, job_name, data=True, transfer_only=False):
        """Return the output files for the given job.

        Parameters
        ----------
        job_name : `str`
            Name of the job.
        data : `bool`
            Whether to return the file data as well as the file object name.
            It defaults to `True` thus returning file data as well.
        transfer_only : `bool`
            Whether to only return files for which a workflow management
            system would be responsible for transferring.  It defaults to
            `False` thus returning all output files.

        Returns
        -------
        outputs : `list` [`lsst.ctrl.bps.GenericWorkflowFile`]
            Output files for the given job.  If no output files for the job,
            returns an empty list.
        """
        outputs = []

        if job_name in self._outputs:
            # Bug fix: self._outputs stores GenericWorkflowFile objects
            # (see add_job_outputs), not file names.  The previous code
            # indexed self._files with the object itself, which raised
            # KeyError whenever a job had outputs.  Mirror get_job_inputs
            # and use the stored objects directly.
            for gwfile in self._outputs[job_name]:
                if not transfer_only or gwfile.wms_transfer:
                    if not data:
                        outputs.append(gwfile.name)
                    else:
                        outputs.append(gwfile)
        return outputs

    def draw(self, stream, format_="dot"):
        """Output generic workflow in a visualization format.

        Parameters
        ----------
        stream : `str` or `io.BufferedIOBase`
            Stream to which the visualization should be written.
        format_ : `str`, optional
            Which visualization format to use.  It defaults to the format
            for the dot program.

        Raises
        ------
        RuntimeError
            If given an unknown visualization format.
        """
        draw_funcs = {"dot": draw_networkx_dot}
        if format_ in draw_funcs:
            draw_funcs[format_](self, stream)
        else:
            # Bug fix: error message was missing its closing parenthesis.
            raise RuntimeError(f"Unknown draw format ({format_})")

    def save(self, stream, format_="pickle"):
        """Save the generic workflow in a format that is loadable.

        Parameters
        ----------
        stream : `str` or `io.BufferedIOBase`
            Stream to pass to the format-specific writer.  Accepts anything
            that the writer accepts.
        format_ : `str`, optional
            Format in which to write the data.  It defaults to pickle format.

        Raises
        ------
        RuntimeError
            If given an unknown format.
        """
        if format_ == "pickle":
            nx.write_gpickle(self, stream)
        else:
            raise RuntimeError(f"Unknown format ({format_})")

    @classmethod
    def load(cls, stream, format_="pickle"):
        """Load a GenericWorkflow from the given stream.

        Parameters
        ----------
        stream : `str` or `io.BufferedIOBase`
            Stream to pass to the format-specific loader.  Accepts anything
            that the loader accepts.
        format_ : `str`, optional
            Format of data to expect when loading from stream.  It defaults
            to pickle format.

        Returns
        -------
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow loaded from the given stream.

        Raises
        ------
        RuntimeError
            If given an unknown format.
        """
        if format_ == "pickle":
            return nx.read_gpickle(stream)

        raise RuntimeError(f"Unknown format ({format_})")

    def validate(self):
        """Run checks to ensure this is still a valid generic workflow graph.
        """
        # Make sure a directed acyclic graph.
        # NOTE(review): assert is stripped under -O; kept for backward
        # compatibility with existing callers.
        assert nx.algorithms.dag.is_directed_acyclic_graph(self)

    def add_workflow_source(self, workflow):
        """Add given workflow as new source to this workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.GenericWorkflow`
            Workflow whose jobs should run before all current source jobs.
        """
        # Find source nodes in self.
        self_sources = [n for n in self if self.in_degree(n) == 0]
        _LOG.debug("self_sources = %s", self_sources)

        # Find sink nodes of workflow.
        new_sinks = [n for n in workflow if workflow.out_degree(n) == 0]
        _LOG.debug("new sinks = %s", new_sinks)

        # Add new workflow nodes to self graph and make new edges.
        self.add_nodes_from(workflow.nodes(data=True))
        self.add_edges_from(workflow.edges())
        for source in self_sources:
            for sink in new_sinks:
                self.add_edge(sink, source)

        # Files are stored separately so copy them.
        for job_name in workflow:
            self.add_job_inputs(job_name, workflow.get_job_inputs(job_name, data=True))
            self.add_job_outputs(job_name, workflow.get_job_outputs(job_name, data=True))
            self.add_executable(workflow.get_job(job_name).executable)

    def add_final(self, final):
        """Add special final job/workflow to the generic workflow.

        Parameters
        ----------
        final : `lsst.ctrl.bps.GenericWorkflowJob` or \
                `lsst.ctrl.bps.GenericWorkflow`
            Information needed to execute the special final job(s), the
            job(s) to be executed after all jobs that can be executed
            have been executed regardless of exit status of any of the
            jobs.

        Raises
        ------
        TypeError
            If given final is neither a job nor a workflow.
        """
        if not isinstance(final, (GenericWorkflowJob, GenericWorkflow)):
            # Bug fix: message was missing the f-string prefix, so the
            # "{type(final)}" placeholder was emitted literally.
            raise TypeError(f"Invalid type for GenericWorkflow final ({type(final)})")

        self._final = final
        if isinstance(final, GenericWorkflowJob):
            self.add_executable(final.executable)

    def get_final(self):
        """Return job/workflow to be executed after all jobs that can be
        executed have been executed regardless of exit status of any of
        the jobs.

        Returns
        -------
        final : `lsst.ctrl.bps.GenericWorkflowJob` or \
                `lsst.ctrl.bps.GenericWorkflow`
            Information needed to execute final job(s).
        """
        return self._final

    def add_executable(self, executable):
        """Add executable to workflow's list of executables.

        Parameters
        ----------
        executable : `lsst.ctrl.bps.GenericWorkflowExec`
            Executable object to be added to workflow.
        """
        if executable is not None:
            self._executables[executable.name] = executable
        else:
            # Deliberately best-effort: warn rather than fail so jobs
            # without an executable (e.g., placeholders) still load.
            _LOG.warning("executable not specified (None); cannot add to the workflow's list "
                         "of executables")

    def get_executables(self, data=False, transfer_only=True):
        """Retrieve executables from generic workflow.

        Parameters
        ----------
        data : `bool`, optional
            Whether to return the executable data as well as the exec object
            name.  (The defaults is False.)
        transfer_only : `bool`, optional
            Whether to only return executables for which transfer_executable
            is True.

        Returns
        -------
        execs : `list` [`lsst.ctrl.bps.GenericWorkflowExec`] or `list` [`str`]
            Filtered executable names or objects from generic workflow.
        """
        execs = []
        for name, executable in self._executables.items():
            if not transfer_only or executable.transfer_executable:
                if not data:
                    execs.append(name)
                else:
                    execs.append(executable)
        return execs