# This file is part of ctrl_bps.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

"""Class definitions for a Generic Workflow Graph."""

__all__ = ["GenericWorkflow", "GenericWorkflowFile", "GenericWorkflowJob", "GenericWorkflowExec"]

import dataclasses
import itertools
import logging
import pickle
from collections import Counter
from typing import Optional

from lsst.utils.iteration import ensure_iterable
from networkx import DiGraph, topological_sort
from networkx.algorithms.dag import is_directed_acyclic_graph

from .bps_draw import draw_networkx_dot

_LOG = logging.getLogger(__name__)


@dataclasses.dataclass
class GenericWorkflowFile:
    """Information about a file that may be needed by various workflow
    management services.
    """

    name: str
    """Lookup key (logical file name) of file/directory. Must be unique
    within run.
    """

    src_uri: Optional[str]  # don't know that we need ResourcePath
    """Original location of file/directory.
    """

    wms_transfer: bool
    """Whether the WMS is responsible for transferring the file. Defaults
    to False.
    """

    job_access_remote: bool
    """Whether the job can remotely access file (using separately specified
    file access protocols). Defaults to False.
    """

    job_shared: bool
    """Whether the file can be shared between jobs rather than each job
    requiring its own copy. Defaults to False.
    """

    # As of Python 3.7.8, a dataclass cannot combine __slots__ with default
    # values, so we write our own __init__.
    def __init__(
        self,
        name: str,
        src_uri: Optional[str] = None,
        wms_transfer: bool = False,
        job_access_remote: bool = False,
        job_shared: bool = False,
    ):
        self.name = name
        self.src_uri = src_uri
        self.wms_transfer = wms_transfer
        self.job_access_remote = job_access_remote
        self.job_shared = job_shared

    __slots__ = ("name", "src_uri", "wms_transfer", "job_access_remote", "job_shared")

    def __hash__(self):
        return hash(self.name)
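

# Illustrative usage (a minimal sketch, not part of the original module; the
# logical file name and path below are hypothetical):
#
#     gwfile = GenericWorkflowFile(
#         "butlerConfig", src_uri="/path/to/butler.yaml", wms_transfer=True
#     )
#     # Files hash by name only, so the name must be unique within a run.
#     assert hash(gwfile) == hash("butlerConfig")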


@dataclasses.dataclass
class GenericWorkflowExec:
    """Information about an executable that may be needed by various workflow
    management services.
    """

    name: str
    """Lookup key (logical file name) of executable. Must be unique
    within run.
    """

    src_uri: Optional[str]  # don't know that we need ResourcePath
    """Original location of executable.
    """

    transfer_executable: bool
    """Whether the WMS/plugin is responsible for staging the executable to
    a location usable by the job.
    """

    # As of Python 3.7.8, a dataclass cannot combine __slots__ with default
    # values, so we write our own __init__.
    def __init__(self, name: str, src_uri: Optional[str] = None, transfer_executable: bool = False):
        self.name = name
        self.src_uri = src_uri
        self.transfer_executable = transfer_executable

    __slots__ = ("name", "src_uri", "transfer_executable")

    def __hash__(self):
        return hash(self.name)


@dataclasses.dataclass
class GenericWorkflowJob:
    """Information about a job that may be needed by various workflow
    management services.
    """

    name: str
    """Name of job. Must be unique within workflow.
    """

    label: Optional[str]
    """Primary user-facing label for job. Does not need to be unique
    and may be used for summary reports.
    """

    quanta_counts: Optional[Counter]
    """Counts of quanta per task label in job.
    """

    tags: Optional[dict]
    """Other key/value pairs for job that user may want to use as a filter.
    """

    executable: Optional[GenericWorkflowExec]
    """Executable for job.
    """

    arguments: Optional[str]
    """Command line arguments for job.
    """

    cmdvals: Optional[dict]
    """Values for variables in cmdline when using lazy command line creation.
    """

    memory_multiplier: Optional[float]
    """Memory growth rate between retries.
    """

    request_memory: Optional[int]  # MB
    """Max memory (in MB) that the job is expected to need.
    """

    request_memory_max: Optional[int]  # MB
    """Max memory (in MB) that the job should ever use.
    """

    request_cpus: Optional[int]  # cores
    """Max number of cpus that the job is expected to need.
    """

    request_disk: Optional[int]  # MB
    """Max amount of job scratch disk (in MB) that the job is expected to
    need.
    """

    request_walltime: Optional[str]  # minutes
    """Max amount of time (in minutes) that the job is expected to need.
    """

    compute_site: Optional[str]
    """Key to look up site-specific information for running the job.
    """

    accounting_group: Optional[str]
    """Name of the accounting group to use.
    """

    accounting_user: Optional[str]
    """Name of the user to use for accounting purposes.
    """

    mail_to: Optional[str]
    """Comma separated list of email addresses for emailing job status.
    """

    when_to_mail: Optional[str]
    """WMS-specific terminology for when to email job status.
    """

    number_of_retries: Optional[int]
    """Number of times to automatically retry a failed job.
    """

    retry_unless_exit: Optional[int]
    """Exit code that, when returned by the job, means it should not be
    automatically retried.
    """

    abort_on_value: Optional[int]
    """Job exit value that signals the entire workflow should be aborted.
    """

    abort_return_value: Optional[int]
    """Exit value to use when aborting the entire workflow.
    """

    priority: Optional[str]
    """Initial priority of job in WMS format.
    """

    category: Optional[str]
    """WMS-facing label of job within single workflow (e.g., can be used for
    throttling jobs within a single workflow).
    """

    concurrency_limit: Optional[str]
    """Name of concurrency limit that the WMS plugin can appropriately
    translate to limit the number of this job across all running workflows.
    """

    queue: Optional[str]
    """Name of queue to use. Different WMS can translate this concept
    differently.
    """

    pre_cmdline: Optional[str]
    """Command line to be executed prior to executing job.
    """

    post_cmdline: Optional[str]
    """Command line to be executed after job executes.

    Should be executed regardless of exit status.
    """

    preemptible: Optional[bool]
    """Flag indicating whether the job can be preempted.
    """

    profile: Optional[dict]
    """Nested dictionary of WMS-specific key/value pairs with primary key
    being WMS key (e.g., pegasus, condor, panda).
    """

    attrs: Optional[dict]
    """Key/value pairs of job attributes (for WMS that have attributes in
    addition to commands).
    """

    environment: Optional[dict]
    """Environment variable names and values to be explicitly set inside job.
    """

    compute_cloud: Optional[str]
    """Key to look up cloud-specific information for running the job.
    """

    # As of Python 3.7.8, a dataclass cannot combine __slots__ with default
    # values, so we write our own __init__.
    def __init__(self, name: str):
        self.name = name
        self.label = None
        self.quanta_counts = Counter()
        self.tags = {}
        self.executable = None
        self.arguments = None
        self.cmdvals = {}
        self.memory_multiplier = None
        self.request_memory = None
        self.request_memory_max = None
        self.request_cpus = None
        self.request_disk = None
        self.request_walltime = None
        self.compute_site = None
        self.accounting_group = None
        self.accounting_user = None
        self.mail_to = None
        self.when_to_mail = None
        self.number_of_retries = None
        self.retry_unless_exit = None
        self.abort_on_value = None
        self.abort_return_value = None
        self.priority = None
        self.category = None
        self.concurrency_limit = None
        self.queue = None
        self.pre_cmdline = None
        self.post_cmdline = None
        self.preemptible = None
        self.profile = {}
        self.attrs = {}
        self.environment = {}
        self.compute_cloud = None

    __slots__ = (
        "name",
        "label",
        "quanta_counts",
        "tags",
        "mail_to",
        "when_to_mail",
        "executable",
        "arguments",
        "cmdvals",
        "memory_multiplier",
        "request_memory",
        "request_memory_max",
        "request_cpus",
        "request_disk",
        "request_walltime",
        "number_of_retries",
        "retry_unless_exit",
        "abort_on_value",
        "abort_return_value",
        "compute_site",
        "accounting_group",
        "accounting_user",
        "environment",
        "priority",
        "category",
        "concurrency_limit",
        "queue",
        "pre_cmdline",
        "post_cmdline",
        "preemptible",
        "profile",
        "attrs",
        "compute_cloud",
    )

    def __hash__(self):
        return hash(self.name)
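

# Illustrative usage (a minimal sketch, not part of the original module; the
# job name, label, and resource values below are hypothetical):
#
#     gwjob = GenericWorkflowJob("calibrate_visit_903334")
#     gwjob.label = "calibrate"
#     gwjob.request_memory = 2048  # MB
#     gwjob.quanta_counts = Counter({"calibrate": 1})
#     gwjob.executable = GenericWorkflowExec("pipetask", src_uri="pipetask")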


class GenericWorkflow(DiGraph):
    """A generic representation of a workflow used to submit to specific
    workflow management systems.

    Parameters
    ----------
    name : `str`
        Name of generic workflow.
    incoming_graph_data : `Any`, optional
        Data used to initialize the graph that is passed through to the
        DiGraph constructor. Can be any type supported by networkx.DiGraph.
    attr : `dict`
        Keyword arguments passed through to DiGraph constructor.
    """

    def __init__(self, name, incoming_graph_data=None, **attr):
        super().__init__(incoming_graph_data, **attr)
        self._name = name
        self.run_attrs = {}
        self._files = {}
        self._executables = {}
        self._inputs = {}  # mapping job.names to list of GenericWorkflowFile
        self._outputs = {}  # mapping job.names to list of GenericWorkflowFile
        self.run_id = None
        self._final = None

    @property
    def name(self):
        """Retrieve name of generic workflow.

        Returns
        -------
        name : `str`
            Name of generic workflow.
        """
        return self._name

    @property
    def quanta_counts(self):
        """Count of quanta per task label (`collections.Counter`)."""
        qcounts = Counter()
        for job_name in self:
            gwjob = self.get_job(job_name)
            if gwjob.quanta_counts is not None:
                qcounts += gwjob.quanta_counts
        return qcounts

    @property
    def job_counts(self):
        """Count of jobs per job label (`collections.Counter`)."""
        jcounts = Counter()
        for job_name in self:
            gwjob = self.get_job(job_name)
            jcounts[gwjob.label] += 1

        # Final is counted separately.
        final = self.get_final()
        if final:
            if isinstance(final, GenericWorkflow):
                for job_name in final:
                    gwjob = final.get_job(job_name)
                    jcounts[gwjob.label] += 1
            else:
                jcounts[final.label] += 1

        return jcounts

    def __iter__(self):
        """Return iterator of job names in topologically sorted order."""
        return topological_sort(self)

    def get_files(self, data=False, transfer_only=True):
        """Retrieve files from generic workflow.

        Need API in case the way files are stored changes (e.g., make
        workflow a bipartite graph with job and file nodes).

        Parameters
        ----------
        data : `bool`, optional
            Whether to return the file data as well as the file object name.
            (The default is False.)
        transfer_only : `bool`, optional
            Whether to only return files for which a workflow management
            system would be responsible for transferring.

        Returns
        -------
        files : `list` [`lsst.ctrl.bps.GenericWorkflowFile`] or `list` [`str`]
            File names or objects from generic workflow meeting
            specifications.
        """
        files = []
        for filename, file in self._files.items():
            if not transfer_only or file.wms_transfer:
                if not data:
                    files.append(filename)
                else:
                    files.append(file)
        return files

    def add_job(self, job, parent_names=None, child_names=None):
        """Add job to generic workflow.

        Parameters
        ----------
        job : `lsst.ctrl.bps.GenericWorkflowJob`
            Job to add to the generic workflow.
        parent_names : `list` [`str`], optional
            Names of jobs that are parents of given job.
        child_names : `list` [`str`], optional
            Names of jobs that are children of given job.
        """
        if not isinstance(job, GenericWorkflowJob):
            raise RuntimeError(f"Invalid type for job to be added to GenericWorkflowGraph ({type(job)}).")
        if self.has_node(job.name):
            raise RuntimeError(f"Job {job.name} already exists in GenericWorkflowGraph.")
        super().add_node(job.name, job=job)
        self.add_job_relationships(parent_names, job.name)
        self.add_job_relationships(job.name, child_names)
        self.add_executable(job.executable)
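
    # Illustrative usage (a minimal sketch, not part of the original module;
    # the workflow name and job names below are hypothetical):
    #
    #     gwf = GenericWorkflow("my_run")
    #     for name in ("init", "task_a", "task_b"):
    #         gwf.add_job(GenericWorkflowJob(name))
    #     gwf.add_job(GenericWorkflowJob("final_task"),
    #                 parent_names=["task_a", "task_b"])
    #     gwf.add_job_relationships("init", ["task_a", "task_b"])
    #     list(gwf)  # job names in topological order, starting with "init"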

    def add_node(self, node_for_adding, **attr):
        """Override networkx function to call more specific add_job function.

        Parameters
        ----------
        node_for_adding : `lsst.ctrl.bps.GenericWorkflowJob`
            Job to be added to generic workflow.
        attr :
            Needed to match original networkx function, but not used.
        """
        self.add_job(node_for_adding)

    def add_job_relationships(self, parents, children):
        """Add dependencies between parent and child jobs. All parents will
        be connected to all children.

        Parameters
        ----------
        parents : `str` or `list` [`str`]
            Parent job name(s).
        children : `str` or `list` [`str`]
            Child job name(s).
        """
        if parents is not None and children is not None:
            self.add_edges_from(itertools.product(ensure_iterable(parents), ensure_iterable(children)))

    def add_edges_from(self, ebunch_to_add, **attr):
        """Add several edges between jobs in the generic workflow.

        Parameters
        ----------
        ebunch_to_add : iterable [`tuple`]
            Iterable of job name pairs between which a dependency should be
            saved.
        attr : keyword arguments, optional
            Data can be assigned using keyword arguments (not currently used).
        """
        for edge_to_add in ebunch_to_add:
            self.add_edge(edge_to_add[0], edge_to_add[1], **attr)

    def add_edge(self, u_of_edge: str, v_of_edge: str, **attr):
        """Add edge connecting jobs in workflow.

        Parameters
        ----------
        u_of_edge : `str`
            Name of parent job.
        v_of_edge : `str`
            Name of child job.
        attr : keyword arguments, optional
            Attributes to save with edge.
        """
        if u_of_edge not in self:
            raise RuntimeError(f"{u_of_edge} not in GenericWorkflow")
        if v_of_edge not in self:
            raise RuntimeError(f"{v_of_edge} not in GenericWorkflow")
        super().add_edge(u_of_edge, v_of_edge, **attr)

    def get_job(self, job_name: str):
        """Retrieve job by name from workflow.

        Parameters
        ----------
        job_name : `str`
            Name of job to retrieve.

        Returns
        -------
        job : `lsst.ctrl.bps.GenericWorkflowJob`
            Job matching given job_name.
        """
        return self.nodes[job_name]["job"]

    def del_job(self, job_name: str):
        """Delete job from generic workflow leaving connected graph.

        Parameters
        ----------
        job_name : `str`
            Name of job to delete from workflow.
        """
        # Connect all parent jobs to all children jobs.
        parents = self.predecessors(job_name)
        children = self.successors(job_name)
        self.add_job_relationships(parents, children)

        # Delete job node (which deletes its edges).
        self.remove_node(job_name)
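
    # Illustrative behavior (a minimal sketch, not part of the original
    # module): deleting the middle job of a chain a -> b -> c reconnects the
    # deleted job's parents to its children, so a -> c remains an edge.
    #
    #     gwf.del_job("b")
    #     assert gwf.has_edge("a", "c")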

    def add_job_inputs(self, job_name, files):
        """Add files as inputs to specified job.

        Parameters
        ----------
        job_name : `str`
            Name of job to which inputs should be added.
        files : `lsst.ctrl.bps.GenericWorkflowFile` or \
                `list` [`lsst.ctrl.bps.GenericWorkflowFile`]
            File object(s) to be added as inputs to the specified job.
        """
        self._inputs.setdefault(job_name, [])
        for file in ensure_iterable(files):
            # Save the central copy.
            if file.name not in self._files:
                self._files[file.name] = file

            # Save the job reference to the file.
            self._inputs[job_name].append(file)

    def get_file(self, name):
        """Retrieve a file object by name.

        Parameters
        ----------
        name : `str`
            Name of file object.

        Returns
        -------
        gwfile : `lsst.ctrl.bps.GenericWorkflowFile`
            File matching given name.
        """
        return self._files[name]

    def add_file(self, gwfile):
        """Add file object.

        Parameters
        ----------
        gwfile : `lsst.ctrl.bps.GenericWorkflowFile`
            File object to add to workflow.
        """
        if gwfile.name not in self._files:
            self._files[gwfile.name] = gwfile
        else:
            _LOG.debug("Skipped add_file for existing file %s", gwfile.name)

    def get_job_inputs(self, job_name, data=True, transfer_only=False):
        """Return the input files for the given job.

        Parameters
        ----------
        job_name : `str`
            Name of the job.
        data : `bool`, optional
            Whether to return the file data as well as the file object name.
        transfer_only : `bool`, optional
            Whether to only return files for which a workflow management
            system would be responsible for transferring.

        Returns
        -------
        inputs : `list` [`lsst.ctrl.bps.GenericWorkflowFile`]
            Input files for the given job. If no input files for the job,
            returns an empty list.
        """
        inputs = []
        if job_name in self._inputs:
            for gwfile in self._inputs[job_name]:
                if not transfer_only or gwfile.wms_transfer:
                    if not data:
                        inputs.append(gwfile.name)
                    else:
                        inputs.append(gwfile)
        return inputs

    def add_job_outputs(self, job_name, files):
        """Add output files to a job.

        Parameters
        ----------
        job_name : `str`
            Name of job to which the files should be added as outputs.
        files : `list` [`lsst.ctrl.bps.GenericWorkflowFile`]
            File objects to be added as outputs for specified job.
        """
        self._outputs.setdefault(job_name, [])

        for file_ in ensure_iterable(files):
            # Save the central copy.
            if file_.name not in self._files:
                self._files[file_.name] = file_

            # Save the job reference to the file.
            self._outputs[job_name].append(file_)

    def get_job_outputs(self, job_name, data=True, transfer_only=False):
        """Return the output files for the given job.

        Parameters
        ----------
        job_name : `str`
            Name of the job.
        data : `bool`
            Whether to return the file data as well as the file object name.
            It defaults to `True` thus returning file data as well.
        transfer_only : `bool`
            Whether to only return files for which a workflow management
            system would be responsible for transferring. It defaults to
            `False` thus returning all output files.

        Returns
        -------
        outputs : `list` [`lsst.ctrl.bps.GenericWorkflowFile`]
            Output files for the given job. If no output files for the job,
            returns an empty list.
        """
        outputs = []

        if job_name in self._outputs:
            # Entries in self._outputs are GenericWorkflowFile objects (see
            # add_job_outputs), so use them directly instead of treating them
            # as names to look up in self._files.
            for gwfile in self._outputs[job_name]:
                if not transfer_only or gwfile.wms_transfer:
                    if not data:
                        outputs.append(gwfile.name)
                    else:
                        outputs.append(gwfile)
        return outputs
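
    # Illustrative usage (a minimal sketch, not part of the original module;
    # the job and file names below are hypothetical):
    #
    #     gwf.add_job_inputs("task_a", GenericWorkflowFile("in.fits"))
    #     gwf.add_job_outputs("task_a", [GenericWorkflowFile("out.fits")])
    #     gwf.get_job_inputs("task_a", data=False)   # -> ["in.fits"]
    #     gwf.get_job_outputs("task_a", data=False)  # -> ["out.fits"]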

    def draw(self, stream, format_="dot"):
        """Output generic workflow in a visualization format.

        Parameters
        ----------
        stream : `str` or `io.BufferedIOBase`
            Stream to which the visualization should be written.
        format_ : `str`, optional
            Which visualization format to use. It defaults to the format for
            the dot program.
        """
        draw_funcs = {"dot": draw_networkx_dot}
        if format_ in draw_funcs:
            draw_funcs[format_](self, stream)
        else:
            raise RuntimeError(f"Unknown draw format ({format_})")

    def save(self, stream, format_="pickle"):
        """Save the generic workflow in a format that is loadable.

        Parameters
        ----------
        stream : `str` or `io.BufferedIOBase`
            Stream to pass to the format-specific writer. Accepts anything
            that the writer accepts.
        format_ : `str`, optional
            Format in which to write the data. It defaults to pickle format.
        """
        if format_ == "pickle":
            pickle.dump(self, stream)
        else:
            raise RuntimeError(f"Unknown format ({format_})")

    @classmethod
    def load(cls, stream, format_="pickle"):
        """Load a GenericWorkflow from the given stream.

        Parameters
        ----------
        stream : `str` or `io.BufferedIOBase`
            Stream to pass to the format-specific loader. Accepts anything
            that the loader accepts.
        format_ : `str`, optional
            Format of data to expect when loading from stream. It defaults
            to pickle format.

        Returns
        -------
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow loaded from the given stream.
        """
        if format_ == "pickle":
            return pickle.load(stream)

        raise RuntimeError(f"Unknown format ({format_})")
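
    # Illustrative round trip (a minimal sketch, not part of the original
    # module; the filename below is hypothetical). Pickle streams are binary,
    # so open files in binary mode:
    #
    #     with open("workflow.pickle", "wb") as fh:
    #         gwf.save(fh)
    #     with open("workflow.pickle", "rb") as fh:
    #         gwf2 = GenericWorkflow.load(fh)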

    def validate(self):
        """Run checks to ensure that the generic workflow graph is valid."""
        # Make sure the graph is a directed acyclic graph.
        assert is_directed_acyclic_graph(self)

    def add_workflow_source(self, workflow):
        """Add given workflow as new source to this workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.GenericWorkflow`
            Workflow to add as a source of this workflow.
        """
        # Find source nodes in self.
        self_sources = [n for n in self if self.in_degree(n) == 0]
        _LOG.debug("self_sources = %s", self_sources)

        # Find sink nodes of workflow.
        new_sinks = [n for n in workflow if workflow.out_degree(n) == 0]
        _LOG.debug("new sinks = %s", new_sinks)

        # Add new workflow nodes to self graph and make new edges.
        self.add_nodes_from(workflow.nodes(data=True))
        self.add_edges_from(workflow.edges())
        for source in self_sources:
            for sink in new_sinks:
                self.add_edge(sink, source)

        # Files are stored separately so copy them.
        for job_name in workflow:
            self.add_job_inputs(job_name, workflow.get_job_inputs(job_name, data=True))
            self.add_job_outputs(job_name, workflow.get_job_outputs(job_name, data=True))
            self.add_executable(workflow.get_job(job_name).executable)

    def add_final(self, final):
        """Add special final job/workflow to the generic workflow.

        Parameters
        ----------
        final : `lsst.ctrl.bps.GenericWorkflowJob` or \
                `lsst.ctrl.bps.GenericWorkflow`
            Information needed to execute the special final job(s), the
            job(s) to be executed after all jobs that can be executed
            have been executed regardless of exit status of any of the
            jobs.
        """
        if not isinstance(final, (GenericWorkflowJob, GenericWorkflow)):
            raise TypeError(f"Invalid type for GenericWorkflow final ({type(final)})")

        self._final = final
        if isinstance(final, GenericWorkflowJob):
            self.add_executable(final.executable)

    def get_final(self):
        """Return job/workflow to be executed after all jobs that can be
        executed have been executed regardless of exit status of any of
        the jobs.

        Returns
        -------
        final : `lsst.ctrl.bps.GenericWorkflowJob` or \
                `lsst.ctrl.bps.GenericWorkflow`
            Information needed to execute final job(s).
        """
        return self._final

    def add_executable(self, executable):
        """Add executable to workflow's list of executables.

        Parameters
        ----------
        executable : `lsst.ctrl.bps.GenericWorkflowExec`
            Executable object to be added to workflow.
        """
        if executable is not None:
            self._executables[executable.name] = executable
        else:
            _LOG.warning("executable not specified (None); cannot add to the workflow's list of executables")

    def get_executables(self, data=False, transfer_only=True):
        """Retrieve executables from generic workflow.

        Parameters
        ----------
        data : `bool`, optional
            Whether to return the executable data as well as the exec object
            name. (The default is False.)
        transfer_only : `bool`, optional
            Whether to only return executables for which transfer_executable
            is True.

        Returns
        -------
        execs : `list` [`lsst.ctrl.bps.GenericWorkflowExec`] or `list` [`str`]
            Filtered executable names or objects from generic workflow.
        """
        execs = []
        for name, executable in self._executables.items():
            if not transfer_only or executable.transfer_executable:
                if not data:
                    execs.append(name)
                else:
                    execs.append(executable)
        return execs