Coverage for python/lsst/ctrl/bps/generic_workflow.py : 29%

# This file is part of ctrl_bps.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

22"""Class definitions for a Generic Workflow Graph.
23"""
25__all__ = ["GenericWorkflow", "GenericWorkflowFile", "GenericWorkflowJob", "GenericWorkflowExec"]
28import dataclasses
29import itertools
30import logging
31from typing import Optional
32from collections import Counter
34from networkx import DiGraph, read_gpickle, write_gpickle, topological_sort
35from networkx.algorithms.dag import is_directed_acyclic_graph
37from lsst.daf.butler.core.utils import iterable
38from .bps_draw import draw_networkx_dot
40_LOG = logging.getLogger(__name__)
@dataclasses.dataclass
class GenericWorkflowFile:
    """Information about a file that may be needed by various workflow
    management services.
    """
    name: str
    """Lookup key (logical file name) of file/directory. Must be unique
    within run.
    """

    src_uri: Optional[str]  # don't know if this needs to be a ButlerURI
    """Original location of file/directory.
    """

    wms_transfer: bool
    """Whether the WMS should handle transferring the file. Default is False.
    """

    job_access_remote: bool
    """Whether the job can remotely access file (using separately specified
    file access protocols). Default is False.
    """

    job_shared: bool
    """Whether the file can be shared between jobs (i.e., a job does not
    require its own copy). Default is False.
    """

    # As of Python 3.7.8, __slots__ cannot be combined with dataclass fields
    # that have default values, so we write our own __init__.
    def __init__(self, name: str, src_uri: Optional[str] = None, wms_transfer: bool = False,
                 job_access_remote: bool = False, job_shared: bool = False):
        self.name = name
        self.src_uri = src_uri
        self.wms_transfer = wms_transfer
        self.job_access_remote = job_access_remote
        self.job_shared = job_shared

    __slots__ = ("name", "src_uri", "wms_transfer", "job_access_remote", "job_shared")

    def __hash__(self):
        return hash(self.name)
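
For illustration, a minimal usage sketch (the file name and URI below are hypothetical, not part of the module):

    # Hypothetical file entry that the WMS should transfer to jobs.
    butler_config = GenericWorkflowFile("butlerConfig", src_uri="/path/to/butler.yaml",
                                        wms_transfer=True)
    # Instances hash by name only, so they can serve as dict/set keys.
    assert hash(butler_config) == hash("butlerConfig")
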
@dataclasses.dataclass
class GenericWorkflowExec:
    """Information about an executable that may be needed by various workflow
    management services.
    """
    name: str
    """Lookup key (logical file name) of executable. Must be unique
    within run.
    """

    src_uri: Optional[str]  # don't know if this needs to be a ButlerURI
    """Original location of executable.
    """

    transfer_executable: bool
    """Whether the WMS/plugin is responsible for staging the executable to
    a location usable by the job.
    """

    # As of Python 3.7.8, __slots__ cannot be combined with dataclass fields
    # that have default values, so we write our own __init__.
    def __init__(self, name: str, src_uri: Optional[str] = None, transfer_executable: bool = False):
        self.name = name
        self.src_uri = src_uri
        self.transfer_executable = transfer_executable

    __slots__ = ("name", "src_uri", "transfer_executable")

    def __hash__(self):
        return hash(self.name)
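
A similar sketch for GenericWorkflowExec (the executable path is hypothetical):

    # Hypothetical executable that jobs invoke; the WMS does not stage it.
    pipetask_exec = GenericWorkflowExec("pipetask", src_uri="/software/bin/pipetask",
                                        transfer_executable=False)
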
@dataclasses.dataclass
class GenericWorkflowJob:
    """Information about a job that may be needed by various workflow
    management services.
    """
    name: str
    """Name of job. Must be unique within workflow.
    """

    label: Optional[str]
    """Primary user-facing label for job. Does not need to be unique
    and may be used for summary reports.
    """

    quanta_counts: Optional[Counter]
    """Counts of quanta per task label in job.
    """

    tags: Optional[dict]
    """Other key/value pairs for job that user may want to use as a filter.
    """

    executable: Optional[GenericWorkflowExec]
    """Executable for job.
    """

    arguments: Optional[str]
    """Command line arguments for job.
    """

    cmdvals: Optional[dict]
    """Values for variables in cmdline when using lazy command line creation.
    """

    memory_multiplier: Optional[float]
    """Memory growth rate between retries.
    """

    request_memory: Optional[int]  # MB
    """Max memory (in MB) that the job is expected to need.
    """

    request_cpus: Optional[int]  # cores
    """Max number of cpus that the job is expected to need.
    """

    request_disk: Optional[int]  # MB
    """Max amount of job scratch disk (in MB) that the job is expected to need.
    """

    request_walltime: Optional[str]  # minutes
    """Max amount of time (in minutes) that the job is expected to need.
    """

    compute_site: Optional[str]
    """Key to look up site-specific information for running the job.
    """

    mail_to: Optional[str]
    """Comma separated list of email addresses for emailing job status.
    """

    when_to_mail: Optional[str]
    """WMS-specific terminology for when to email job status.
    """

    number_of_retries: Optional[int]
    """Number of times to automatically retry a failed job.
    """

    retry_unless_exit: Optional[int]
    """Job exit code that should prevent automatic retries.
    """

    abort_on_value: Optional[int]
    """Job exit value that signals the entire workflow should be aborted.
    """

    abort_return_value: Optional[int]
    """Exit value to use when aborting the entire workflow.
    """

    priority: Optional[str]
    """Initial priority of job in WMS format.
    """

    category: Optional[str]
    """WMS-facing label of job within single workflow (e.g., can be used for
    throttling jobs within a single workflow).
    """

    concurrency_limit: Optional[str]
    """Names of concurrency limits that the WMS plugin can appropriately
    translate to limit the number of this job across all running workflows.
    """

    queue: Optional[str]
    """Name of queue to use. Different WMS can translate this concept
    differently.
    """

    pre_cmdline: Optional[str]
    """Command line to be executed prior to executing job.
    """

    post_cmdline: Optional[str]
    """Command line to be executed after job executes.

    Should be executed regardless of exit status.
    """

    preemptible: Optional[bool]
    """Flag indicating whether the job can be preempted.
    """

    profile: Optional[dict]
    """Nested dictionary of WMS-specific key/value pairs with the primary key
    being the WMS name (e.g., pegasus, condor, panda).
    """

    attrs: Optional[dict]
    """Key/value pairs of job attributes (for WMS that have attributes in
    addition to commands).
    """

    environment: Optional[dict]
    """Environment variable names and values to be explicitly set inside job.
    """

    # As of Python 3.7.8, __slots__ cannot be combined with dataclass fields
    # that have default values, so we write our own __init__.
    def __init__(self, name: str):
        self.name = name
        self.label = None
        self.quanta_counts = Counter()
        self.tags = {}
        self.executable = None
        self.arguments = None
        self.cmdvals = {}
        self.memory_multiplier = None
        self.request_memory = None
        self.request_cpus = None
        self.request_disk = None
        self.request_walltime = None
        self.compute_site = None
        self.mail_to = None
        self.when_to_mail = None
        self.number_of_retries = None
        self.retry_unless_exit = None
        self.abort_on_value = None
        self.abort_return_value = None
        self.priority = None
        self.category = None
        self.concurrency_limit = None
        self.queue = None
        self.pre_cmdline = None
        self.post_cmdline = None
        self.preemptible = None
        self.profile = {}
        self.attrs = {}
        self.environment = {}

    __slots__ = ("name", "label", "quanta_counts", "tags", "mail_to", "when_to_mail",
                 "executable", "arguments", "cmdvals",
                 "memory_multiplier", "request_memory", "request_cpus", "request_disk",
                 "request_walltime", "number_of_retries", "retry_unless_exit", "abort_on_value",
                 "abort_return_value", "compute_site", "environment", "priority", "category",
                 "concurrency_limit", "queue", "pre_cmdline", "post_cmdline", "preemptible",
                 "profile", "attrs")

    def __hash__(self):
        return hash(self.name)
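
Since the hand-written __init__ only takes the job name, all other fields are set attribute-by-attribute after construction. A sketch, with hypothetical label, arguments, and resource values (pipetask_exec is from the sketch above):

    job = GenericWorkflowJob("calibrate_visit_903342")
    job.label = "calibrate"
    job.quanta_counts = Counter({"calibrate": 1})
    job.executable = pipetask_exec                  # a GenericWorkflowExec
    job.arguments = "run -b {butlerConfig}"         # lazy command line; values in cmdvals
    job.cmdvals = {"butlerConfig": "/path/to/butler.yaml"}
    job.request_memory = 2048                       # MB
    job.request_cpus = 1
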
class GenericWorkflow(DiGraph):
    """A generic representation of a workflow used to submit to specific
    workflow management systems.

    Parameters
    ----------
    name : `str`
        Name of generic workflow.
    incoming_graph_data : `Any`, optional
        Data used to initialize the graph that is passed through to the
        DiGraph constructor. Can be any type supported by networkx.DiGraph.
    attr : `dict`
        Keyword arguments passed through to DiGraph constructor.
    """
    def __init__(self, name, incoming_graph_data=None, **attr):
        super().__init__(incoming_graph_data, **attr)
        self._name = name
        self.run_attrs = {}
        self._files = {}
        self._executables = {}
        self._inputs = {}  # mapping job.names to list of GenericWorkflowFile
        self._outputs = {}  # mapping job.names to list of GenericWorkflowFile
        self.run_id = None
        self._final = None

    @property
    def name(self):
        """Retrieve name of generic workflow.

        Returns
        -------
        name : `str`
            Name of generic workflow.
        """
        return self._name

    @property
    def quanta_counts(self):
        """Counts of quanta per task label in workflow (`collections.Counter`).
        """
        qcounts = Counter()
        for job_name in self:
            gwjob = self.get_job(job_name)
            if gwjob.quanta_counts is not None:
                qcounts += gwjob.quanta_counts
        return qcounts

    @property
    def job_counts(self):
        """Counts of jobs per job label in workflow (`collections.Counter`).
        """
        jcounts = Counter()
        for job_name in self:
            gwjob = self.get_job(job_name)
            jcounts[gwjob.label] += 1

        # Final is separate.
        final = self.get_final()
        if final:
            if isinstance(final, GenericWorkflow):
                for job_name in final:
                    gwjob = final.get_job(job_name)
                    jcounts[gwjob.label] += 1
            else:
                jcounts[final.label] += 1

        return jcounts

    def __iter__(self):
        """Return iterator of job names in topologically sorted order.
        """
        return topological_sort(self)

    def get_files(self, data=False, transfer_only=True):
        """Retrieve files from generic workflow.

        This API exists in case the way files are stored changes (e.g.,
        making the workflow a bipartite graph with job and file nodes).

        Parameters
        ----------
        data : `bool`, optional
            Whether to return the file data as well as the file object name.
            (The default is False.)
        transfer_only : `bool`, optional
            Whether to only return files for which a workflow management
            system would be responsible for transferring.

        Returns
        -------
        files : `list` [`lsst.ctrl.bps.GenericWorkflowFile`] or `list` [`str`]
            File names or objects from generic workflow meeting specifications.
        """
        files = []
        for filename, file in self._files.items():
            if not transfer_only or file.wms_transfer:
                if not data:
                    files.append(filename)
                else:
                    files.append(file)
        return files

    def add_job(self, job, parent_names=None, child_names=None):
        """Add job to generic workflow.

        Parameters
        ----------
        job : `lsst.ctrl.bps.GenericWorkflowJob`
            Job to add to the generic workflow.
        parent_names : `list` [`str`], optional
            Names of jobs that are parents of the given job.
        child_names : `list` [`str`], optional
            Names of jobs that are children of the given job.
        """
        if not isinstance(job, GenericWorkflowJob):
            raise RuntimeError(f"Invalid type for job to be added to GenericWorkflowGraph ({type(job)}).")
        if self.has_node(job.name):
            raise RuntimeError(f"Job {job.name} already exists in GenericWorkflowGraph.")
        super().add_node(job.name, job=job)
        self.add_job_relationships(parent_names, job.name)
        self.add_job_relationships(job.name, child_names)
        self.add_executable(job.executable)

    def add_node(self, node_for_adding, **attr):
        """Override networkx function to call more specific add_job function.

        Parameters
        ----------
        node_for_adding : `lsst.ctrl.bps.GenericWorkflowJob`
            Job to be added to generic workflow.
        attr :
            Needed to match original networkx function, but not used.
        """
        self.add_job(node_for_adding)

    def add_job_relationships(self, parents, children):
        """Add dependencies between parent and child jobs. All parents will
        be connected to all children.

        Parameters
        ----------
        parents : `list` [`str`]
            Parent job names.
        children : `list` [`str`]
            Child job names.
        """
        if parents is not None and children is not None:
            self.add_edges_from(itertools.product(iterable(parents), iterable(children)))

    def add_edges_from(self, ebunch_to_add, **attr):
        """Add several edges between jobs in the generic workflow.

        Parameters
        ----------
        ebunch_to_add : Iterable [`tuple`]
            Iterable of job name pairs between which a dependency should be
            saved.
        attr : keyword arguments, optional
            Data can be assigned using keyword arguments (not currently used).
        """
        for edge_to_add in ebunch_to_add:
            self.add_edge(edge_to_add[0], edge_to_add[1], **attr)

    def add_edge(self, u_of_edge: str, v_of_edge: str, **attr):
        """Add edge connecting jobs in workflow.

        Parameters
        ----------
        u_of_edge : `str`
            Name of parent job.
        v_of_edge : `str`
            Name of child job.
        attr : keyword arguments, optional
            Attributes to save with edge.
        """
        if u_of_edge not in self:
            raise RuntimeError(f"{u_of_edge} not in GenericWorkflow")
        if v_of_edge not in self:
            raise RuntimeError(f"{v_of_edge} not in GenericWorkflow")
        super().add_edge(u_of_edge, v_of_edge, **attr)

    def get_job(self, job_name: str):
        """Retrieve job by name from workflow.

        Parameters
        ----------
        job_name : `str`
            Name of job to retrieve.

        Returns
        -------
        job : `lsst.ctrl.bps.GenericWorkflowJob`
            Job matching given job_name.
        """
        return self.nodes[job_name]["job"]

    def del_job(self, job_name: str):
        """Delete job from generic workflow leaving connected graph.

        Parameters
        ----------
        job_name : `str`
            Name of job to delete from workflow.
        """
        # Connect all parent jobs to all children jobs. Materialize the
        # iterators before mutating the graph.
        parents = list(self.predecessors(job_name))
        children = list(self.successors(job_name))
        self.add_job_relationships(parents, children)

        # Delete job node (which also deletes its edges).
        self.remove_node(job_name)

    def add_job_inputs(self, job_name, files):
        """Add files as inputs to specified job.

        Parameters
        ----------
        job_name : `str`
            Name of job to which inputs should be added.
        files : `lsst.ctrl.bps.GenericWorkflowFile` or \
                `list` [`lsst.ctrl.bps.GenericWorkflowFile`]
            File object(s) to be added as inputs to the specified job.
        """
        self._inputs.setdefault(job_name, [])
        for file in iterable(files):
            # Save the central copy.
            if file.name not in self._files:
                self._files[file.name] = file

            # Save the job reference to the file.
            self._inputs[job_name].append(file)

    def get_file(self, name):
        """Retrieve a file object by name.

        Parameters
        ----------
        name : `str`
            Name of file object.

        Returns
        -------
        gwfile : `lsst.ctrl.bps.GenericWorkflowFile`
            File matching given name.
        """
        return self._files[name]

    def add_file(self, gwfile):
        """Add file object.

        Parameters
        ----------
        gwfile : `lsst.ctrl.bps.GenericWorkflowFile`
            File object to add to workflow.
        """
        if gwfile.name not in self._files:
            self._files[gwfile.name] = gwfile
        else:
            _LOG.debug("Skipped add_file for existing file %s", gwfile.name)

    def get_job_inputs(self, job_name, data=True, transfer_only=False):
        """Return the input files for the given job.

        Parameters
        ----------
        job_name : `str`
            Name of the job.
        data : `bool`, optional
            Whether to return the file data as well as the file object name.
        transfer_only : `bool`, optional
            Whether to only return files for which a workflow management
            system would be responsible for transferring.

        Returns
        -------
        inputs : `list` [`lsst.ctrl.bps.GenericWorkflowFile`]
            Input files for the given job. If no input files for the job,
            returns an empty list.
        """
        inputs = []
        if job_name in self._inputs:
            for gwfile in self._inputs[job_name]:
                if not transfer_only or gwfile.wms_transfer:
                    if not data:
                        inputs.append(gwfile.name)
                    else:
                        inputs.append(gwfile)
        return inputs

    def add_job_outputs(self, job_name, files):
        """Add output files to a job.

        Parameters
        ----------
        job_name : `str`
            Name of job to which the files should be added as outputs.
        files : `list` [`lsst.ctrl.bps.GenericWorkflowFile`]
            File objects to be added as outputs for specified job.
        """
        self._outputs.setdefault(job_name, [])

        for file_ in iterable(files):
            # Save the central copy.
            if file_.name not in self._files:
                self._files[file_.name] = file_

            # Save the job reference to the file.
            self._outputs[job_name].append(file_)

    def get_job_outputs(self, job_name, data=True, transfer_only=False):
        """Return the output files for the given job.

        Parameters
        ----------
        job_name : `str`
            Name of the job.
        data : `bool`
            Whether to return the file data as well as the file object name.
            It defaults to `True` thus returning file data as well.
        transfer_only : `bool`
            Whether to only return files for which a workflow management
            system would be responsible for transferring. It defaults to
            `False` thus returning all output files.

        Returns
        -------
        outputs : `list` [`lsst.ctrl.bps.GenericWorkflowFile`]
            Output files for the given job. If no output files for the job,
            returns an empty list.
        """
        outputs = []

        if job_name in self._outputs:
            # self._outputs holds GenericWorkflowFile objects (see
            # add_job_outputs), so iterate over them directly rather than
            # treating them as lookup keys into self._files.
            for gwfile in self._outputs[job_name]:
                if not transfer_only or gwfile.wms_transfer:
                    if not data:
                        outputs.append(gwfile.name)
                    else:
                        outputs.append(gwfile)
        return outputs

    def draw(self, stream, format_="dot"):
        """Output generic workflow in a visualization format.

        Parameters
        ----------
        stream : `str` or `io.BufferedIOBase`
            Stream to which the visualization should be written.
        format_ : `str`, optional
            Which visualization format to use. It defaults to the format for
            the dot program.
        """
        draw_funcs = {"dot": draw_networkx_dot}
        if format_ in draw_funcs:
            draw_funcs[format_](self, stream)
        else:
            raise RuntimeError(f"Unknown draw format ({format_})")

    def save(self, stream, format_="pickle"):
        """Save the generic workflow in a format that is loadable.

        Parameters
        ----------
        stream : `str` or `io.BufferedIOBase`
            Stream to pass to the format-specific writer. Accepts anything
            that the writer accepts.
        format_ : `str`, optional
            Format in which to write the data. It defaults to pickle format.
        """
        if format_ == "pickle":
            write_gpickle(self, stream)
        else:
            raise RuntimeError(f"Unknown format ({format_})")

    @classmethod
    def load(cls, stream, format_="pickle"):
        """Load a GenericWorkflow from the given stream.

        Parameters
        ----------
        stream : `str` or `io.BufferedIOBase`
            Stream to pass to the format-specific loader. Accepts anything
            that the loader accepts.
        format_ : `str`, optional
            Format of data to expect when loading from stream. It defaults
            to pickle format.

        Returns
        -------
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow loaded from the given stream.
        """
        if format_ == "pickle":
            return read_gpickle(stream)

        raise RuntimeError(f"Unknown format ({format_})")

    def validate(self):
        """Run checks to ensure this is still a valid generic workflow graph.
        """
        # Make sure it is a directed acyclic graph.
        assert is_directed_acyclic_graph(self)

    def add_workflow_source(self, workflow):
        """Add given workflow as new source to this workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.GenericWorkflow`
            Workflow to insert as a new source of this workflow.
        """
        # Find source nodes in self.
        self_sources = [n for n in self if self.in_degree(n) == 0]
        _LOG.debug("self_sources = %s", self_sources)

        # Find sink nodes of workflow.
        new_sinks = [n for n in workflow if workflow.out_degree(n) == 0]
        _LOG.debug("new sinks = %s", new_sinks)

        # Add new workflow nodes to self graph and make new edges.
        self.add_nodes_from(workflow.nodes(data=True))
        self.add_edges_from(workflow.edges())
        for source in self_sources:
            for sink in new_sinks:
                self.add_edge(sink, source)

        # Files are stored separately so copy them.
        for job_name in workflow:
            self.add_job_inputs(job_name, workflow.get_job_inputs(job_name, data=True))
            self.add_job_outputs(job_name, workflow.get_job_outputs(job_name, data=True))
            self.add_executable(workflow.get_job(job_name).executable)

    def add_final(self, final):
        """Add special final job/workflow to the generic workflow.

        Parameters
        ----------
        final : `lsst.ctrl.bps.GenericWorkflowJob` or \
                `lsst.ctrl.bps.GenericWorkflow`
            Information needed to execute the special final job(s), the
            job(s) to be executed after all jobs that can be executed
            have been executed, regardless of the exit status of any of
            the jobs.
        """
        if not isinstance(final, GenericWorkflowJob) and not isinstance(final, GenericWorkflow):
            raise TypeError(f"Invalid type for GenericWorkflow final ({type(final)})")

        self._final = final
        if isinstance(final, GenericWorkflowJob):
            self.add_executable(final.executable)

    def get_final(self):
        """Return job/workflow to be executed after all jobs that can be
        executed have been executed regardless of exit status of any of
        the jobs.

        Returns
        -------
        final : `lsst.ctrl.bps.GenericWorkflowJob` or \
                `lsst.ctrl.bps.GenericWorkflow`
            Information needed to execute final job(s).
        """
        return self._final

    def add_executable(self, executable):
        """Add executable to workflow's list of executables.

        Parameters
        ----------
        executable : `lsst.ctrl.bps.GenericWorkflowExec`
            Executable object to be added to workflow.
        """
        if executable is not None:
            self._executables[executable.name] = executable
        else:
            _LOG.warning("executable not specified (None); cannot add to the workflow's list of executables")

    def get_executables(self, data=False, transfer_only=True):
        """Retrieve executables from generic workflow.

        Parameters
        ----------
        data : `bool`, optional
            Whether to return the executable data as well as the exec object
            name. (The default is False.)
        transfer_only : `bool`, optional
            Whether to only return executables for which transfer_executable
            is True.

        Returns
        -------
        execs : `list` [`lsst.ctrl.bps.GenericWorkflowExec`] or `list` [`str`]
            Filtered executable names or objects from generic workflow.
        """
        execs = []
        for name, executable in self._executables.items():
            if not transfer_only or executable.transfer_executable:
                if not data:
                    execs.append(name)
                else:
                    execs.append(executable)
        return execs
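
To tie the pieces together, a small end-to-end sketch of assembling, validating, and inspecting a workflow; all job and file names are hypothetical:

    gw = GenericWorkflow("demo_run")

    init_job = GenericWorkflowJob("jobA")
    init_job.label = "init"
    gw.add_job(init_job)

    science_job = GenericWorkflowJob("jobB")
    science_job.label = "science"
    gw.add_job(science_job, parent_names=["jobA"])   # adds edge jobA -> jobB

    gw.add_job_inputs("jobB", GenericWorkflowFile("input1", src_uri="/tmp/in.txt",
                                                  wms_transfer=True))

    gw.validate()                     # asserts the graph is still a DAG
    print(list(gw))                   # ['jobA', 'jobB'] (topological order)
    print(gw.job_counts)              # Counter({'init': 1, 'science': 1})
    print(gw.get_files(data=False))   # ['input1'] (transfer_only=True by default)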