Coverage for python/lsst/ctrl/bps/generic_workflow.py : 33%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Class definitions for a Generic Workflow Graph.
"""
25import dataclasses
26import itertools
27from typing import Optional
28import networkx as nx
30from lsst.daf.butler.core.utils import iterable
31from lsst.daf.butler import DatasetRef
32from lsst.pipe.base import QuantumGraph
33from .bps_draw import draw_networkx_dot
@dataclasses.dataclass
class GenericWorkflowFile:
    """Information about a file that may be needed by various workflow
    management services.
    """
    # NOTE: ``X or None`` in an annotation evaluates to just ``X`` at class
    # creation time, so the intent (value may be None) must be spelled
    # ``Optional[X]``.
    name: str
    wms_transfer: bool
    src_uri: Optional[str]       # don't know that need ButlerURI
    dataset_ref: Optional["DatasetRef"]
    dest_uri: Optional[str]      # don't know that need ButlerURI
    logical_file_name: Optional[str]

    # As of python 3.7.8, can't use __slots__ + dataclass if give default
    # values, so writing own __init__.
    def __init__(self, name: str, wms_transfer: bool = False, src_uri=None,
                 dataset_ref=None, dest_uri=None, logical_file_name=None):
        self.name = name
        self.wms_transfer = wms_transfer
        self.src_uri = src_uri
        self.dataset_ref = dataset_ref
        self.dest_uri = dest_uri
        self.logical_file_name = logical_file_name

    __slots__ = ("name", "wms_transfer", "dataset_ref", "src_uri", "dest_uri", "logical_file_name")

    def __hash__(self):
        # Files are identified solely by name (used as dict key elsewhere).
        return hash(self.name)

    def __str__(self):
        # Fixed copy-paste bug: previously reported the wrong class name
        # ("GenericWorkflowJob").
        return f"GenericWorkflowFile(name={self.name})"
@dataclasses.dataclass
class GenericWorkflowJob:
    """Information about a job that may be needed by various workflow
    management services.
    """
    name: str
    label: Optional[str]
    tags: Optional[str]
    cmdline: Optional[str]
    request_memory: Optional[int]    # MB
    request_cpus: Optional[int]      # cores
    request_disk: Optional[int]      # MB
    request_walltime: Optional[str]  # minutes
    compute_site: Optional[str]
    mail_to: Optional[str]
    when_to_mail: Optional[str]
    number_of_retries: Optional[int]
    retry_unless_exit: Optional[int]
    abort_on_value: Optional[int]
    abort_return_value: Optional[int]
    priority: Optional[str]
    category: Optional[str]
    pre_cmdline: Optional[str]
    post_cmdline: Optional[str]
    profile: Optional[dict]
    attrs: Optional[dict]
    environment: Optional[dict]
    quantum_graph: Optional[QuantumGraph]
    qgraph_node_ids: Optional[list]
    quanta_summary: Optional[str]

    # As of python 3.7.8, can't use __slots__ if give default values, so
    # writing own __init__.
    def __init__(self, name: str):
        self.name = name
        # Every scalar attribute starts out unset; the dict-valued ones get
        # fresh (unshared) dicts and the summary starts empty.
        for attr_name in ("label", "tags", "cmdline", "request_memory",
                          "request_cpus", "request_disk", "request_walltime",
                          "compute_site", "mail_to", "when_to_mail",
                          "number_of_retries", "retry_unless_exit",
                          "abort_on_value", "abort_return_value", "priority",
                          "category", "pre_cmdline", "post_cmdline",
                          "quantum_graph", "qgraph_node_ids"):
            setattr(self, attr_name, None)
        self.profile = {}
        self.attrs = {}
        self.environment = {}
        self.quanta_summary = ""

    __slots__ = ("name", "label", "tags", "mail_to", "when_to_mail", "cmdline", "request_memory",
                 "request_cpus", "request_disk", "request_walltime", "compute_site", "environment",
                 "number_of_retries", "retry_unless_exit", "abort_on_value", "abort_return_value",
                 "priority", "category", "pre_cmdline", "post_cmdline", "profile", "attrs",
                 "quantum_graph", "qgraph_node_ids", "quanta_summary")

    def __hash__(self):
        # Jobs are identified solely by name (graph node key).
        return hash(self.name)
class GenericWorkflow(nx.DiGraph):
    """A generic representation of a workflow used to submit to specific
    workflow management systems.

    Parameters
    ----------
    name : `str`
        Name of generic workflow.
    incoming_graph_data : `Any`, optional
        Data used to initialized graph that is passed through to nx.DiGraph
        constructor.  Can be any type supported by networkx.DiGraph.
    attr : `dict`
        Keyword arguments passed through to nx.DiGraph constructor.
    """
    def __init__(self, name, incoming_graph_data=None, **attr):
        super().__init__(incoming_graph_data, **attr)
        self._name = name
        self.run_attrs = {}   # run-level attributes
        self._files = {}      # central mapping of file name -> file object
        self.run_id = None

    @property
    def name(self):
        """Retrieve name of generic workflow.

        Returns
        -------
        name : `str`
            Name of generic workflow.
        """
        return self._name

    def _filter_files(self, file_names, data, transfer_only):
        """Select files from the central store applying common filters.

        Shared implementation for get_files, get_job_inputs and
        get_job_outputs.

        Parameters
        ----------
        file_names : iterable of `str`
            Names of files to consider.
        data : `bool`
            Whether to return the file objects instead of just the names.
        transfer_only : `bool`
            Whether to only return files for which a workflow management
            system would be responsible for transferring.

        Returns
        -------
        files : `list` of `str` or `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile`
            Matching file names, or file objects if ``data`` is `True`.
        """
        files = []
        for file_name in file_names:
            file = self._files[file_name]
            if not transfer_only or file.wms_transfer:
                files.append(file if data else file_name)
        return files

    def get_files(self, data=False, transfer_only=True):
        """Retrieve files from generic workflow.

        Need API in case change way files are stored (e.g., make
        workflow a bipartite graph with jobs and files nodes).

        Parameters
        ----------
        data : `bool`, optional
            Whether to return the file data as well as the file object name.
        transfer_only : `bool`, optional
            Whether to only return files for which a workflow management
            system would be responsible for transferring.

        Returns
        -------
        files : `list` of `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile`
            Files from generic workflow meeting specifications.
        """
        # Iterating the dict yields the file names.
        return self._filter_files(self._files, data, transfer_only)

    def add_job(self, job, parent_names=None, child_names=None):
        """Add job to generic workflow.

        Parameters
        ----------
        job : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowJob`
            Job to add to the generic workflow.
        parent_names : `list` of `str`, optional
            Names of jobs that are parents of given job.
        child_names : `list` of `str`, optional
            Names of jobs that are children of given job.

        Raises
        ------
        RuntimeError
            If job is not a GenericWorkflowJob or a job with the same name
            already exists.
        """
        if not isinstance(job, GenericWorkflowJob):
            raise RuntimeError(f"Invalid type for job to be added to GenericWorkflowGraph ({type(job)}).")
        if self.has_node(job.name):
            raise RuntimeError(f"Job {job.name} already exists in GenericWorkflowGraph.")
        super().add_node(job.name, job=job, inputs={}, outputs={})
        self.add_job_relationships(parent_names, job.name)
        self.add_job_relationships(job.name, child_names)

    def add_node(self, node_for_adding, **attr):
        """Override networkx function to call more specific add_job function.

        Parameters
        ----------
        node_for_adding : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowJob`
            Job to be added to generic workflow.
        attr :
            Needed to match original networkx function, but not used.
        """
        self.add_job(node_for_adding)

    def add_job_relationships(self, parents, children):
        """Add dependencies between parent and child jobs.  All parents will
        be connected to all children.

        Parameters
        ----------
        parents : `list` of `str`
            Parent job names.
        children : `list` of `str`
            Children job names.
        """
        if parents is not None and children is not None:
            # iterable() lets callers pass a single name or a list of names.
            self.add_edges_from(itertools.product(iterable(parents), iterable(children)))

    def add_edges_from(self, ebunch_to_add, **attr):
        """Add several edges between jobs in the generic workflow.

        Parameters
        ----------
        ebunch_to_add : Iterable of `tuple` of `str`
            Iterable of job name pairs between which a dependency should be
            saved.
        attr : keyword arguments, optional
            Data can be assigned using keyword arguments (not currently used).
        """
        for edge_to_add in ebunch_to_add:
            self.add_edge(edge_to_add[0], edge_to_add[1], **attr)

    def add_edge(self, u_of_edge: str, v_of_edge: str, **attr):
        """Add edge connecting jobs in workflow.

        Parameters
        ----------
        u_of_edge : `str`
            Name of parent job.
        v_of_edge : `str`
            Name of child job.
        attr : keyword arguments, optional
            Attributes to save with edge.

        Raises
        ------
        RuntimeError
            If either endpoint is not already a job in the workflow.
        """
        if u_of_edge not in self:
            raise RuntimeError(f"{u_of_edge} not in GenericWorkflow")
        if v_of_edge not in self:
            raise RuntimeError(f"{v_of_edge} not in GenericWorkflow")
        super().add_edge(u_of_edge, v_of_edge, **attr)

    def get_job(self, job_name: str):
        """Retrieve job by name from workflow.

        Parameters
        ----------
        job_name : `str`
            Name of job to retrieve.

        Returns
        -------
        job : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowJob`
            Job matching given job_name.
        """
        return self.nodes[job_name]["job"]

    def del_job(self, job_name: str):
        """Delete job from generic workflow leaving connected graph.

        Parameters
        ----------
        job_name : `str`
            Name of job to delete from workflow.
        """
        # Connect all parent jobs to all children jobs.  Materialize the
        # views first because add_job_relationships mutates the graph.
        parents = list(self.predecessors(job_name))
        children = list(self.successors(job_name))
        self.add_job_relationships(parents, children)

        # Delete job node (which deletes its edges).
        self.remove_node(job_name)

    def add_job_inputs(self, job_name: str, files):
        """Add files as inputs to specified job.

        Parameters
        ----------
        job_name : `str`
            Name of job to which inputs should be added.
        files : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile` or `list`
            File object(s) to be added as inputs to the specified job.
        """
        job_inputs = self.nodes[job_name]["inputs"]
        for file in iterable(files):
            # Save the central copy
            if file.name not in self._files:
                self._files[file.name] = file

            # Save the job reference to the file
            job_inputs[file.name] = file

    def get_file(self, name):
        """Retrieve a file object by name.

        Parameters
        ----------
        name : `str`
            Name of file object.

        Returns
        -------
        file_ : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile`
            File matching given name.
        """
        return self._files[name]

    def get_job_inputs(self, job_name, data=True, transfer_only=False):
        """Return the input files for the given job.

        Parameters
        ----------
        job_name : `str`
            Name of the job.
        data : `bool`, optional
            Whether to return the file data as well as the file object name.
        transfer_only : `bool`, optional
            Whether to only return files for which a workflow management
            system would be responsible for transferring.

        Returns
        -------
        inputs : `list` of `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile`
            Input files for the given job.
        """
        return self._filter_files(self.nodes[job_name]["inputs"], data, transfer_only)

    def add_job_outputs(self, job_name, files):
        """Add output files to a job.

        Parameters
        ----------
        job_name : `str`
            Name of job to which the files should be added as outputs.
        files : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile` or `list`
            File object(s) to be added as outputs for specified job.
        """
        job_outputs = self.nodes[job_name]["outputs"]
        # Use iterable() so a single file object works too, consistent with
        # add_job_inputs.
        for file in iterable(files):
            # Save the central copy
            if file.name not in self._files:
                self._files[file.name] = file

            # Save the job reference to the file
            job_outputs[file.name] = file

    def get_job_outputs(self, job_name, data=True, transfer_only=False):
        """Return the output files for the given job.

        Parameters
        ----------
        job_name : `str`
            Name of the job.
        data : `bool`
            Whether to return the file data as well as the file object name.
            It defaults to `True` thus returning file data as well.
        transfer_only : `bool`
            Whether to only return files for which a workflow management
            system would be responsible for transferring.  It defaults to
            `False` thus returning all output files.

        Returns
        -------
        outputs : `list` of `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile`
            Output files for the given job.
        """
        return self._filter_files(self.nodes[job_name]["outputs"], data, transfer_only)

    def draw(self, stream, format_="dot"):
        """Output generic workflow in a visualization format.

        Parameters
        ----------
        stream : `str` or `io.BufferedIOBase`
            Stream to which the visualization should be written.
        format_ : `str`, optional
            Which visualization format to use.  It defaults to the format for
            the dot program.

        Raises
        ------
        RuntimeError
            If given an unsupported format.
        """
        draw_funcs = {"dot": draw_networkx_dot}
        if format_ in draw_funcs:
            draw_funcs[format_](self, stream)
        else:
            # Fixed: error message previously had an unbalanced parenthesis.
            raise RuntimeError(f"Unknown draw format ({format_})")

    def save(self, stream, format_="pickle"):
        """Save the generic workflow in a format that is loadable.

        Parameters
        ----------
        stream : `str` or `io.BufferedIOBase`
            Stream to pass to the format-specific writer.  Accepts anything
            that the writer accepts.
        format_ : `str`, optional
            Format in which to write the data.  It defaults to pickle format.

        Raises
        ------
        RuntimeError
            If given an unsupported format.
        """
        if format_ == "pickle":
            nx.write_gpickle(self, stream)
        else:
            raise RuntimeError(f"Unknown format ({format_})")

    @classmethod
    def load(cls, stream, format_="pickle"):
        """Load a GenericWorkflow from the given stream.

        Parameters
        ----------
        stream : `str` or `io.BufferedIOBase`
            Stream to pass to the format-specific loader.  Accepts anything
            that the loader accepts.
        format_ : `str`, optional
            Format of data to expect when loading from stream.  It defaults
            to pickle format.

        Returns
        -------
        generic_workflow : `~lsst.ctrl.bps.generic_workflow.GenericWorkflow`
            Generic workflow loaded from the given stream.

        Raises
        ------
        RuntimeError
            If given an unsupported format.
        """
        if format_ == "pickle":
            return nx.read_gpickle(stream)

        raise RuntimeError(f"Unknown format ({format_})")

    def validate(self):
        """Run checks to ensure this is still a valid generic workflow graph.
        """
        # Make sure a directed acyclic graph.
        # NOTE: assert is stripped under python -O; kept for backward
        # compatibility with callers expecting AssertionError.
        assert nx.algorithms.dag.is_directed_acyclic_graph(self)