Coverage for python/lsst/ctrl/bps/generic_workflow.py : 33%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
22"""Class definitions for a Generic Workflow Graph
23"""
25import dataclasses
26import itertools
27from typing import Optional
28import networkx as nx
30from lsst.daf.butler.core.utils import iterable
31from lsst.daf.butler import DatasetRef
32from lsst.pipe.base import QuantumGraph
33from .bps_draw import draw_networkx_dot
@dataclasses.dataclass
class GenericWorkflowFile:
    """Information about a file that may be needed by various workflow
    management services.
    """
    # Note: the original annotations used ``str or None`` which evaluates
    # to just ``str``; use ``Optional`` so the intent survives.
    name: str
    wms_transfer: bool
    src_uri: Optional[str]       # don't know that need ButlerURI
    dataset_ref: Optional[DatasetRef]
    dest_uri: Optional[str]      # don't know that need ButlerURI
    logical_file_name: Optional[str]

    # As of python 3.7.8, can't use __slots__ + dataclass if give default
    # values, so writing own __init__.
    def __init__(self, name: str, wms_transfer: bool = False, src_uri=None,
                 dataset_ref=None, dest_uri=None, logical_file_name=None):
        self.name = name
        self.wms_transfer = wms_transfer
        self.src_uri = src_uri
        self.dataset_ref = dataset_ref
        self.dest_uri = dest_uri
        self.logical_file_name = logical_file_name

    __slots__ = ('name', 'wms_transfer', 'dataset_ref', 'src_uri', 'dest_uri', 'logical_file_name')

    def __hash__(self):
        # Files are identified solely by name throughout the workflow.
        return hash(self.name)

    def __str__(self):
        # Fixed: previously reported the wrong class name
        # (GenericWorkflowJob) due to a copy/paste error.
        return f"GenericWorkflowFile(name={self.name})"
@dataclasses.dataclass
class GenericWorkflowJob:
    """Information about a job that may be needed by various workflow
    management services.
    """
    name: str
    label: Optional[str]
    cmdline: Optional[str]
    request_memory: Optional[int]      # MB
    request_cpus: Optional[int]        # cores
    request_disk: Optional[int]        # MB
    request_walltime: Optional[str]    # minutes
    compute_site: Optional[str]
    mail_to: Optional[str]
    when_to_mail: Optional[str]
    number_of_retries: Optional[int]
    retry_unless_exit: Optional[int]
    abort_on_value: Optional[int]
    abort_return_value: Optional[int]
    priority: Optional[str]
    category: Optional[str]
    pre_cmdline: Optional[str]
    post_cmdline: Optional[str]
    profile: Optional[dict]
    attrs: Optional[dict]
    environment: Optional[dict]
    quantum_graph: Optional[QuantumGraph]
    quanta_summary: Optional[str]

    # As of python 3.7.8, can't use __slots__ if give default values, so
    # writing own __init__.
    def __init__(self, name: str):
        # Only the job name is required up front; every other attribute
        # starts out empty and is filled in later by workflow-generation
        # code.
        self.name = name
        for attr_name in ('label', 'cmdline', 'request_memory', 'request_cpus',
                          'request_disk', 'request_walltime', 'compute_site',
                          'mail_to', 'when_to_mail', 'number_of_retries',
                          'retry_unless_exit', 'abort_on_value',
                          'abort_return_value', 'priority', 'category',
                          'pre_cmdline', 'post_cmdline', 'quantum_graph'):
            setattr(self, attr_name, None)
        # Each job gets its own fresh dicts (never shared between jobs).
        for attr_name in ('profile', 'attrs', 'environment'):
            setattr(self, attr_name, {})
        self.quanta_summary = ""

    __slots__ = ('name', 'label', 'mail_to', 'when_to_mail', 'cmdline', 'request_memory',
                 'request_cpus', 'request_disk', 'request_walltime', 'compute_site',
                 'environment', 'number_of_retries', 'retry_unless_exit', 'abort_on_value',
                 'abort_return_value', 'priority', 'category', 'pre_cmdline', 'post_cmdline',
                 'profile', 'attrs', 'quantum_graph', 'quanta_summary')

    def __hash__(self):
        # Jobs are identified solely by name throughout the workflow.
        return hash(self.name)
class GenericWorkflow(nx.DiGraph):
    """A generic representation of a workflow used to submit to specific
    workflow management systems.

    Parameters
    ----------
    name : `str`
        Name of generic workflow.
    incoming_graph_data : `Any`, optional
        Data used to initialized graph that is passed through to nx.DiGraph
        constructor.  Can be any type supported by networkx.DiGraph.
    attr : `dict`
        Keyword arguments passed through to nx.DiGraph constructor.
    """
    def __init__(self, name, incoming_graph_data=None, **attr):
        super().__init__(incoming_graph_data, **attr)
        self._name = name
        self.run_attrs = {}
        # Central store of file objects keyed by file name, shared by all
        # jobs so a file added to several jobs is tracked only once.
        self._files = {}
        self.run_id = None

    @property
    def name(self):
        """Retrieve name of generic workflow.

        Returns
        -------
        name : `str`
            Name of generic workflow.
        """
        return self._name

    def _select_files(self, file_names, data, transfer_only):
        """Select files from the central file store.

        Parameters
        ----------
        file_names : iterable of `str`
            Names of files to look up in the central store.
        data : `bool`
            Whether to return the file objects instead of just the names.
        transfer_only : `bool`
            Whether to only return files for which a workflow management
            system would be responsible for transferring.

        Returns
        -------
        files : `list` of `str` or `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile`
            Files meeting given specifications (always the central copies).
        """
        selected = []
        for file_name in file_names:
            file_ = self._files[file_name]
            if not transfer_only or file_.wms_transfer:
                selected.append(file_ if data else file_name)
        return selected

    def _add_job_files(self, job_name, files, io_key):
        """Attach files to a job as inputs or outputs.

        Parameters
        ----------
        job_name : `str`
            Name of job to which the files should be added.
        files : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile` or `list`
            File object(s) to be added to the specified job.
        io_key : `str`
            Which node mapping to update ('inputs' or 'outputs').
        """
        job_files = self.nodes[job_name][io_key]
        for file_ in iterable(files):
            # Save the central copy (first one registered wins).
            if file_.name not in self._files:
                self._files[file_.name] = file_
            # Save the job reference to the file.
            job_files[file_.name] = file_

    def get_files(self, data=False, transfer_only=True):
        """Retrieve files from generic workflow.

        Need API in case change way files are stored (e.g., make
        workflow a bipartite graph with jobs and files nodes).

        Parameters
        ----------
        data : `bool`, optional
            Whether to return the file data as well as the file object name.
        transfer_only : `bool`, optional
            Whether to only return files for which a workflow management
            system would be responsible for transferring.

        Returns
        -------
        files : `list` of `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile`
            Files from generic workflow meeting specifications.
        """
        # Iterating the dict yields file names, which is what the shared
        # selection helper expects.
        return self._select_files(self._files, data, transfer_only)

    def add_job(self, job, parent_names=None, child_names=None):
        """Add job to generic workflow.

        Parameters
        ----------
        job : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowJob`
            Job to add to the generic workflow.
        parent_names : `list` of `str`, optional
            Names of jobs that are parents of given job.
        child_names : `list` of `str`, optional
            Names of jobs that are children of given job.

        Raises
        ------
        RuntimeError
            If the job is not a GenericWorkflowJob or a job with the same
            name already exists in the workflow.
        """
        if not isinstance(job, GenericWorkflowJob):
            raise RuntimeError(f"Invalid type for job to be added to GenericWorkflowGraph ({type(job)}).")
        if self.has_node(job.name):
            raise RuntimeError(f"Job {job.name} already exists in GenericWorkflowGraph.")
        super().add_node(job.name, job=job, inputs={}, outputs={})
        self.add_job_relationships(parent_names, job.name)
        self.add_job_relationships(job.name, child_names)

    def add_node(self, node_for_adding, **attr):
        """Override networkx function to call more specific add_job function.

        Parameters
        ----------
        node_for_adding : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowJob`
            Job to be added to generic workflow.
        attr :
            Needed to match original networkx function, but not used.
        """
        self.add_job(node_for_adding)

    def add_job_relationships(self, parents, children):
        """Add dependencies between parent and child jobs.  All parents will
        be connected to all children.

        Parameters
        ----------
        parents : `list` of `str`
            Parent job names.
        children : `list` of `str`
            Children job names.
        """
        if parents is not None and children is not None:
            self.add_edges_from(itertools.product(iterable(parents), iterable(children)))

    def add_edges_from(self, ebunch_to_add, **attr):
        """Add several edges between jobs in the generic workflow.

        Parameters
        ----------
        ebunch_to_add : Iterable of `tuple` of `str`
            Iterable of job name pairs between which a dependency should be
            saved.
        attr : keyword arguments, optional
            Data can be assigned using keyword arguments (not currently used).
        """
        for edge_to_add in ebunch_to_add:
            self.add_edge(edge_to_add[0], edge_to_add[1], **attr)

    def add_edge(self, u_of_edge: str, v_of_edge: str, **attr):
        """Add edge connecting jobs in workflow.

        Parameters
        ----------
        u_of_edge : `str`
            Name of parent job.
        v_of_edge : `str`
            Name of child job.
        attr : keyword arguments, optional
            Attributes to save with edge.

        Raises
        ------
        RuntimeError
            If either job name is not already in the workflow (unlike
            networkx, which silently creates missing nodes).
        """
        if u_of_edge not in self:
            raise RuntimeError(f"{u_of_edge} not in GenericWorkflow")
        if v_of_edge not in self:
            raise RuntimeError(f"{v_of_edge} not in GenericWorkflow")
        super().add_edge(u_of_edge, v_of_edge, **attr)

    def get_job(self, job_name: str):
        """Retrieve job by name from workflow.

        Parameters
        ----------
        job_name : `str`
            Name of job to retrieve.

        Returns
        -------
        job : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowJob`
            Job matching given job_name.
        """
        return self.nodes[job_name]['job']

    def del_job(self, job_name: str):
        """Delete job from generic workflow leaving connected graph.

        Parameters
        ----------
        job_name : `str`
            Name of job to delete from workflow.
        """
        # Connect all parent jobs to all children jobs.  Materialize the
        # views before adding edges so graph mutation cannot invalidate
        # the iterators.
        parents = list(self.predecessors(job_name))
        children = list(self.successors(job_name))
        self.add_job_relationships(parents, children)

        # Delete job node (which deletes its edges).
        self.remove_node(job_name)

    def add_job_inputs(self, job_name: str, files):
        """Add files as inputs to specified job.

        Parameters
        ----------
        job_name : `str`
            Name of job to which inputs should be added.
        files : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile` or `list`
            File object(s) to be added as inputs to the specified job.
        """
        self._add_job_files(job_name, files, 'inputs')

    def get_file(self, name):
        """Retrieve a file object by name.

        Parameters
        ----------
        name : `str`
            Name of file object.

        Returns
        -------
        file_ : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile`
            File matching given name.
        """
        return self._files[name]

    def get_job_inputs(self, job_name, data=True, transfer_only=False):
        """Return the input files for the given job.

        Parameters
        ----------
        job_name : `str`
            Name of the job.
        data : `bool`, optional
            Whether to return the file data as well as the file object name.
        transfer_only : `bool`, optional
            Whether to only return files for which a workflow management
            system would be responsible for transferring.

        Returns
        -------
        inputs : `list` of `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile`
            Input files for the given job.
        """
        return self._select_files(self.nodes[job_name]['inputs'], data, transfer_only)

    def add_job_outputs(self, job_name, files):
        """Add output files to a job.

        Parameters
        ----------
        job_name : `str`
            Name of job to which the files should be added as outputs.
        files : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile` or `list`
            File object(s) to be added as outputs for specified job.
        """
        # Now accepts a single file as well as a list, matching
        # add_job_inputs (backward-compatible generalization).
        self._add_job_files(job_name, files, 'outputs')

    def get_job_outputs(self, job_name, data=True, transfer_only=False):
        """Return the output files for the given job.

        Parameters
        ----------
        job_name : `str`
            Name of the job.
        data : `bool`
            Whether to return the file data as well as the file object name.
            It defaults to `True` thus returning file data as well.
        transfer_only : `bool`
            Whether to only return files for which a workflow management
            system would be responsible for transferring.  It defaults to
            `False` thus returning all output files.

        Returns
        -------
        outputs : `list` of `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile`
            Output files for the given job.
        """
        return self._select_files(self.nodes[job_name]['outputs'], data, transfer_only)

    def draw(self, stream, format_="dot"):
        """Output generic workflow in a visualization format.

        Parameters
        ----------
        stream : `str` or `io.BufferedIOBase`
            Stream to which the visualization should be written.
        format_ : `str`, optional
            Which visualization format to use.  It defaults to the format
            for the dot program.

        Raises
        ------
        RuntimeError
            If given an unsupported format.
        """
        draw_funcs = {'dot': draw_networkx_dot}
        if format_ in draw_funcs:
            draw_funcs[format_](self, stream)
        else:
            # Fixed: message previously lacked the closing parenthesis.
            raise RuntimeError(f"Unknown draw format ({format_})")

    def save(self, stream, format_='pickle'):
        """Save the generic workflow in a format that is loadable.

        Parameters
        ----------
        stream : `str` or `io.BufferedIOBase`
            Stream to pass to the format-specific writer.  Accepts anything
            that the writer accepts.
        format_ : `str`, optional
            Format in which to write the data.  It defaults to pickle
            format.

        Raises
        ------
        RuntimeError
            If given an unsupported format.
        """
        if format_ == 'pickle':
            nx.write_gpickle(self, stream)
        else:
            raise RuntimeError(f"Unknown format ({format_})")

    @classmethod
    def load(cls, stream, format_="pickle"):
        """Load a GenericWorkflow from the given stream.

        Parameters
        ----------
        stream : `str` or `io.BufferedIOBase`
            Stream to pass to the format-specific loader.  Accepts anything
            that the loader accepts.
        format_ : `str`, optional
            Format of data to expect when loading from stream.  It defaults
            to pickle format.

        Returns
        -------
        generic_workflow : `~lsst.ctrl.bps.generic_workflow.GenericWorkflow`
            Generic workflow loaded from the given stream.

        Raises
        ------
        RuntimeError
            If given an unsupported format.
        """
        if format_ == 'pickle':
            return nx.read_gpickle(stream)

        raise RuntimeError(f"Unknown format ({format_})")

    def validate(self):
        """Run checks to ensure this is still a valid generic workflow graph.
        """
        # Make sure a directed acyclic graph.  NOTE: assert is stripped
        # under ``python -O``; callers relying on this check should not run
        # optimized.
        assert nx.algorithms.dag.is_directed_acyclic_graph(self)