Coverage for python/lsst/ctrl/bps/generic_workflow.py : 33%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
22"""Class definitions for a Generic Workflow Graph
23"""
25import dataclasses
26import itertools
27from typing import Optional
29import networkx as nx
31from lsst.daf.butler.core.utils import iterable
32from lsst.daf.butler import DatasetRef
33from lsst.pipe.base import QuantumGraph
34from .bps_draw import draw_networkx_dot
@dataclasses.dataclass
class GenericWorkflowFile:
    """Information about a file that may be needed by various workflow
    management services.
    """
    # Bug fix: the previous annotations used ``X or None`` which evaluates
    # to just ``X`` (``or`` picks the first truthy operand), so "may be
    # None" was never actually expressed.  Use Optional[...] instead.
    name: str
    wms_transfer: bool
    src_uri: Optional[str]  # don't know that need ButlerURI
    dataset_ref: Optional[DatasetRef]
    dest_uri: Optional[str]  # don't know that need ButlerURI
    logical_file_name: Optional[str]

    # As of python 3.7.8, can't use __slots__ + dataclass if give default
    # values, so writing own __init__
    def __init__(self, name: str, wms_transfer: bool = False, src_uri=None,
                 dataset_ref=None, dest_uri=None, logical_file_name=None):
        self.name = name
        self.wms_transfer = wms_transfer
        self.src_uri = src_uri
        self.dataset_ref = dataset_ref
        self.dest_uri = dest_uri
        self.logical_file_name = logical_file_name

    __slots__ = ("name", "wms_transfer", "dataset_ref", "src_uri", "dest_uri", "logical_file_name")

    def __hash__(self):
        # The name is the unique identifier of a file within a workflow.
        return hash(self.name)

    def __str__(self):
        # Bug fix: previously reported the wrong class name
        # (GenericWorkflowJob) in the string representation.
        return f"GenericWorkflowFile(name={self.name})"
@dataclasses.dataclass
class GenericWorkflowJob:
    """Information about a job that may be needed by various workflow
    management services.
    """
    name: str
    label: Optional[str]
    tags: Optional[str]
    cmdline: Optional[str]
    request_memory: Optional[int]   # MB
    request_cpus: Optional[int]     # cores
    request_disk: Optional[int]     # MB
    request_walltime: Optional[str]  # minutes
    compute_site: Optional[str]
    mail_to: Optional[str]
    when_to_mail: Optional[str]
    number_of_retries: Optional[int]
    retry_unless_exit: Optional[int]
    abort_on_value: Optional[int]
    abort_return_value: Optional[int]
    priority: Optional[str]
    category: Optional[str]
    pre_cmdline: Optional[str]
    post_cmdline: Optional[str]
    profile: Optional[dict]
    attrs: Optional[dict]
    environment: Optional[dict]
    quantum_graph: Optional[QuantumGraph]
    qgraph_node_ids: Optional[list]
    quanta_summary: Optional[str]

    # As of python 3.7.8, can't use __slots__ if give default values, so writing own __init__
    def __init__(self, name: str):
        self.name = name
        # Every scalar attribute starts out unset; only the name is required.
        for attr_name in ("label", "tags", "cmdline", "request_memory",
                          "request_cpus", "request_disk", "request_walltime",
                          "compute_site", "mail_to", "when_to_mail",
                          "number_of_retries", "retry_unless_exit",
                          "abort_on_value", "abort_return_value", "priority",
                          "category", "pre_cmdline", "post_cmdline",
                          "quantum_graph", "qgraph_node_ids"):
            setattr(self, attr_name, None)
        # Mapping attributes start out empty (fresh dicts per instance).
        self.profile = {}
        self.attrs = {}
        self.environment = {}
        # Summary of quanta starts out as an empty string, not None.
        self.quanta_summary = ""

    __slots__ = ("name", "label", "tags", "mail_to", "when_to_mail", "cmdline", "request_memory",
                 "request_cpus", "request_disk", "request_walltime", "compute_site", "environment",
                 "number_of_retries", "retry_unless_exit", "abort_on_value", "abort_return_value",
                 "priority", "category", "pre_cmdline", "post_cmdline", "profile", "attrs",
                 "quantum_graph", "qgraph_node_ids", "quanta_summary")

    def __hash__(self):
        # Jobs are identified by name within a workflow.
        return hash(self.name)
class GenericWorkflow(nx.DiGraph):
    """A generic representation of a workflow used to submit to specific
    workflow management systems.

    Parameters
    ----------
    name : `str`
        Name of generic workflow.
    incoming_graph_data : `Any`, optional
        Data used to initialized graph that is passed through to nx.DiGraph
        constructor.  Can be any type supported by networkx.DiGraph.
    attr : `dict`
        Keyword arguments passed through to nx.DiGraph constructor.
    """
    def __init__(self, name, incoming_graph_data=None, **attr):
        super().__init__(incoming_graph_data, **attr)
        self._name = name
        self.run_attrs = {}
        # Central store of file objects keyed by file name; jobs keep
        # references into this mapping via their "inputs"/"outputs" node data.
        self._files = {}
        self.run_id = None

    @property
    def name(self):
        """Retrieve name of generic workflow.

        Returns
        -------
        name : `str`
            Name of generic workflow.
        """
        return self._name

    def get_files(self, data=False, transfer_only=True):
        """Retrieve files from generic workflow.

        Need API in case change way files are stored (e.g., make
        workflow a bipartite graph with jobs and files nodes).

        Parameters
        ----------
        data : `bool`, optional
            Whether to return the file data as well as the file object name.
        transfer_only : `bool`, optional
            Whether to only return files for which a workflow management
            system would be responsible for transferring.

        Returns
        -------
        files : `list` of `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile` or `list` of `str`
            Files from generic workflow meeting specifications.
        """
        files = []
        for filename, file in self._files.items():
            if not transfer_only or file.wms_transfer:
                files.append(file if data else filename)
        return files

    def add_job(self, job, parent_names=None, child_names=None):
        """Add job to generic workflow.

        Parameters
        ----------
        job : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowJob`
            Job to add to the generic workflow.
        parent_names : `list` of `str`, optional
            Names of jobs that are parents of given job.
        child_names : `list` of `str`, optional
            Names of jobs that are children of given job.

        Raises
        ------
        RuntimeError
            If the given job is not a GenericWorkflowJob or a job with the
            same name already exists in the workflow.
        """
        if not isinstance(job, GenericWorkflowJob):
            raise RuntimeError(f"Invalid type for job to be added to GenericWorkflowGraph ({type(job)}).")
        if self.has_node(job.name):
            raise RuntimeError(f"Job {job.name} already exists in GenericWorkflowGraph.")
        # The job object plus per-job file references live in the node data.
        super().add_node(job.name, job=job, inputs={}, outputs={})
        self.add_job_relationships(parent_names, job.name)
        self.add_job_relationships(job.name, child_names)

    def add_node(self, node_for_adding, **attr):
        """Override networkx function to call more specific add_job function.

        Parameters
        ----------
        node_for_adding : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowJob`
            Job to be added to generic workflow.
        attr :
            Needed to match original networkx function, but not used.
        """
        self.add_job(node_for_adding)

    def add_job_relationships(self, parents, children):
        """Add dependencies between parent and child jobs.  All parents will
        be connected to all children.

        Parameters
        ----------
        parents : `list` of `str`
            Parent job names.
        children : `list` of `str`
            Children job names.
        """
        if parents is not None and children is not None:
            # iterable() lets callers pass a single job name or a sequence.
            self.add_edges_from(itertools.product(iterable(parents), iterable(children)))

    def add_edges_from(self, ebunch_to_add, **attr):
        """Add several edges between jobs in the generic workflow.

        Parameters
        ----------
        ebunch_to_add : Iterable of `tuple` of `str`
            Iterable of job name pairs between which a dependency should be
            saved.
        attr : keyword arguments, optional
            Data can be assigned using keyword arguments (not currently used).
        """
        # Delegate to add_edge so its existence checks run for every pair.
        for edge_to_add in ebunch_to_add:
            self.add_edge(edge_to_add[0], edge_to_add[1], **attr)

    def add_edge(self, u_of_edge: str, v_of_edge: str, **attr):
        """Add edge connecting jobs in workflow.

        Parameters
        ----------
        u_of_edge : `str`
            Name of parent job.
        v_of_edge : `str`
            Name of child job.
        attr : keyword arguments, optional
            Attributes to save with edge.

        Raises
        ------
        RuntimeError
            If either job name is not already in the workflow.
        """
        if u_of_edge not in self:
            raise RuntimeError(f"{u_of_edge} not in GenericWorkflow")
        if v_of_edge not in self:
            raise RuntimeError(f"{v_of_edge} not in GenericWorkflow")
        super().add_edge(u_of_edge, v_of_edge, **attr)

    def get_job(self, job_name: str):
        """Retrieve job by name from workflow.

        Parameters
        ----------
        job_name : `str`
            Name of job to retrieve.

        Returns
        -------
        job : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowJob`
            Job matching given job_name.
        """
        return self.nodes[job_name]["job"]

    def del_job(self, job_name: str):
        """Delete job from generic workflow leaving connected graph.

        Parameters
        ----------
        job_name : `str`
            Name of job to delete from workflow.
        """
        # Connect all parent jobs to all children jobs so the graph stays
        # connected after the node disappears.
        parents = self.predecessors(job_name)
        children = self.successors(job_name)
        self.add_job_relationships(parents, children)

        # Delete job node (which deletes its edges).
        self.remove_node(job_name)

    def add_job_inputs(self, job_name: str, files):
        """Add files as inputs to specified job.

        Parameters
        ----------
        job_name : `str`
            Name of job to which inputs should be added.
        files : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile` or `list`
            File object(s) to be added as inputs to the specified job.
        """
        job_inputs = self.nodes[job_name]["inputs"]
        for file in iterable(files):
            # Save the central copy
            if file.name not in self._files:
                self._files[file.name] = file

            # Save the job reference to the file
            job_inputs[file.name] = file

    def get_file(self, name):
        """Retrieve a file object by name.

        Parameters
        ----------
        name : `str`
            Name of file object.

        Returns
        -------
        file_ : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile`
            File matching given name.
        """
        return self._files[name]

    def get_job_inputs(self, job_name, data=True, transfer_only=False):
        """Return the input files for the given job.

        Parameters
        ----------
        job_name : `str`
            Name of the job.
        data : `bool`, optional
            Whether to return the file data as well as the file object name.
        transfer_only : `bool`, optional
            Whether to only return files for which a workflow management
            system would be responsible for transferring.

        Returns
        -------
        inputs : `list` of `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile`
            Input files for the given job.
        """
        job_inputs = self.nodes[job_name]["inputs"]
        inputs = []
        for file_name in job_inputs:
            file = self._files[file_name]
            if not transfer_only or file.wms_transfer:
                # Reuse the already-looked-up file object instead of a
                # second self._files[file_name] lookup.
                inputs.append(file if data else file_name)
        return inputs

    def add_job_outputs(self, job_name, files):
        """Add output files to a job.

        Parameters
        ----------
        job_name : `str`
            Name of job to which the files should be added as outputs.
        files : `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile` or `list`
            File object(s) to be added as outputs for specified job.
        """
        job_outputs = self.nodes[job_name]["outputs"]
        # Use iterable() for consistency with add_job_inputs so that a
        # single file object may be given as well as a list.
        for file in iterable(files):
            # Save the central copy
            if file.name not in self._files:
                self._files[file.name] = file
            # Save the job reference to the file
            job_outputs[file.name] = file

    def get_job_outputs(self, job_name, data=True, transfer_only=False):
        """Return the output files for the given job.

        Parameters
        ----------
        job_name : `str`
            Name of the job.
        data : `bool`
            Whether to return the file data as well as the file object name.
            It defaults to `True` thus returning file data as well.
        transfer_only : `bool`
            Whether to only return files for which a workflow management
            system would be responsible for transferring.  It defaults to
            `False` thus returning all output files.

        Returns
        -------
        outputs : `list` of `~lsst.ctrl.bps.generic_workflow.GenericWorkflowFile`
            Output files for the given job.
        """
        job_outputs = self.nodes[job_name]["outputs"]
        outputs = []
        for file_name in job_outputs:
            file = self._files[file_name]
            if not transfer_only or file.wms_transfer:
                # Reuse the already-looked-up file object instead of a
                # second self._files[file_name] lookup.
                outputs.append(file if data else file_name)
        return outputs

    def draw(self, stream, format_="dot"):
        """Output generic workflow in a visualization format.

        Parameters
        ----------
        stream : `str` or `io.BufferedIOBase`
            Stream to which the visualization should be written.
        format_ : `str`, optional
            Which visualization format to use.  It defaults to the format for
            the dot program.

        Raises
        ------
        RuntimeError
            If given an unknown visualization format.
        """
        draw_funcs = {"dot": draw_networkx_dot}
        if format_ in draw_funcs:
            draw_funcs[format_](self, stream)
        else:
            # Bug fix: error message was missing its closing parenthesis.
            raise RuntimeError(f"Unknown draw format ({format_})")

    def save(self, stream, format_="pickle"):
        """Save the generic workflow in a format that is loadable.

        Parameters
        ----------
        stream : `str` or `io.BufferedIOBase`
            Stream to pass to the format-specific writer.  Accepts anything
            that the writer accepts.
        format_ : `str`, optional
            Format in which to write the data.  It defaults to pickle format.

        Raises
        ------
        RuntimeError
            If given an unknown format.
        """
        if format_ == "pickle":
            nx.write_gpickle(self, stream)
        else:
            raise RuntimeError(f"Unknown format ({format_})")

    @classmethod
    def load(cls, stream, format_="pickle"):
        """Load a GenericWorkflow from the given stream.

        Parameters
        ----------
        stream : `str` or `io.BufferedIOBase`
            Stream to pass to the format-specific loader.  Accepts anything
            that the loader accepts.
        format_ : `str`, optional
            Format of data to expect when loading from stream.  It defaults
            to pickle format.

        Returns
        -------
        generic_workflow : `~lsst.ctrl.bps.generic_workflow.GenericWorkflow`
            Generic workflow loaded from the given stream.

        Raises
        ------
        RuntimeError
            If given an unknown format.
        """
        if format_ == "pickle":
            return nx.read_gpickle(stream)

        raise RuntimeError(f"Unknown format ({format_})")

    def validate(self):
        """Run checks to ensure this is still a valid generic workflow graph.
        """
        # Make sure a directed acyclic graph.
        # NOTE(review): assert is stripped when run with "python -O"; if
        # this check must always execute, raise an exception instead.
        assert nx.algorithms.dag.is_directed_acyclic_graph(self)