Coverage for python/lsst/ctrl/bps/parsl/workflow.py: 23%
107 statements
coverage.py v7.3.0, created at 2023-08-31 09:55 +0000
import logging
import os
import pickle
from collections.abc import Iterable, Mapping

import parsl
import parsl.config
from lsst.ctrl.bps import BaseWmsWorkflow, BpsConfig, GenericWorkflow, GenericWorkflowJob
from parsl.app.app import bash_app
from parsl.app.futures import Future

from .configuration import get_bps_config_value, get_workflow_filename, set_parsl_logging
from .job import ParslJob, get_file_paths
from .site import SiteConfig

__all__ = ("ParslWorkflow", "get_parsl_config")

_log = logging.getLogger("lsst.ctrl.bps.parsl")
def get_parsl_config(config: BpsConfig) -> parsl.config.Config:
    """Construct a parsl configuration from the BPS configuration.

    For details on the site configuration, see `SiteConfig`. For details on
    the monitor configuration, see `SiteConfig.get_monitor`.

    The number of retries is set from the ``site.<computeSite>.retries``
    value.

    Parameters
    ----------
    config : `BpsConfig`
        BPS configuration.

    Returns
    -------
    parsl_config : `parsl.config.Config`
        Parsl configuration.
    """
    site = SiteConfig.from_config(config)
    executors = site.get_executors()
    retries = get_bps_config_value(site.site, "retries", int, 1)
    monitor = site.get_monitor()
    return parsl.config.Config(
        executors=executors, monitoring=monitor, retries=retries, checkpoint_mode="task_exit"
    )
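# Usage sketch (added commentary, not part of the original module): how a
# submission script might build and load a Parsl configuration from a BPS
# configuration. The file name "bps_submit.yaml" is a hypothetical example.
#
#     from lsst.ctrl.bps import BpsConfig
#
#     bps_config = BpsConfig("bps_submit.yaml")
#     parsl_config = get_parsl_config(bps_config)
#     parsl.load(parsl_config)  # starts the DataFlowKernel with these executors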
class ParslWorkflow(BaseWmsWorkflow):
    """Parsl-based workflow object to manage execution of workflow.

    Parameters
    ----------
    name : `str`
        Unique name of workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        Generic workflow config.
    path : `str`
        Path prefix for workflow output files.
    jobs : `dict` mapping `str` to `ParslJob`
        Jobs to be executed.
    parents : `dict` mapping `str` to iterable of `str`
        Dependency tree. Keys are job names, and values are the names of the
        jobs that must be executed before the key job can be executed.
    endpoints : iterable of `str`
        Endpoints of the dependency tree. These jobs (specified by name) have
        no children.
    final : `ParslJob`, optional
        Final job to be done, e.g., to merge the execution butler. This is
        done locally.
    """
    def __init__(
        self,
        name: str,
        config: BpsConfig,
        path: str,
        jobs: dict[str, ParslJob],
        parents: Mapping[str, Iterable[str]],
        endpoints: Iterable[str],
        final: ParslJob | None = None,
    ):
        super().__init__(name, config)

        self.path = path
        self.bps_config = config
        self.parsl_config = get_parsl_config(config)
        self.site = SiteConfig.from_config(config)
        self.dfk: parsl.DataFlowKernel | None = None  # type: ignore
        self.command_prefix = self.site.get_command_prefix()

        # These are bash_app function decorators, one per executor, used to
        # wrap job commands so parsl runs them on the labelled executor.
        self.apps = {
            ex.label: bash_app(executors=[ex.label], cache=True, ignore_for_cache=["stderr", "stdout"])
            for ex in self.parsl_config.executors
        }

        self.jobs = jobs
        self.parents = parents
        self.endpoints = endpoints
        self.final = final

    def __reduce__(self):
        """Recipe for pickle."""
        return type(self), (
            self.name,
            self.bps_config,
            self.path,
            self.jobs,
            self.parents,
            self.endpoints,
            self.final,
        )
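    # Note (added commentary, not part of the original module): __reduce__
    # means a pickled ParslWorkflow stores only the constructor arguments;
    # transient state such as the parsl config, executors and the
    # DataFlowKernel handle is rebuilt by __init__ on unpickling. A minimal
    # round-trip sketch, assuming ``workflow`` is an existing instance:
    #
    #     import pickle
    #     data = pickle.dumps(workflow)      # calls __reduce__
    #     restored = pickle.loads(data)      # re-runs __init__ with the same args
    #     assert restored.name == workflow.name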
    @classmethod
    def from_generic_workflow(
        cls, config: BpsConfig, generic_workflow: GenericWorkflow, out_prefix: str, service_class: str
    ) -> BaseWmsWorkflow:
        """Create a ParslWorkflow object from a BPS GenericWorkflow.

        Parameters
        ----------
        config : `BpsConfig`
            Configuration of the workflow.
        generic_workflow : `lsst.ctrl.bps.generic_workflow.GenericWorkflow`
            Generic representation of a single workflow.
        out_prefix : `str`
            Prefix for workflow output files.
        service_class : `str`
            Full module name of WMS service class that created this workflow.

        Returns
        -------
        self : `ParslWorkflow`
            Constructed workflow.
        """
        # Generate list of jobs
        jobs: dict[str, ParslJob] = {}
        for job_name in generic_workflow:
            generic_job = generic_workflow.get_job(job_name)
            assert generic_job.name not in jobs
            jobs[job_name] = ParslJob(generic_job, config, get_file_paths(generic_workflow, job_name))

        parents = {name: set(generic_workflow.predecessors(name)) for name in jobs}
        endpoints = [name for name in jobs if generic_workflow.out_degree(name) == 0]

        # Add final job: execution butler merge
        job = generic_workflow.get_final()
        final: ParslJob | None = None
        if job is not None:
            assert isinstance(job, GenericWorkflowJob)
            final = ParslJob(job, config, get_file_paths(generic_workflow, job.name))

        return cls(generic_workflow.name, config, out_prefix, jobs, parents, endpoints, final)
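    # Usage sketch (added commentary, not part of the original module): this
    # classmethod is normally invoked by the ctrl_bps submission machinery
    # rather than by hand, roughly as below. The service class string is a
    # hypothetical example of the WMS plugin that created the workflow.
    #
    #     workflow = ParslWorkflow.from_generic_workflow(
    #         config, generic_workflow, out_prefix, "lsst.ctrl.bps.parsl.ParslService"
    #     )
    #     workflow.write(out_prefix)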
    def write(self, out_prefix: str):
        """Write workflow state.

        This, in combination with the parsl checkpoint files, can be used to
        restart a workflow that was interrupted.

        Parameters
        ----------
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.
        """
        filename = get_workflow_filename(out_prefix)
        _log.info("Writing workflow with ID=%s", out_prefix)
        with open(filename, "wb") as fd:
            pickle.dump(self, fd)
    @classmethod
    def read(cls, out_prefix: str) -> "ParslWorkflow":
        """Construct from the saved workflow state.

        Parameters
        ----------
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.

        Returns
        -------
        self : `ParslWorkflow`
            Constructed workflow.
        """
        filename = get_workflow_filename(out_prefix)
        with open(filename, "rb") as fd:
            self = pickle.load(fd)
        assert isinstance(self, cls)
        return self
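    # Usage sketch (added commentary, not part of the original module):
    # restarting a previously submitted run from its output directory. The
    # directory name is a hypothetical example.
    #
    #     workflow = ParslWorkflow.read("submit/u/someone/run1")
    #     workflow.restart()   # reload parsl using the last checkpoint files
    #     workflow.run()       # jobs already checkpointed should not re-execute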
    def run(self, block: bool = True) -> list[Future | None]:
        """Run the workflow.

        Parameters
        ----------
        block : `bool`, optional
            Block returning from this method until the workflow is complete?
            If `False`, jobs may still be running when this returns, and it is
            the user's responsibility to call the ``finalize_jobs`` and
            ``shutdown`` methods when they are complete.

        Returns
        -------
        futures : `list` of `Future`
            `Future` objects linked to the execution of the endpoint jobs.
        """
        futures = [self.execute(name) for name in self.endpoints]
        if block:
            # Calling .exception() for each future blocks returning
            # from this method until all the jobs have executed or
            # raised an error. This is needed for running in a
            # non-interactive python process that would otherwise end
            # before the futures resolve.
            for ff in futures:
                if ff is not None:
                    ff.exception()
            self.shutdown()
            self.finalize_jobs()
        return futures
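    # Usage sketch (added commentary, not part of the original module): a
    # non-blocking run, where the caller waits on the futures itself and then
    # performs the cleanup that ``run(block=True)`` would otherwise do.
    #
    #     futures = workflow.run(block=False)
    #     for future in futures:
    #         if future is not None:
    #             future.exception()   # wait for completion or failure
    #     workflow.shutdown()
    #     workflow.finalize_jobs()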
    def execute(self, name: str) -> Future | None:
        """Execute a job.

        Parameters
        ----------
        name : `str`
            Name of job to execute.

        Returns
        -------
        future : `Future` or `None`
            A `Future` object linked to the execution of the job, or `None` if
            the job is being reserved to run locally.
        """
        if name in ("pipetaskInit", "mergeExecutionButler"):
            # These get done outside of parsl
            return None
        job = self.jobs[name]
        inputs = [self.execute(parent) for parent in self.parents[name]]
        executors = self.parsl_config.executors
        if len(executors) > 1:
            label = self.site.select_executor(job)
        else:
            label = executors[0].label
        return job.get_future(
            self.apps[label],
            [ff for ff in inputs if ff is not None],
            self.command_prefix,
            self.site.add_resources,
        )
    def load_dfk(self):
        """Load the parsl DataFlowKernel.

        This starts parsl.
        """
        if self.dfk is not None:
            raise RuntimeError("Workflow has already started.")
        set_parsl_logging(self.bps_config)
        self.dfk = parsl.load(self.parsl_config)
    def start(self):
        """Start the workflow."""
        self.initialize_jobs()
        self.load_dfk()

    def restart(self):
        """Restart the workflow after interruption."""
        self.parsl_config.checkpoint_files = parsl.utils.get_last_checkpoint()
        self.load_dfk()

    def shutdown(self):
        """Shut down the workflow.

        This stops parsl.
        """
        if self.dfk is None:
            raise RuntimeError("Workflow not started.")
        self.dfk.cleanup()
        self.dfk = None
        parsl.DataFlowKernelLoader.clear()
    def initialize_jobs(self):
        """Run initial jobs.

        These jobs are run locally before any other jobs are submitted to
        parsl.

        This is used to set up the butler.
        """
        job = self.jobs.get("pipetaskInit", None)
        if job is not None:
            os.makedirs(os.path.join(self.path, "logs"))
            job.run_local()

    def finalize_jobs(self):
        """Run final jobs.

        These jobs are run locally after all other jobs are complete.

        This is used to merge the execution butler.
        """
        if self.final is not None and not self.final.done:
            self.final.run_local()
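# Usage sketch (added commentary, not part of the original module): a fresh,
# blocking execution of a constructed workflow, combining the methods above.
#
#     workflow = ParslWorkflow.from_generic_workflow(
#         config, generic_workflow, out_prefix, service_class
#     )
#     workflow.write(out_prefix)  # so the run can be restarted later
#     workflow.start()            # run pipetaskInit locally, then load parsl
#     workflow.run()              # blocks, then shuts down and runs the final job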