Coverage for python/lsst/ctrl/bps/parsl/workflow.py: 23%
107 statements

# This file is part of ctrl_bps_parsl.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org) and the LSST DESC (https://www.lsstdesc.org/).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import logging
import os
import pickle
from collections.abc import Iterable, Mapping

import parsl
import parsl.config
from lsst.ctrl.bps import BaseWmsWorkflow, BpsConfig, GenericWorkflow, GenericWorkflowJob
from parsl.app.app import bash_app
from parsl.app.futures import Future

from .configuration import get_bps_config_value, get_workflow_filename, set_parsl_logging
from .job import ParslJob, get_file_paths
from .site import SiteConfig

__all__ = ("ParslWorkflow", "get_parsl_config")

_log = logging.getLogger("lsst.ctrl.bps.parsl")


def get_parsl_config(config: BpsConfig) -> parsl.config.Config:
    """Construct parsl configuration from BPS configuration.

    For details on the site configuration, see `SiteConfig`. For details on
    the monitor configuration, see `SiteConfig.get_monitor`.

    The number of retries is set from the ``site.<computeSite>.retries``
    value.

    Parameters
    ----------
    config : `BpsConfig`
        BPS configuration.

    Returns
    -------
    parsl_config : `parsl.config.Config`
        Parsl configuration.
    """
    site = SiteConfig.from_config(config)
    executors = site.get_executors()
    retries = get_bps_config_value(site.site, "retries", int, 1)
    monitor = site.get_monitor()
    return parsl.config.Config(
        executors=executors, monitoring=monitor, retries=retries, checkpoint_mode="task_exit"
    )
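
# Usage sketch (hedged): given a BPS configuration that defines
# ``computeSite`` and the corresponding site section, the parsl
# configuration can be built and loaded directly. ``bps_config.yaml``
# is a hypothetical filename.
#
#     from lsst.ctrl.bps import BpsConfig
#
#     config = BpsConfig("bps_config.yaml")
#     parsl_config = get_parsl_config(config)
#     parsl.load(parsl_config)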


class ParslWorkflow(BaseWmsWorkflow):
    """Parsl-based workflow object to manage execution of workflow.

    Parameters
    ----------
    name : `str`
        Unique name of workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        Generic workflow config.
    path : `str`
        Path prefix for workflow output files.
    jobs : `dict` mapping `str` to `ParslJob`
        Jobs to be executed.
    parents : `dict` mapping `str` to iterable of `str`
        Dependency tree. Keys are job names, and values are lists of the job
        names that must be executed before the key job can be executed; a
        small sketch of these structures follows this docstring.
    endpoints : iterable of `str`
        Endpoints of the dependency tree. These jobs (specified by name) have
        no children.
    final : `ParslJob`, optional
        Final job to be done, e.g., to merge the execution butler. This is
        done locally.
    """

    def __init__(
        self,
        name: str,
        config: BpsConfig,
        path: str,
        jobs: dict[str, ParslJob],
        parents: Mapping[str, Iterable[str]],
        endpoints: Iterable[str],
        final: ParslJob | None = None,
    ):
        super().__init__(name, config)

        self.path = path
        self.bps_config = config
        self.parsl_config = get_parsl_config(config)
        self.site = SiteConfig.from_config(config)
        self.dfk: parsl.DataFlowKernel | None = None  # type: ignore
        self.command_prefix = self.site.get_command_prefix()

        # These are function decorators, one per executor label.
        self.apps = {
            ex.label: bash_app(executors=[ex.label], cache=True, ignore_for_cache=["stderr", "stdout"])
            for ex in self.parsl_config.executors
        }

        self.jobs = jobs
        self.parents = parents
        self.endpoints = endpoints
        self.final = final

    def __reduce__(self):
        """Recipe for pickle."""
        return type(self), (
            self.name,
            self.bps_config,
            self.path,
            self.jobs,
            self.parents,
            self.endpoints,
            self.final,
        )

    @classmethod
    def from_generic_workflow(
        cls, config: BpsConfig, generic_workflow: GenericWorkflow, out_prefix: str, service_class: str
    ) -> BaseWmsWorkflow:
        """Create a ParslWorkflow object from a BPS GenericWorkflow.

        Parameters
        ----------
        config : `BpsConfig`
            Configuration of the workflow.
        generic_workflow : `lsst.ctrl.bps.generic_workflow.GenericWorkflow`
            Generic representation of a single workflow.
        out_prefix : `str`
            Prefix for workflow output files.
        service_class : `str`
            Full module name of WMS service class that created this workflow.

        Returns
        -------
        self : `ParslWorkflow`
            Constructed workflow.
        """
        # Generate list of jobs
        jobs: dict[str, ParslJob] = {}
        for job_name in generic_workflow:
            generic_job = generic_workflow.get_job(job_name)
            assert generic_job.name not in jobs
            jobs[job_name] = ParslJob(generic_job, config, get_file_paths(generic_workflow, job_name))

        parents = {name: set(generic_workflow.predecessors(name)) for name in jobs}
        endpoints = [name for name in jobs if generic_workflow.out_degree(name) == 0]

        # Add final job: execution butler merge
        job = generic_workflow.get_final()
        final: ParslJob | None = None
        if job is not None:
            assert isinstance(job, GenericWorkflowJob)
            final = ParslJob(job, config, get_file_paths(generic_workflow, job.name))

        return cls(generic_workflow.name, config, out_prefix, jobs, parents, endpoints, final)

    def write(self, out_prefix: str):
        """Write workflow state.

        This, in combination with the parsl checkpoint files, can be used to
        restart a workflow that was interrupted.

        Parameters
        ----------
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.
        """
        filename = get_workflow_filename(out_prefix)
        _log.info("Writing workflow with ID=%s", out_prefix)
        with open(filename, "wb") as fd:
            pickle.dump(self, fd)

    @classmethod
    def read(cls, out_prefix: str) -> "ParslWorkflow":
        """Construct from the saved workflow state.

        Parameters
        ----------
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.

        Returns
        -------
        self : `ParslWorkflow`
            Constructed workflow.
        """
        filename = get_workflow_filename(out_prefix)
        with open(filename, "rb") as fd:
            self = pickle.load(fd)
        assert isinstance(self, cls)
        return self
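
    # Persistence sketch: ``write`` pickles the workflow (via
    # ``__reduce__``) under the submit directory and ``read`` restores
    # it; ``submit_dir`` is a hypothetical path.
    #
    #     workflow.write(submit_dir)
    #     ...
    #     restored = ParslWorkflow.read(submit_dir)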

    def run(self, block: bool = True) -> list[Future | None]:
        """Run the workflow.

        Parameters
        ----------
        block : `bool`, optional
            Block returning from this method until the workflow is complete?
            If `False`, jobs may still be running when this returns, and it
            is the user's responsibility to call the ``finalize_jobs`` and
            ``shutdown`` methods when they are complete.

        Returns
        -------
        futures : `list` of `Future`
            `Future` objects linked to the execution of the endpoint jobs.
        """
        futures = [self.execute(name) for name in self.endpoints]
        if block:
            # Calling .exception() for each future blocks returning
            # from this method until all the jobs have executed or
            # raised an error. This is needed for running in a
            # non-interactive python process that would otherwise end
            # before the futures resolve.
            for ff in futures:
                if ff is not None:
                    ff.exception()
            self.shutdown()
            self.finalize_jobs()
        return futures
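
    # Non-blocking usage sketch, following the docstring above
    # (``workflow`` is a started instance):
    #
    #     futures = workflow.run(block=False)
    #     ...  # do other work while jobs execute
    #     for future in futures:
    #         if future is not None:
    #             future.exception()  # wait for completion
    #     workflow.shutdown()
    #     workflow.finalize_jobs()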

    def execute(self, name: str) -> Future | None:
        """Execute a job.

        Parameters
        ----------
        name : `str`
            Name of job to execute.

        Returns
        -------
        future : `Future` or `None`
            A `Future` object linked to the execution of the job, or `None`
            if the job is being reserved to run locally.
        """
        if name in ("pipetaskInit", "mergeExecutionButler"):
            # These get done outside of parsl
            return None
        job = self.jobs[name]
        inputs = [self.execute(parent) for parent in self.parents[name]]
        executors = self.parsl_config.executors
        if len(executors) > 1:
            label = self.site.select_executor(job)
        else:
            label = executors[0].label
        return job.get_future(
            self.apps[label],
            [ff for ff in inputs if ff is not None],
            self.command_prefix,
            self.site.add_resources,
        )
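
    # Dependency-wiring sketch: ``execute`` recurses through
    # ``self.parents``, so calling it on an endpoint yields a future
    # whose inputs are the futures of all upstream jobs. For the chain
    # jobA -> jobB -> jobC (hypothetical names):
    #
    #     future = workflow.execute("jobC")  # schedules jobA and jobB too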

    def load_dfk(self):
        """Load data flow kernel.

        This starts parsl.
        """
        if self.dfk is not None:
            raise RuntimeError("Workflow has already started.")
        set_parsl_logging(self.bps_config)
        self.dfk = parsl.load(self.parsl_config)

    def start(self):
        """Start the workflow."""
        self.initialize_jobs()
        self.load_dfk()

    def restart(self):
        """Restart the workflow after interruption."""
        self.parsl_config.checkpoint_files = parsl.utils.get_last_checkpoint()
        self.load_dfk()
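
    # Restart sketch: after an interruption, reconstruct the workflow
    # from its saved state and resume from the last parsl checkpoint
    # (``submit_dir`` is a hypothetical path):
    #
    #     workflow = ParslWorkflow.read(submit_dir)
    #     workflow.restart()
    #     workflow.run(block=True)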

    def shutdown(self):
        """Shut down the workflow.

        This stops parsl.
        """
        if self.dfk is None:
            raise RuntimeError("Workflow not started.")
        self.dfk.cleanup()
        self.dfk = None
        parsl.DataFlowKernelLoader.clear()

    def initialize_jobs(self):
        """Run initial jobs.

        These jobs are run locally before any other jobs are submitted to
        parsl.

        This is used to set up the butler.
        """
        job = self.jobs.get("pipetaskInit", None)
        if job is not None:
            # exist_ok guards against a pre-existing logs directory,
            # e.g., from an interrupted earlier attempt.
            os.makedirs(os.path.join(self.path, "logs"), exist_ok=True)
            job.run_local()

    def finalize_jobs(self):
        """Run final jobs.

        These jobs are run locally after all other jobs are complete.

        This is used to merge the execution butler.
        """
        if self.final is not None and not self.final.done:
            self.final.run_local()
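

# End-to-end lifecycle, as a hedged sketch. In normal operation these
# calls are made by the ctrl_bps machinery rather than by hand;
# ``submit_dir`` and the service-class string are illustrative.
#
#     workflow = ParslWorkflow.from_generic_workflow(
#         config, generic_workflow, submit_dir, "lsst.ctrl.bps.parsl.ParslService"
#     )
#     workflow.write(submit_dir)  # save state for possible restart
#     workflow.start()            # run pipetaskInit locally, start parsl
#     workflow.run(block=True)    # blocks; also shuts down and finalizes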