Coverage for python/lsst/ctrl/bps/parsl/workflow.py: 23% (107 statements)

# This file is part of ctrl_bps_parsl.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org) and the LSST DESC (https://www.lsstdesc.org/).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import logging
import os
import pickle
from collections.abc import Iterable, Mapping

import parsl
import parsl.config
from lsst.ctrl.bps import BaseWmsWorkflow, BpsConfig, GenericWorkflow, GenericWorkflowJob
from parsl.app.app import bash_app
from parsl.app.futures import Future

from .configuration import get_bps_config_value, get_workflow_filename, set_parsl_logging
from .job import ParslJob, get_file_paths
from .site import SiteConfig

__all__ = ("ParslWorkflow", "get_parsl_config")

_log = logging.getLogger("lsst.ctrl.bps.parsl")


def get_parsl_config(config: BpsConfig) -> parsl.config.Config:
    """Construct a parsl configuration from the BPS configuration.

    For details on the site configuration, see `SiteConfig`. For details on
    the monitor configuration, see `SiteConfig.get_monitor`.

    The number of retries is set from the ``site.<computeSite>.retries``
    value.

    Parameters
    ----------
    config : `BpsConfig`
        BPS configuration.

    Returns
    -------
    parsl_config : `parsl.config.Config`
        Parsl configuration.
    """

    site = SiteConfig.from_config(config)
    executors = site.get_executors()
    retries = get_bps_config_value(site.site, "retries", int, 1)
    monitor = site.get_monitor()

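    # checkpoint_mode="task_exit" makes parsl record a checkpoint as each
    # task completes, which is what allows an interrupted workflow to be
    # resumed (see ``ParslWorkflow.restart``) without re-running jobs that
    # already finished.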
    return parsl.config.Config(
        executors=executors, monitoring=monitor, retries=retries, checkpoint_mode="task_exit"
    )


class ParslWorkflow(BaseWmsWorkflow):
    """Parsl-based workflow object to manage execution of a workflow.

    Parameters
    ----------
    name : `str`
        Unique name of workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        Generic workflow config.
    path : `str`
        Path prefix for workflow output files.
    jobs : `dict` mapping `str` to `ParslJob`
        Jobs to be executed.
    parents : `dict` mapping `str` to iterable of `str`
        Dependency tree. Keys are job names, and values are the names of the
        jobs that must be executed before the keyed job can be executed.
    endpoints : iterable of `str`
        Endpoints of the dependency tree. These jobs (specified by name) have
        no children.
    final : `ParslJob`, optional
        Final job to be run locally, e.g., to merge the execution butler.
    """

    def __init__(
        self,
        name: str,
        config: BpsConfig,
        path: str,
        jobs: dict[str, ParslJob],
        parents: Mapping[str, Iterable[str]],
        endpoints: Iterable[str],
        final: ParslJob | None = None,
    ):
        super().__init__(name, config)

        self.path = path
        self.bps_config = config
        self.parsl_config = get_parsl_config(config)
        self.site = SiteConfig.from_config(config)
        self.dfk: parsl.DataFlowKernel | None = None  # type: ignore
        self.command_prefix = self.site.get_command_prefix()

        # bash_app decorators, one per executor; stderr/stdout are excluded
        # from the cache key so that log paths do not affect checkpointing.
        self.apps = {
            ex.label: bash_app(executors=[ex.label], cache=True, ignore_for_cache=["stderr", "stdout"])
            for ex in self.parsl_config.executors
        }

        self.jobs = jobs
        self.parents = parents
        self.endpoints = endpoints
        self.final = final

    def __reduce__(self):
        """Recipe for pickle."""

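        # Unpickling calls the constructor with these arguments, so runtime
        # state (notably the DataFlowKernel in ``self.dfk``) is deliberately
        # not preserved; it is recreated via ``start`` or ``restart``.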
        return type(self), (
            self.name,
            self.bps_config,
            self.path,
            self.jobs,
            self.parents,
            self.endpoints,
            self.final,
        )

    @classmethod
    def from_generic_workflow(
        cls, config: BpsConfig, generic_workflow: GenericWorkflow, out_prefix: str, service_class: str
    ) -> BaseWmsWorkflow:
        """Create a ParslWorkflow object from a BPS GenericWorkflow.

        Parameters
        ----------
        config : `BpsConfig`
            Configuration of the workflow.
        generic_workflow : `lsst.ctrl.bps.generic_workflow.GenericWorkflow`
            Generic representation of a single workflow.
        out_prefix : `str`
            Prefix for workflow output files.
        service_class : `str`
            Full module name of the WMS service class that created this
            workflow.

        Returns
        -------
        self : `ParslWorkflow`
            Constructed workflow.
        """

        # Generate list of jobs
        jobs: dict[str, ParslJob] = {}
        for job_name in generic_workflow:
            generic_job = generic_workflow.get_job(job_name)
            assert generic_job.name not in jobs
            jobs[job_name] = ParslJob(generic_job, config, get_file_paths(generic_workflow, job_name))

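        # A job's parents are its direct predecessors in the workflow graph;
        # endpoints are jobs with no outgoing edges (no children), so
        # scheduling the endpoints pulls in the whole graph via their
        # dependencies.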
        parents = {name: set(generic_workflow.predecessors(name)) for name in jobs}
        endpoints = [name for name in jobs if generic_workflow.out_degree(name) == 0]

        # Add final job: execution butler merge
        job = generic_workflow.get_final()
        final: ParslJob | None = None
        if job is not None:
            assert isinstance(job, GenericWorkflowJob)
            final = ParslJob(job, config, get_file_paths(generic_workflow, job.name))

        return cls(generic_workflow.name, config, out_prefix, jobs, parents, endpoints, final)

    def write(self, out_prefix: str):
        """Write workflow state.

        This, in combination with the parsl checkpoint files, can be used to
        restart a workflow that was interrupted.

        Parameters
        ----------
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.
        """
        filename = get_workflow_filename(out_prefix)
        _log.info("Writing workflow with ID=%s", out_prefix)
        with open(filename, "wb") as fd:
            pickle.dump(self, fd)

    @classmethod
    def read(cls, out_prefix: str) -> "ParslWorkflow":
        """Construct from the saved workflow state.

        Parameters
        ----------
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.

        Returns
        -------
        self : `ParslWorkflow`
            Constructed workflow.
        """
        filename = get_workflow_filename(out_prefix)
        with open(filename, "rb") as fd:
            self = pickle.load(fd)
        assert isinstance(self, cls)
        return self

    def run(self, block: bool = True) -> list[Future | None]:
        """Run the workflow.

        Parameters
        ----------
        block : `bool`, optional
            Block returning from this method until the workflow is complete?
            If `False`, jobs may still be running when this returns, and it
            is the user's responsibility to call the ``finalize_jobs`` and
            ``shutdown`` methods when they are complete.

        Returns
        -------
        futures : `list` of `Future` or `None`
            `Future` objects linked to the execution of the endpoint jobs,
            or `None` for endpoint jobs that are reserved to run locally.
        """

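        # Scheduling the endpoint jobs recursively schedules all of their
        # ancestors as well (see ``execute``).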
        futures = [self.execute(name) for name in self.endpoints]
        if block:
            # Calling .exception() for each future blocks returning from
            # this method until all the jobs have executed or raised an
            # error. This is needed for running in a non-interactive python
            # process that would otherwise end before the futures resolve.
            for ff in futures:
                if ff is not None:
                    ff.exception()
            self.shutdown()
            self.finalize_jobs()
        return futures

    def execute(self, name: str) -> Future | None:
        """Execute a job.

        Parameters
        ----------
        name : `str`
            Name of job to execute.

        Returns
        -------
        future : `Future` or `None`
            A `Future` object linked to the execution of the job, or `None`
            if the job is being reserved to run locally.
        """
        if name in ("pipetaskInit", "mergeExecutionButler"):
            # These are run locally, outside of parsl; see
            # ``initialize_jobs`` and ``finalize_jobs``.
            return None

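        # Recursively schedule this job's parents; their futures are passed
        # as inputs so that parsl defers this job until its dependencies
        # have completed.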
        job = self.jobs[name]
        inputs = [self.execute(parent) for parent in self.parents[name]]

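        # Choose which executor runs the job: let the site configuration
        # decide when there is more than one, otherwise use the only one.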
        executors = self.parsl_config.executors
        if len(executors) > 1:
            label = self.site.select_executor(job)
        else:
            label = executors[0].label
        return job.get_future(
            self.apps[label],
            [ff for ff in inputs if ff is not None],
            self.command_prefix,
            self.site.add_resources,
        )

    def load_dfk(self):
        """Load the parsl DataFlowKernel.

        This starts parsl.
        """
        if self.dfk is not None:
            raise RuntimeError("Workflow has already started.")
        set_parsl_logging(self.bps_config)
        self.dfk = parsl.load(self.parsl_config)

    def start(self):
        """Start the workflow."""
        self.initialize_jobs()
        self.load_dfk()

    def restart(self):
        """Restart the workflow after interruption."""

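        # Point parsl at the checkpoints from the last run so that tasks
        # recorded there are not re-executed.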
        self.parsl_config.checkpoint_files = parsl.utils.get_last_checkpoint()
        self.load_dfk()

    def shutdown(self):
        """Shut down the workflow.

        This stops parsl.
        """
        if self.dfk is None:
            raise RuntimeError("Workflow not started.")
        self.dfk.cleanup()
        self.dfk = None

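        # Clear parsl's module-level kernel reference so a fresh one can be
        # loaded later (e.g., by ``restart``).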
        parsl.DataFlowKernelLoader.clear()

    def initialize_jobs(self):
        """Run initial jobs.

        These jobs are run locally before any other jobs are submitted to
        parsl.

        This is used to set up the butler.
        """
        job = self.jobs.get("pipetaskInit", None)
        if job is not None:
            os.makedirs(os.path.join(self.path, "logs"))
            job.run_local()

    def finalize_jobs(self):
        """Run final jobs.

        These jobs are run locally after all other jobs are complete.

        This is used to merge the execution butler.
        """
        if self.final is not None and not self.final.done:
            self.final.run_local()