Coverage for python/lsst/ctrl/bps/parsl/workflow.py: 24%

108 statements

coverage.py v7.3.2, created at 2023-11-29 09:36 +0000

# This file is part of ctrl_bps_parsl.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org) and the LSST DESC (https://www.lsstdesc.org/).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import logging
import os
import pickle
from collections.abc import Iterable, Mapping

import parsl
import parsl.config
from lsst.ctrl.bps import BaseWmsWorkflow, BpsConfig, GenericWorkflow, GenericWorkflowJob
from parsl.app.app import bash_app
from parsl.app.bash import BashApp
from parsl.app.futures import Future

from .configuration import get_bps_config_value, get_workflow_filename, set_parsl_logging
from .job import ParslJob, get_file_paths
from .site import SiteConfig

__all__ = ("ParslWorkflow", "get_parsl_config")

_log = logging.getLogger("lsst.ctrl.bps.parsl")


def get_parsl_config(config: BpsConfig) -> parsl.config.Config:
    """Construct a parsl configuration from the BPS configuration.

    For details on the site configuration, see `SiteConfig`. For details on
    the monitor configuration, see `SiteConfig.get_monitor`.

    The number of retries is set from the ``site.<computeSite>.retries``
    value.

    Parameters
    ----------
    config : `BpsConfig`
        BPS configuration.

    Returns
    -------
    parsl_config : `parsl.config.Config`
        Parsl configuration.
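
    Examples
    --------
    A minimal sketch; the BPS configuration file name here is hypothetical::

        from lsst.ctrl.bps import BpsConfig

        config = BpsConfig("bps_submit.yaml")
        parsl_config = get_parsl_config(config)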

    """
    site = SiteConfig.from_config(config)
    executors = site.get_executors()
    retries = get_bps_config_value(site.site, "retries", int, 1)
    monitor = site.get_monitor()
    return parsl.config.Config(
        executors=executors, monitoring=monitor, retries=retries, checkpoint_mode="task_exit"
    )


class ParslWorkflow(BaseWmsWorkflow):
    """Parsl-based workflow object to manage execution of a workflow.

    Parameters
    ----------
    name : `str`
        Unique name of workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        Generic workflow config.
    path : `str`
        Path prefix for workflow output files.
    jobs : `dict` mapping `str` to `ParslJob`
        Jobs to be executed.
    parents : `dict` mapping `str` to iterable of `str`
        Dependency tree. Keys are job names, and values are the names of the
        jobs that must be executed before the keyed job can be executed.
    endpoints : iterable of `str`
        Endpoints of the dependency tree. These jobs (specified by name) have
        no children.
    final : `ParslJob`, optional
        Final job to be done, e.g., to merge the execution butler. This is
        done locally.
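
    Notes
    -----
    As an illustration of the ``parents`` mapping (the job names here are
    hypothetical), ``{"jobB": ["jobA"]}`` means that ``jobA`` must finish
    before ``jobB`` can be executed.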

    """

    def __init__(
        self,
        name: str,
        config: BpsConfig,
        path: str,
        jobs: dict[str, ParslJob],
        parents: Mapping[str, Iterable[str]],
        endpoints: Iterable[str],
        final: ParslJob | None = None,
    ):
        super().__init__(name, config)

        self.path = path
        self.bps_config = config
        self.parsl_config = get_parsl_config(config)
        self.site = SiteConfig.from_config(config)
        self.dfk: parsl.DataFlowKernel | None = None  # type: ignore
        self.command_prefix = self.site.get_command_prefix()

        # These bash_app decorators wrap job commands as parsl apps, one per
        # executor, with caching enabled so that completed work is not redone.
        self.apps: dict[str, BashApp] = {
            ex.label: bash_app(  # type: ignore
                executors=[ex.label], cache=True, ignore_for_cache=["stderr", "stdout"]
            )
            for ex in self.parsl_config.executors
        }

        self.jobs = jobs
        self.parents = parents
        self.endpoints = endpoints
        self.final = final

    def __reduce__(self):
        """Recipe for pickle."""
        return type(self), (
            self.name,
            self.bps_config,
            self.path,
            self.jobs,
            self.parents,
            self.endpoints,
            self.final,
        )

    @classmethod
    def from_generic_workflow(
        cls, config: BpsConfig, generic_workflow: GenericWorkflow, out_prefix: str, service_class: str
    ) -> BaseWmsWorkflow:
        """Create a ParslWorkflow object from a BPS GenericWorkflow.

        Parameters
        ----------
        config : `BpsConfig`
            Configuration of the workflow.
        generic_workflow : `lsst.ctrl.bps.generic_workflow.GenericWorkflow`
            Generic representation of a single workflow.
        out_prefix : `str`
            Prefix for workflow output files.
        service_class : `str`
            Full module name of WMS service class that created this workflow.

        Returns
        -------
        self : `ParslWorkflow`
            Constructed workflow.
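
        Examples
        --------
        A hedged sketch of a typical invocation; the prefix and service class
        strings here are illustrative, not prescriptive::

            workflow = ParslWorkflow.from_generic_workflow(
                config,
                generic_workflow,
                out_prefix="/path/to/submit",
                service_class="lsst.ctrl.bps.parsl.ParslService",
            )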

        """
        # Generate list of jobs
        jobs: dict[str, ParslJob] = {}
        for job_name in generic_workflow:
            generic_job = generic_workflow.get_job(job_name)
            assert generic_job.name not in jobs
            jobs[job_name] = ParslJob(generic_job, config, get_file_paths(generic_workflow, job_name))

        parents = {name: set(generic_workflow.predecessors(name)) for name in jobs}
        endpoints = [name for name in jobs if generic_workflow.out_degree(name) == 0]

        # Add final job: execution butler merge
        job = generic_workflow.get_final()
        final: ParslJob | None = None
        if job is not None:
            assert isinstance(job, GenericWorkflowJob)
            final = ParslJob(job, config, get_file_paths(generic_workflow, job.name))

        return cls(generic_workflow.name, config, out_prefix, jobs, parents, endpoints, final)

    def write(self, out_prefix: str):
        """Write workflow state.

        This, in combination with the parsl checkpoint files, can be used to
        restart a workflow that was interrupted.

        Parameters
        ----------
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.
        """
        filename = get_workflow_filename(out_prefix)
        _log.info("Writing workflow with ID=%s", out_prefix)
        with open(filename, "wb") as fd:
            pickle.dump(self, fd)

    @classmethod
    def read(cls, out_prefix: str) -> "ParslWorkflow":
        """Construct from the saved workflow state.

        Parameters
        ----------
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.

        Returns
        -------
        self : `ParslWorkflow`
            Constructed workflow.
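
        Examples
        --------
        A hedged round-trip sketch; the submit directory is hypothetical::

            workflow.write("/path/to/submit")
            restored = ParslWorkflow.read("/path/to/submit")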

        """
        filename = get_workflow_filename(out_prefix)
        with open(filename, "rb") as fd:
            self = pickle.load(fd)
        assert isinstance(self, cls)
        return self

    def run(self, block: bool = True) -> list[Future | None]:
        """Run the workflow.

        Parameters
        ----------
        block : `bool`, optional
            Block returning from this method until the workflow is complete?
            If `False`, jobs may still be running when this returns, and it is
            the user's responsibility to call the ``finalize_jobs`` and
            ``shutdown`` methods when they are complete.

        Returns
        -------
        futures : `list` of `Future`
            `Future` objects linked to the execution of the endpoint jobs.
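
        Examples
        --------
        A hedged sketch of non-blocking use, in which the caller takes over
        the responsibilities described above::

            futures = workflow.run(block=False)
            for future in futures:
                if future is not None:
                    future.exception()  # Block until the job finishes
            workflow.shutdown()
            workflow.finalize_jobs()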

        """
        futures = [self.execute(name) for name in self.endpoints]
        if block:
            # Calling .exception() for each future blocks returning
            # from this method until all the jobs have executed or
            # raised an error. This is needed for running in a
            # non-interactive python process that would otherwise end
            # before the futures resolve.
            for ff in futures:
                if ff is not None:
                    ff.exception()
            self.shutdown()
            self.finalize_jobs()
        return futures

    def execute(self, name: str) -> Future | None:
        """Execute a job.

        Parameters
        ----------
        name : `str`
            Name of job to execute.

        Returns
        -------
        future : `Future` or `None`
            A `Future` object linked to the execution of the job, or `None` if
            the job is being reserved to run locally.
        """
        if name in ("pipetaskInit", "mergeExecutionButler"):
            # These get done outside of parsl
            return None
        job = self.jobs[name]
        inputs = [self.execute(parent) for parent in self.parents[name]]
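        # The recursive call above submits all parent jobs first; their
        # futures become inputs of this job, so parsl enforces the dependency
        # order. (ParslJob.get_future caches the future it creates, so a
        # parent shared by several children is not submitted twice.)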

        executors = self.parsl_config.executors
        if len(executors) > 1:
            label = self.site.select_executor(job)
        else:
            label = executors[0].label
        return job.get_future(
            self.apps[label],
            [ff for ff in inputs if ff is not None],
            self.command_prefix,
            self.site.add_resources,
        )

    def load_dfk(self):
        """Load the parsl data flow kernel (``DataFlowKernel``).

        This starts parsl.
        """
        if self.dfk is not None:
            raise RuntimeError("Workflow has already started.")
        set_parsl_logging(self.bps_config)
        self.dfk = parsl.load(self.parsl_config)

    def start(self):
        """Start the workflow."""
        self.initialize_jobs()
        self.load_dfk()

    def restart(self):
        """Restart the workflow after interruption.
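
        This reloads the most recent parsl checkpoint files, so jobs that
        already completed should not be re-run. A hedged usage sketch; the
        submit directory name is hypothetical::

            workflow = ParslWorkflow.read("/path/to/submit")
            workflow.restart()
            workflow.run()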

        """
        self.parsl_config.checkpoint_files = parsl.utils.get_last_checkpoint()
        self.load_dfk()

    def shutdown(self):
        """Shut down the workflow.

        This stops parsl.
        """
        if self.dfk is None:
            raise RuntimeError("Workflow not started.")
        self.dfk.cleanup()
        self.dfk = None
        parsl.DataFlowKernelLoader.clear()
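        # Clearing the loader resets parsl's module-level kernel so that
        # parsl.load() can be called again (e.g., by ``restart``) in the
        # same process.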

    def initialize_jobs(self):
        """Run initial jobs.

        These jobs are run locally before any other jobs are submitted to
        parsl.

        This is used to set up the butler.
        """
        job = self.jobs.get("pipetaskInit", None)
        if job is not None:
            os.makedirs(os.path.join(self.path, "logs"), exist_ok=True)
            job.run_local()

    def finalize_jobs(self):
        """Run final jobs.

        These jobs are run locally after all other jobs are complete.

        This is used to merge the execution butler.
        """
        if self.final is not None and not self.final.done:
            self.final.run_local()