Coverage for python/lsst/ctrl/bps/parsl/workflow.py: 23%

107 statements  

coverage.py v7.2.7, created at 2023-08-12 09:37 +0000

import logging
import os
import pickle
from collections.abc import Iterable, Mapping

import parsl
import parsl.config
from lsst.ctrl.bps import BaseWmsWorkflow, BpsConfig, GenericWorkflow, GenericWorkflowJob
from parsl.app.app import bash_app
from parsl.app.futures import Future

from .configuration import get_bps_config_value, get_workflow_filename, set_parsl_logging
from .job import ParslJob, get_file_paths
from .site import SiteConfig

__all__ = ("ParslWorkflow", "get_parsl_config")

_log = logging.getLogger("lsst.ctrl.bps.parsl")


def get_parsl_config(config: BpsConfig) -> parsl.config.Config:
    """Construct a parsl configuration from the BPS configuration.

    For details on the site configuration, see `SiteConfig`. For details on
    the monitor configuration, see `SiteConfig.get_monitor`.

    The number of retries is set from the ``site.<computeSite>.retries``
    value.

    Parameters
    ----------
    config : `BpsConfig`
        BPS configuration.

    Returns
    -------
    parsl_config : `parsl.config.Config`
        Parsl configuration.
    """
    site = SiteConfig.from_config(config)
    executors = site.get_executors()
    retries = get_bps_config_value(site.site, "retries", int, 1)
    monitor = site.get_monitor()
    return parsl.config.Config(
        executors=executors, monitoring=monitor, retries=retries, checkpoint_mode="task_exit"
    )
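

# Illustrative example (not part of the original module): a minimal sketch of
# driving get_parsl_config from an in-memory BPS configuration. The site name
# "local", the site class "lsst.ctrl.bps.parsl.sites.Local", and the assumption
# that BpsConfig accepts a plain mapping are hypothetical, for illustration only.
def _example_parsl_config() -> parsl.config.Config:
    bps_config = BpsConfig(
        {
            "computeSite": "local",
            "site": {
                "local": {
                    # Site class providing the executors and monitor (assumed name).
                    "class": "lsst.ctrl.bps.parsl.sites.Local",
                    # Picked up by get_parsl_config as the parsl-level task retries.
                    "retries": 3,
                }
            },
        }
    )
    return get_parsl_config(bps_config)
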

class ParslWorkflow(BaseWmsWorkflow):
    """Parsl-based workflow object to manage execution of a workflow.

    Parameters
    ----------
    name : `str`
        Unique name of workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        Generic workflow config.
    path : `str`
        Path prefix for workflow output files.
    jobs : `dict` mapping `str` to `ParslJob`
        Jobs to be executed.
    parents : `dict` mapping `str` to iterable of `str`
        Dependency tree. Keys are job names, and values are the names of the
        jobs that must be executed before the keyed job can be executed.
    endpoints : iterable of `str`
        Endpoints of the dependency tree. These jobs (specified by name) have
        no children.
    final : `ParslJob`, optional
        Final job to be done, e.g., to merge the execution butler. This is
        done locally.
    """
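
    # Illustrative note (hypothetical job names): for a linear workflow
    # jobA -> jobB -> jobC, the constructor arguments would look like
    #     jobs = {"jobA": ..., "jobB": ..., "jobC": ...}
    #     parents = {"jobA": set(), "jobB": {"jobA"}, "jobC": {"jobB"}}
    #     endpoints = ["jobC"]
    # Each job maps to the jobs that must finish before it can run, and the
    # endpoints are the jobs with no children.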

    def __init__(
        self,
        name: str,
        config: BpsConfig,
        path: str,
        jobs: dict[str, ParslJob],
        parents: Mapping[str, Iterable[str]],
        endpoints: Iterable[str],
        final: ParslJob | None = None,
    ):
        super().__init__(name, config)

        self.path = path
        self.bps_config = config
        self.parsl_config = get_parsl_config(config)
        self.site = SiteConfig.from_config(config)
        self.dfk: parsl.DataFlowKernel | None = None  # type: ignore
        self.command_prefix = self.site.get_command_prefix()

        # These are function decorators, keyed by executor label, used to turn
        # job command lines into parsl bash apps.
        self.apps = {
            ex.label: bash_app(executors=[ex.label], cache=True, ignore_for_cache=["stderr", "stdout"])
            for ex in self.parsl_config.executors
        }

        self.jobs = jobs
        self.parents = parents
        self.endpoints = endpoints
        self.final = final

    def __reduce__(self):
        """Recipe for pickling."""
        return type(self), (
            self.name,
            self.bps_config,
            self.path,
            self.jobs,
            self.parents,
            self.endpoints,
            self.final,
        )
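
    # Note: unpickling therefore re-invokes __init__ with these arguments, which
    # rebuilds the members that are not pickled (the parsl configuration, the
    # per-executor apps, and the DataFlowKernel handle, which is reset to None).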

    @classmethod
    def from_generic_workflow(
        cls, config: BpsConfig, generic_workflow: GenericWorkflow, out_prefix: str, service_class: str
    ) -> BaseWmsWorkflow:
        """Create a ParslWorkflow object from a BPS GenericWorkflow.

        Parameters
        ----------
        config : `BpsConfig`
            Configuration of the workflow.
        generic_workflow : `lsst.ctrl.bps.generic_workflow.GenericWorkflow`
            Generic representation of a single workflow.
        out_prefix : `str`
            Prefix for workflow output files.
        service_class : `str`
            Full module name of the WMS service class that created this
            workflow.

        Returns
        -------
        self : `ParslWorkflow`
            Constructed workflow.
        """
        # Generate the mapping from job name to ParslJob
        jobs: dict[str, ParslJob] = {}
        for job_name in generic_workflow:
            generic_job = generic_workflow.get_job(job_name)
            assert generic_job.name not in jobs
            jobs[job_name] = ParslJob(generic_job, config, get_file_paths(generic_workflow, job_name))

        parents = {name: set(generic_workflow.predecessors(name)) for name in jobs}
        endpoints = [name for name in jobs if generic_workflow.out_degree(name) == 0]

        # Add final job: execution butler merge
        job = generic_workflow.get_final()
        final: ParslJob | None = None
        if job is not None:
            assert isinstance(job, GenericWorkflowJob)
            final = ParslJob(job, config, get_file_paths(generic_workflow, job.name))

        return cls(generic_workflow.name, config, out_prefix, jobs, parents, endpoints, final)

    def write(self, out_prefix: str):
        """Write the workflow state.

        This, in combination with the parsl checkpoint files, can be used to
        restart a workflow that was interrupted.

        Parameters
        ----------
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.
        """
        filename = get_workflow_filename(out_prefix)
        _log.info("Writing workflow with ID=%s", out_prefix)
        with open(filename, "wb") as fd:
            pickle.dump(self, fd)

    @classmethod
    def read(cls, out_prefix: str) -> "ParslWorkflow":
        """Construct a workflow from the saved state.

        Parameters
        ----------
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.

        Returns
        -------
        self : `ParslWorkflow`
            Constructed workflow.
        """
        filename = get_workflow_filename(out_prefix)
        with open(filename, "rb") as fd:
            self = pickle.load(fd)
        assert isinstance(self, cls)
        return self

    def run(self, block: bool = True) -> list[Future | None]:
        """Run the workflow.

        Parameters
        ----------
        block : `bool`, optional
            Whether to block in this method until the workflow is complete.
            If `False`, jobs may still be running when this returns, and it
            is the user's responsibility to call the ``finalize_jobs`` and
            ``shutdown`` methods when they are complete.

        Returns
        -------
        futures : `list` of `Future` or `None`
            `Future` objects linked to the execution of the endpoint jobs.
        """
        futures = [self.execute(name) for name in self.endpoints]
        if block:
            # Calling .exception() for each future blocks returning
            # from this method until all the jobs have executed or
            # raised an error. This is needed for running in a
            # non-interactive python process that would otherwise end
            # before the futures resolve.
            for ff in futures:
                if ff is not None:
                    ff.exception()
            self.shutdown()
            self.finalize_jobs()
        return futures
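
    # Illustrative usage (hypothetical driver code): run without blocking and
    # finish up manually once the endpoint futures have resolved.
    #     futures = workflow.run(block=False)
    #     ...  # do other work while jobs execute
    #     for future in futures:
    #         if future is not None:
    #             future.exception()  # wait for this endpoint job to finish
    #     workflow.shutdown()
    #     workflow.finalize_jobs()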

    def execute(self, name: str) -> Future | None:
        """Execute a job.

        Parameters
        ----------
        name : `str`
            Name of job to execute.

        Returns
        -------
        future : `Future` or `None`
            A `Future` object linked to the execution of the job, or `None`
            if the job is being reserved to run locally.
        """
        if name in ("pipetaskInit", "mergeExecutionButler"):
            # These get done outside of parsl
            return None
        job = self.jobs[name]
        inputs = [self.execute(parent) for parent in self.parents[name]]
        executors = self.parsl_config.executors
        if len(executors) > 1:
            label = self.site.select_executor(job)
        else:
            label = executors[0].label
        return job.get_future(
            self.apps[label],
            [ff for ff in inputs if ff is not None],
            self.command_prefix,
            self.site.add_resources,
        )

    def load_dfk(self):
        """Load the parsl data flow kernel (DFK).

        This starts parsl.
        """
        if self.dfk is not None:
            raise RuntimeError("Workflow has already started.")
        set_parsl_logging(self.bps_config)
        self.dfk = parsl.load(self.parsl_config)

    def start(self):
        """Start the workflow."""
        self.initialize_jobs()
        self.load_dfk()

    def restart(self):
        """Restart the workflow after an interruption."""
        self.parsl_config.checkpoint_files = parsl.utils.get_last_checkpoint()
        self.load_dfk()

    def shutdown(self):
        """Shut down the workflow.

        This stops parsl.
        """
        if self.dfk is None:
            raise RuntimeError("Workflow not started.")
        self.dfk.cleanup()
        self.dfk = None
        parsl.DataFlowKernelLoader.clear()

    def initialize_jobs(self):
        """Run initial jobs.

        These jobs are run locally before any other jobs are submitted to
        parsl.

        This is used to set up the butler.
        """
        job = self.jobs.get("pipetaskInit", None)
        if job is not None:
            # exist_ok so a rerun does not fail if the logs directory is already there
            os.makedirs(os.path.join(self.path, "logs"), exist_ok=True)
            job.run_local()

    def finalize_jobs(self):
        """Run final jobs.

        These jobs are run locally after all other jobs are complete.

        This is used to merge the execution butler.
        """
        if self.final is not None and not self.final.done:
            self.final.run_local()
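

# Illustrative driver sketches (not part of the original module). The variable
# and service-class names here are hypothetical; a real submission normally goes
# through ``bps submit`` rather than calling these methods directly.
def _example_fresh_run(config: BpsConfig, generic_workflow: GenericWorkflow, out_prefix: str) -> None:
    # Build the workflow, persist its state so it can be restarted, then run it.
    workflow = ParslWorkflow.from_generic_workflow(
        config, generic_workflow, out_prefix, service_class="lsst.ctrl.bps.parsl.ParslService"
    )
    workflow.write(out_prefix)
    workflow.start()  # runs pipetaskInit locally, then loads parsl
    workflow.run(block=True)  # blocks; also shuts down parsl and runs the final merge job


def _example_restart(out_prefix: str) -> None:
    # Resume an interrupted run from the pickled state and the parsl checkpoints.
    workflow = ParslWorkflow.read(out_prefix)
    workflow.restart()
    workflow.run(block=True)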