Coverage for python/lsst/ctrl/bps/parsl/job.py: 22%

90 statements  

coverage.py v7.4.0, created at 2024-01-06 12:42 +0000

# This file is part of ctrl_bps_parsl.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org) and the LSST DESC (https://www.lsstdesc.org/).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import os
import re
import subprocess
from collections import defaultdict
from collections.abc import Sequence
from functools import partial
from textwrap import dedent
from typing import Any

from lsst.ctrl.bps import BpsConfig, GenericWorkflow, GenericWorkflowJob
from parsl.app.bash import BashApp
from parsl.app.futures import Future

from .configuration import get_bps_config_value

__all__ = ("get_file_paths", "ParslJob")

_env_regex = re.compile(r"<ENV:(\S+)>")  # Regex for replacing <ENV:WHATEVER> in BPS job command-lines
_file_regex = re.compile(r"<FILE:(\S+)>")  # Regex for replacing <FILE:WHATEVER> in BPS job command-lines

def run_command(
    command_line: str,
    inputs: Sequence[Future] = (),
    stdout: str | None = None,
    stderr: str | None = None,
    parsl_resource_specification: dict[str, Any] | None = None,
) -> str:
    """Run a command.

    This function exists to get information into parsl, through the
    ``inputs``, ``stdout`` and ``stderr`` parameters. It needs to be wrapped
    by a parsl ``bash_app`` decorator before use, after which it will return
    a `Future`.

    Parameters
    ----------
    command_line : `str`
        Command-line to have parsl run.
    inputs : list of `Future`
        Other commands that must have run before this.
    stdout, stderr : `str`, optional
        Filenames for stdout and stderr.
    parsl_resource_specification : `dict`, optional
        Resources required for job.

    Returns
    -------
    command_line : `str`
        Command-line to have parsl run.
    """
    return command_line
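# Illustrative sketch (not part of the original module): ``run_command`` only
# becomes useful once wrapped by a parsl ``bash_app`` decorator, which turns
# the returned command-line into a tracked bash invocation. Assuming a parsl
# configuration has already been loaded, and with a made-up executor label:
#
#     from parsl import bash_app
#
#     app = bash_app(executors=["default"])(run_command)
#     future = app("echo hello", stdout="hello.stdout", stderr="hello.stderr")
#     future.result()  # blocks until the command has finished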

def get_file_paths(workflow: GenericWorkflow, name: str) -> dict[str, str]:
    """Extract file paths for a job.

    Parameters
    ----------
    workflow : `GenericWorkflow`
        BPS workflow that knows the file paths.
    name : `str`
        Job name.

    Returns
    -------
    paths : `dict` mapping `str` to `str`
        File paths for job, indexed by symbolic name.
    """
    return {ff.name: ff.src_uri for ff in workflow.get_job_inputs(name)}
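# Illustrative sketch (not part of the original module): for a hypothetical
# job named "someJob" this returns something like
# {"butlerConfig": "/path/to/execution_butler"} (made-up values), whose keys
# are the symbolic names referenced as <FILE:...> in the job's command-line.
#
#     paths = get_file_paths(workflow, "someJob")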

class ParslJob:
    """Job to execute with parsl.

    Parameters
    ----------
    generic : `GenericWorkflowJob`
        BPS job information.
    config : `BpsConfig`
        BPS configuration.
    file_paths : `dict` mapping `str` to `str`
        File paths for job, indexed by symbolic name.
    """

    def __init__(
        self,
        generic: GenericWorkflowJob,
        config: BpsConfig,
        file_paths: dict[str, str],
    ):
        self.generic = generic
        self.name = generic.name
        self.config = config
        self.file_paths = file_paths
        self.future = None
        self.done = False

        # Determine the directory for job stdout and stderr.
        log_dir = os.path.join(get_bps_config_value(self.config, "submitPath", str, required=True), "logs")
        _, template = self.config.search(
            "subDirTemplate",
            opt={
                "curvals": {"curr_site": self.config["computeSite"], "curr_cluster": self.generic.label},
                "replaceVars": False,
                "default": "",
            },
        )
        job_vals = defaultdict(str)
        job_vals["label"] = self.generic.label
        if self.generic.tags:
            job_vals.update(self.generic.tags)
        subdir = template.format_map(job_vals)
        # Call normpath just to make the paths easier to read, as templates
        # tend to have variables that aren't used by every job. Avoid calling
        # it on an empty string, because normpath turns that into a dot.
        same_part = os.path.normpath(os.path.join(log_dir, subdir, self.name))
        self.stdout = same_part + ".stdout"
        self.stderr = same_part + ".stderr"
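    # Illustrative sketch (not part of the original module): with a
    # hypothetical submitPath of "/sub" and subDirTemplate of "{label}", a
    # job named "task_1" with label "task" would write its logs to
    # /sub/logs/task/task_1.stdout and /sub/logs/task/task_1.stderr.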

    def __reduce__(self):
        """Recipe for pickling."""
        return type(self), (self.generic, self.config, self.file_paths)

    def get_command_line(self, allow_stage=True) -> str:
        """Get the bash command-line to run to execute this job.

        Parameters
        ----------
        allow_stage : `bool`
            Allow staging of execution butler? This is not appropriate for
            the initial or final jobs that run on the local nodes.

        Returns
        -------
        command : `str`
            Command-line to execute for job.
        """
        command: str = self.generic.executable.src_uri + " " + self.generic.arguments
        if not allow_stage:
            return command
        exec_butler_dir = get_bps_config_value(self.config, "executionButlerDir", str)
        if not exec_butler_dir or not os.path.isdir(exec_butler_dir):
            # We're not using the execution butler
            return command

        # Add commands to copy the execution butler.
        # This keeps workers from overloading the sqlite database.
        # The copy can be deleted once we're done, because the original
        # execution butler contains everything that's required.
        job_dir = os.path.join(os.path.dirname(exec_butler_dir), self.name)
        # Set the butlerConfig field to the location of the job-specific copy.
        command = command.replace("<FILE:butlerConfig>", job_dir)
        return dedent(
            f"""
            if [[ ! -d {job_dir} ]]; then mkdir -p {job_dir}; fi
            cp {exec_butler_dir}/* {job_dir}
            {command}
            retcode=$?
            rm -rf {job_dir}
            exit $retcode
            """
        )

    def evaluate_command_line(self, command: str) -> str:
        """Evaluate the bash command-line.

        BPS provides a command-line with symbolic names for BPS variables,
        environment variables and files. Here, we replace those symbolic
        names with the actual values, to provide a concrete command that can
        be executed.

        In replacing file paths, we are implicitly assuming that we are
        working on a shared file system, i.e., that workers can see the
        butler directory, and that files do not need to be staged to the
        worker.

        Parameters
        ----------
        command : `str`
            Command-line to execute, from BPS.

        Returns
        -------
        command : `str`
            Command ready for execution on a worker.
        """
        command = command.format(**self.generic.cmdvals)  # BPS variables

        # Make sure *all* symbolic names are resolved.
        #
        # In general, actual values for some symbolic names may contain other
        # symbolic names. As a result, more than one iteration may be
        # required to resolve all symbolic names. For example, an actual
        # value for a filename may contain a symbolic name for an environment
        # variable.
        prev_command = command
        while True:
            command = re.sub(_env_regex, r"${\g<1>}", command)  # Environment variables
            command = re.sub(_file_regex, lambda match: self.file_paths[match.group(1)], command)  # Files
            if prev_command == command:
                break
            prev_command = command

        return command
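    # Illustrative sketch (not part of the original module): given
    # file_paths = {"butlerConfig": "/repo/butler.yaml"} (a made-up path and
    # command), the substitution loop rewrites
    #
    #     "somecommand --butler <FILE:butlerConfig> --home <ENV:HOME>"
    #
    # into
    #
    #     "somecommand --butler /repo/butler.yaml --home ${HOME}"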

    def get_resources(self) -> dict[str, Any]:
        """Return what resources are required for executing this job."""
        resources = {}
        for bps_name, parsl_name, scale in (
            ("request_memory", "memory", None),  # Both BPS and WorkQueueExecutor use MB
            ("request_cpus", "cores", None),
            ("request_disk", "disk", None),  # Both are MB
            ("request_walltime", "running_time_min", None),  # Both are minutes
        ):
            value = getattr(self.generic, bps_name)
            if scale is not None:
                value *= scale
            resources[parsl_name] = value
        return resources
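    # Illustrative sketch (not part of the original module): for a job
    # requesting 2048 MB of memory, 1 core, 4096 MB of disk and 60 minutes of
    # walltime (made-up numbers), this returns
    # {"memory": 2048, "cores": 1, "disk": 4096, "running_time_min": 60},
    # the kind of resource specification used by parsl's WorkQueueExecutor.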

    def get_future(
        self,
        app: BashApp,
        inputs: list[Future],
        command_prefix: str | None = None,
        add_resources: bool = False,
    ) -> Future | None:
        """Get the parsl app future for the job.

        This effectively queues the job for execution by a worker, subject
        to dependencies.

        Parameters
        ----------
        app : callable
            A parsl bash_app decorator to use.
        inputs : list of `Future`
            Dependencies to be satisfied before executing this job.
        command_prefix : `str`, optional
            Bash commands to execute before the job command, e.g., for
            setting the environment.
        add_resources : `bool`
            Add resource specification when submitting the job? This is only
            appropriate for the ``WorkQueue`` executor; other executors will
            raise an exception.

        Returns
        -------
        future : `Future` or `None`
            A `Future` object linked to the execution of the job, or `None`
            if the job has already been done (e.g., by ``run_local``).
        """
        if self.done:
            return None  # Nothing to do
        if not self.future:
            command = self.get_command_line()
            command = self.evaluate_command_line(command)
            if command_prefix:
                command = command_prefix + "\n" + command
            resources = self.get_resources() if add_resources else None

            # Add a layer of indirection to which we can add a useful name.
            # This name is used by parsl for tracking workflow status.
            func = partial(run_command)
            setattr(func, "__name__", self.generic.label)

            self.future = app(func)(
                command,
                inputs=inputs,
                stdout=self.stdout,
                stderr=self.stderr,
                parsl_resource_specification=resources,
            )
        return self.future
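    # Illustrative sketch (not part of the original module): a workflow layer
    # would typically create the ``app`` argument by binding ``bash_app`` to
    # an executor, then chain jobs through their futures. The executor label
    # and ``parent_futures`` below are hypothetical.
    #
    #     from parsl import bash_app
    #
    #     app = bash_app(executors=["default"])
    #     future = job.get_future(app, inputs=[f for f in parent_futures if f])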

    def run_local(self):
        """Run the command locally.

        This is intended to support jobs that should not be done by a
        worker.
        """
        if self.done:  # Nothing to do
            return
        command = self.get_command_line(False)
        command = self.evaluate_command_line(command)
        os.makedirs(os.path.dirname(self.stdout), exist_ok=True)
        os.makedirs(os.path.dirname(self.stderr), exist_ok=True)
        with open(self.stdout, "w") as stdout, open(self.stderr, "w") as stderr:
            subprocess.check_call(command, shell=True, executable="/bin/bash", stdout=stdout, stderr=stderr)
        self.done = True