Coverage for python/lsst/ctrl/bps/wms_service.py: 86%

105 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-01-20 11:11 +0000

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <https://www.gnu.org/licenses/>. 

27 

28"""Base classes for working with a specific WMS.""" 

29 

30 

# Names exported by ``from ... import *``; the public API of this module.
__all__ = [
    "BaseWmsService",
    "BaseWmsWorkflow",
    "WmsJobReport",
    "WmsRunReport",
    "WmsStates",
]

38 

39 

40import dataclasses 

41import logging 

42from abc import ABCMeta 

43from enum import Enum 

44 

# Module-level logger named after this module.
_LOG = logging.getLogger(__name__)

46 

47 

class WmsStates(Enum):
    """Run and job states.

    Each member is documented by the string literal that follows it.
    """

    UNKNOWN = 0
    """Can't determine state."""

    MISFIT = 1
    """Determined state, but doesn't fit other states."""

    UNREADY = 2
    """Still waiting for parents to finish."""

    READY = 3
    """All of its parents have finished successfully."""

    PENDING = 4
    """Ready to run, visible in batch queue."""

    RUNNING = 5
    """Currently running."""

    DELETED = 6
    """In the process of being deleted or already deleted."""

    HELD = 7
    """In a hold state."""

    SUCCEEDED = 8
    """Have completed with success status."""

    FAILED = 9
    """Have completed with non-success status."""

    PRUNED = 10
    """At least one of the parents failed or can't be run."""

83 

84 

@dataclasses.dataclass(slots=True)
class WmsJobReport:
    """WMS job information to be included in detailed report output.

    All four fields are required and are accepted by the generated
    constructor in the order they are declared here.
    """

    wms_id: str
    """Job id assigned by the workflow management system."""

    name: str
    """A name assigned automatically by BPS."""

    label: str
    """A user-facing label for a job. Multiple jobs can have the same label."""

    state: WmsStates
    """Job's current execution state."""

100 

101 

@dataclasses.dataclass(slots=True)
class WmsRunReport:
    """WMS run information to be included in detailed report output.

    Every field is optional and defaults to `None`, so a report can be
    created empty and filled in field by field.
    """

    wms_id: str | None = None
    """Id assigned to the run by the WMS."""

    global_wms_id: str | None = None
    """Global run identification number.

    Only applicable in the context of a WMS using distributed job queues
    (e.g., HTCondor).
    """

    path: str | None = None
    """Path to the submit directory."""

    label: str | None = None
    """Run's label."""

    run: str | None = None
    """Run's name."""

    project: str | None = None
    """Name of the project run belongs to."""

    campaign: str | None = None
    """Name of the campaign the run belongs to."""

    payload: str | None = None
    """Name of the payload."""

    operator: str | None = None
    """Username of the operator who submitted the run."""

    run_summary: str | None = None
    """Job counts per label."""

    state: WmsStates | None = None
    """Run's execution state."""

    jobs: list[WmsJobReport] | None = None
    """Information about individual jobs in the run."""

    total_number_jobs: int | None = None
    """Total number of jobs in the run."""

    job_state_counts: dict[WmsStates, int] | None = None
    """Job counts per state."""

    job_summary: dict[str, dict[WmsStates, int]] | None = None
    """Job counts per label and per state."""

    exit_code_summary: dict[str, list[int]] | None = None
    """Summary of non-zero exit codes per job label
    available through the WMS.
    """

160 

161 

class BaseWmsService:
    """Interface for interactions with a specific WMS.

    Every operation defined here raises `NotImplementedError`; only the
    constructor does real work (it stores the configuration).

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration needed by the WMS service.
    """

    def __init__(self, config):
        self.config = config

    def prepare(self, config, generic_workflow, out_prefix=None):
        """Create submission for a generic workflow for a specific WMS.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic representation of a single workflow.
        out_prefix : `str`, optional
            Prefix for all WMS output files.

        Returns
        -------
        wms_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
            Prepared WMS Workflow to submit for execution.
        """
        raise NotImplementedError

    def submit(self, workflow):
        """Submit a single WMS workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
            Prepared WMS Workflow to submit for execution.
        """
        raise NotImplementedError

    def restart(self, wms_workflow_id):
        """Restart a workflow from the point of failure.

        Parameters
        ----------
        wms_workflow_id : `str`
            Id that can be used by WMS service to identify the workflow that
            needs to be restarted.

        Returns
        -------
        wms_id : `str`
            Id of the restarted workflow. If restart failed, it will be set
            to `None`.
        run_name : `str`
            Name of the restarted workflow. If restart failed, it will be set
            to `None`.
        message : `str`
            A message describing any issues encountered during the restart.
            If there were no issues, an empty string is returned.
        """
        raise NotImplementedError

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.
        is_global : `bool`, optional
            If set, all available job queues will be queried for job
            information. Defaults to False which means that only a local job
            queue will be queried for information.

            Only applicable in the context of a WMS using distributed job
            queues (e.g., HTCondor). A WMS with a centralized job queue
            (e.g. PanDA) can safely ignore it.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions. Typically
            this means top-level jobs (i.e., not children jobs).
        """
        raise NotImplementedError

    def report(
        self,
        wms_workflow_id=None,
        user=None,
        hist=0,
        pass_thru=None,
        is_global=False,
        return_exit_codes=False,
    ):
        """Query WMS for status of submitted WMS workflows.

        Parameters
        ----------
        wms_workflow_id : `int` or `str`, optional
            Id that can be used by WMS service to look up status.
        user : `str`, optional
            Limit report to submissions by this particular user.
        hist : `int`, optional
            Number of days to expand report to include finished WMS workflows.
        pass_thru : `str`, optional
            Additional arguments to pass through to the specific WMS service.
        is_global : `bool`, optional
            If set, all available job queues will be queried for job
            information. Defaults to False which means that only a local job
            queue will be queried for information.

            Only applicable in the context of a WMS using distributed job
            queues (e.g., HTCondor). A WMS with a centralized job queue
            (e.g. PanDA) can safely ignore it.
        return_exit_codes : `bool`, optional
            If set, return exit codes related to jobs with a
            non-success status. Defaults to False, which means that only
            the summary state is returned.

            Only applicable in the context of a WMS with associated
            handlers to return exit codes from jobs.

        Returns
        -------
        run_reports : `list` [`lsst.ctrl.bps.WmsRunReport`]
            Status information for submitted WMS workflows.
        message : `str`
            Message to user on how to find more status information specific to
            this particular WMS.
        """
        raise NotImplementedError

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            ID or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether successful deletion or not. Currently, if any doubt or any
            individual jobs not deleted, return False.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        raise NotImplementedError

    def run_submission_checks(self):
        """Check to run at start if running WMS specific submission steps.

        Any exception other than NotImplementedError will halt submission.
        Submit directory may not yet exist when this is called.
        """
        raise NotImplementedError

    def ping(self, pass_thru=None):
        """Check whether WMS services are up, reachable, and can authenticate
        if authentication is required.

        The services to be checked are those needed for submit, report, cancel,
        restart, but ping cannot guarantee whether jobs would actually run
        successfully.

        Parameters
        ----------
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        status : `int`
            0 for success, non-zero for failure.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        raise NotImplementedError

355 

356 

class BaseWmsWorkflow(metaclass=ABCMeta):
    """Interface for single workflow specific to a WMS.

    Parameters
    ----------
    name : `str`
        Unique name of workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        Generic workflow config.

    Notes
    -----
    Besides ``name`` and ``config``, the constructor creates the attributes
    ``service_class``, ``run_id`` and ``submit_path`` and sets each of them
    to `None`.
    """

    def __init__(self, name, config):
        self.name = name
        self.config = config
        self.service_class = None
        self.run_id = None
        self.submit_path = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        """Create a WMS-specific workflow from a GenericWorkflow.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            Configuration values needed for generating a WMS specific workflow.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow from which to create the WMS-specific one.
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.
        service_class : `str`
            Full module name of WMS service class that created this workflow.

        Returns
        -------
        wms_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
            A WMS specific workflow.
        """
        raise NotImplementedError

    def write(self, out_prefix):
        """Write WMS files for this particular workflow.

        Parameters
        ----------
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.
        """
        raise NotImplementedError