Coverage for python/lsst/ctrl/bps/wms_service.py: 86%

105 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2024-05-01 16:05 -0700

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <https://www.gnu.org/licenses/>. 

27 

28"""Base classes for working with a specific WMS.""" 

29 

30 

31__all__ = [ 

32 "BaseWmsService", 

33 "BaseWmsWorkflow", 

34 "WmsJobReport", 

35 "WmsRunReport", 

36 "WmsStates", 

37] 

38 

39 

40import dataclasses 

41import logging 

42from abc import ABCMeta 

43from enum import Enum 

44 

45_LOG = logging.getLogger(__name__) 

46 

47 

class WmsStates(Enum):
    """Execution states that a run or an individual job can be in."""

    UNKNOWN = 0
    """The state could not be determined."""

    MISFIT = 1
    """A state was determined, but it matches none of the other states."""

    UNREADY = 2
    """Parents have not finished yet."""

    READY = 3
    """Every parent has finished successfully."""

    PENDING = 4
    """Ready to run and visible in the batch queue."""

    RUNNING = 5
    """Executing right now."""

    DELETED = 6
    """Already deleted or in the process of being deleted."""

    HELD = 7
    """Placed in a hold state."""

    SUCCEEDED = 8
    """Completed with a success status."""

    FAILED = 9
    """Completed with a non-success status."""

    PRUNED = 10
    """At least one parent failed or cannot be run."""

83 

84 

85@dataclasses.dataclass(slots=True) 

86class WmsJobReport: 

87 """WMS job information to be included in detailed report output.""" 

88 

89 wms_id: str 

90 """Job id assigned by the workflow management system.""" 

91 

92 name: str 

93 """A name assigned automatically by BPS.""" 

94 

95 label: str 

96 """A user-facing label for a job. Multiple jobs can have the same label.""" 

97 

98 state: WmsStates 

99 """Job's current execution state.""" 

100 

101 

102@dataclasses.dataclass(slots=True) 

103class WmsRunReport: 

104 """WMS run information to be included in detailed report output.""" 

105 

106 wms_id: str = None 

107 """Id assigned to the run by the WMS. 

108 """ 

109 

110 global_wms_id: str = None 

111 """Global run identification number. 

112 

113 Only applicable in the context of a WMS using distributed job queues 

114 (e.g., HTCondor). 

115 """ 

116 

117 path: str = None 

118 """Path to the submit directory.""" 

119 

120 label: str = None 

121 """Run's label.""" 

122 

123 run: str = None 

124 """Run's name.""" 

125 

126 project: str = None 

127 """Name of the project run belongs to.""" 

128 

129 campaign: str = None 

130 """Name of the campaign the run belongs to.""" 

131 

132 payload: str = None 

133 """Name of the payload.""" 

134 

135 operator: str = None 

136 """Username of the operator who submitted the run.""" 

137 

138 run_summary: str = None 

139 """Job counts per label.""" 

140 

141 state: WmsStates = None 

142 """Run's execution state.""" 

143 

144 jobs: list[WmsJobReport] = None 

145 """Information about individual jobs in the run.""" 

146 

147 total_number_jobs: int = None 

148 """Total number of jobs in the run.""" 

149 

150 job_state_counts: dict[WmsStates, int] = None 

151 """Job counts per state.""" 

152 

153 job_summary: dict[str, dict[WmsStates, int]] = None 

154 """Job counts per label and per state.""" 

155 

156 exit_code_summary: dict[str, list[int]] = None 

157 """Summary of non-zero exit codes per job label available through the WMS. 

158 

159 Currently behavior for jobs that were canceled, held, etc. are plugin 

160 dependent. 

161 """ 

162 

163 

class BaseWmsService:
    """Interface for interactions with a specific WMS.

    The constructor only stores the configuration. Every other method
    raises `NotImplementedError` in this base class and is meant to be
    overridden by a WMS-specific subclass.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration needed by the WMS service.
    """

    def __init__(self, config):
        self.config = config

    def prepare(self, config, generic_workflow, out_prefix=None):
        """Create submission for a generic workflow for a specific WMS.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic representation of a single workflow.
        out_prefix : `str`, optional
            Prefix for all WMS output files.

        Returns
        -------
        wms_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
            Prepared WMS Workflow to submit for execution.
        """
        raise NotImplementedError

    def submit(self, workflow, **kwargs):
        """Submit a single WMS workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
            Prepared WMS Workflow to submit for execution.
        **kwargs : `~typing.Any`
            Additional modifiers to the configuration.
        """
        raise NotImplementedError

    def restart(self, wms_workflow_id):
        """Restart a workflow from the point of failure.

        Parameters
        ----------
        wms_workflow_id : `str`
            Id that can be used by WMS service to identify the workflow that
            needs to be restarted.

        Returns
        -------
        wms_id : `str`
            Id of the restarted workflow. If restart failed, it will be set
            to `None`.
        run_name : `str`
            Name of the restarted workflow. If restart failed, it will be set
            to `None`.
        message : `str`
            A message describing any issues encountered during the restart.
            If there were no issues, an empty string is returned.
        """
        raise NotImplementedError

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.
        is_global : `bool`, optional
            If set, all available job queues will be queried for job
            information. Defaults to False which means that only a local job
            queue will be queried for information.

            Only applicable in the context of a WMS using distributed job
            queues (e.g., HTCondor). A WMS with a centralized job queue
            (e.g. PanDA) can safely ignore it.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions. Typically
            this means top-level jobs (i.e., not children jobs).
        """
        raise NotImplementedError

    def report(
        self,
        wms_workflow_id=None,
        user=None,
        hist=0,
        pass_thru=None,
        is_global=False,
        return_exit_codes=False,
    ):
        """Query WMS for status of submitted WMS workflows.

        Parameters
        ----------
        wms_workflow_id : `int` or `str`, optional
            Id that can be used by WMS service to look up status.
        user : `str`, optional
            Limit report to submissions by this particular user.
        hist : `int`, optional
            Number of days to expand report to include finished WMS workflows.
        pass_thru : `str`, optional
            Additional arguments to pass through to the specific WMS service.
        is_global : `bool`, optional
            If set, all available job queues will be queried for job
            information. Defaults to False which means that only a local job
            queue will be queried for information.

            Only applicable in the context of a WMS using distributed job
            queues (e.g., HTCondor). A WMS with a centralized job queue
            (e.g. PanDA) can safely ignore it.
        return_exit_codes : `bool`, optional
            If set, return exit codes related to jobs with a
            non-success status. Defaults to False, which means that only
            the summary state is returned.

            Only applicable in the context of a WMS with associated
            handlers to return exit codes from jobs.

        Returns
        -------
        run_reports : `list` [`lsst.ctrl.bps.WmsRunReport`]
            Status information for submitted WMS workflows.
        message : `str`
            Message to user on how to find more status information specific to
            this particular WMS.
        """
        raise NotImplementedError

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            ID or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether successful deletion or not. Currently, if any doubt or any
            individual jobs not deleted, return False.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        raise NotImplementedError

    def run_submission_checks(self):
        """Check to run at start if running WMS specific submission steps.

        Any exception other than NotImplementedError will halt submission.
        Submit directory may not yet exist when this is called.
        """
        raise NotImplementedError

    # ``pass_thru`` defaults to None so the signature agrees with its
    # "optional" documentation and with cancel/report/list_submitted_jobs.
    def ping(self, pass_thru=None):
        """Check whether WMS services are up, reachable, and can authenticate
        if authentication is required.

        The services to be checked are those needed for submit, report, cancel,
        restart, but ping cannot guarantee whether jobs would actually run
        successfully.

        Parameters
        ----------
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        status : `int`
            0 for success, non-zero for failure.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        raise NotImplementedError

359 

360 

class BaseWmsWorkflow(metaclass=ABCMeta):
    """Interface for a single workflow tied to a particular WMS.

    Parameters
    ----------
    name : `str`
        Unique name of workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        Generic workflow config.
    """

    def __init__(self, name, config):
        self.name = name
        self.config = config
        # These start out unset; nothing in this base class fills them in.
        self.service_class = self.run_id = self.submit_path = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        """Build a WMS-specific workflow out of a GenericWorkflow.

        The base class provides no implementation and always raises
        `NotImplementedError`.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            Configuration values needed for generating a WMS specific workflow.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow from which to create the WMS-specific one.
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.
        service_class : `str`
            Full module name of WMS service class that created this workflow.

        Returns
        -------
        wms_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
            A WMS specific workflow.
        """
        raise NotImplementedError

    def write(self, out_prefix):
        """Write out the WMS files belonging to this workflow.

        The base class provides no implementation and always raises
        `NotImplementedError`.

        Parameters
        ----------
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.
        """
        raise NotImplementedError