Coverage for python/lsst/ctrl/bps/wms_service.py: 85%

103 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-11-01 09:55 +0000

# This file is part of ctrl_bps.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively.  If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""Base classes for working with a specific WMS."""

# Names exported by ``from <module> import *``.
__all__ = [
    "BaseWmsService",
    "BaseWmsWorkflow",
    "WmsJobReport",
    "WmsRunReport",
    "WmsStates",
]

import dataclasses
import logging
from abc import ABCMeta
from enum import Enum

# Module-level logger, named after this module.
_LOG = logging.getLogger(__name__)


class WmsStates(Enum):
    """Run and job states."""

    UNKNOWN = 0
    """Can't determine state."""

    MISFIT = 1
    """Determined state, but doesn't fit other states."""

    UNREADY = 2
    """Still waiting for parents to finish."""

    READY = 3
    """All of its parents have finished successfully."""

    PENDING = 4
    """Ready to run, visible in batch queue."""

    RUNNING = 5
    """Currently running."""

    DELETED = 6
    """In the process of being deleted or already deleted."""

    HELD = 7
    """In a hold state."""

    SUCCEEDED = 8
    """Have completed with success status."""

    FAILED = 9
    """Have completed with non-success status."""

    PRUNED = 10
    """At least one of the parents failed or can't be run."""


85@dataclasses.dataclass(slots=True) 

86class WmsJobReport: 

87 """WMS job information to be included in detailed report output""" 

88 

89 wms_id: str 

90 """Job id assigned by the workflow management system.""" 

91 

92 name: str 

93 """A name assigned automatically by BPS.""" 

94 

95 label: str 

96 """A user-facing label for a job. Multiple jobs can have the same label.""" 

97 

98 state: WmsStates 

99 """Job's current execution state.""" 

100 

101 

102@dataclasses.dataclass(slots=True) 

103class WmsRunReport: 

104 """WMS run information to be included in detailed report output""" 

105 

106 wms_id: str = None 

107 """Id assigned to the run by the WMS. 

108 """ 

109 

110 global_wms_id: str = None 

111 """Global run identification number. 

112 

113 Only applicable in the context of a WMS using distributed job queues 

114 (e.g., HTCondor). 

115 """ 

116 

117 path: str = None 

118 """Path to the submit directory.""" 

119 

120 label: str = None 

121 """Run's label.""" 

122 

123 run: str = None 

124 """Run's name.""" 

125 

126 project: str = None 

127 """Name of the project run belongs to.""" 

128 

129 campaign: str = None 

130 """Name of the campaign the run belongs to.""" 

131 

132 payload: str = None 

133 """Name of the payload.""" 

134 

135 operator: str = None 

136 """Username of the operator who submitted the run.""" 

137 

138 run_summary: str = None 

139 """Job counts per label.""" 

140 

141 state: WmsStates = None 

142 """Run's execution state.""" 

143 

144 jobs: list[WmsJobReport] = None 

145 """Information about individual jobs in the run.""" 

146 

147 total_number_jobs: int = None 

148 """Total number of jobs in the run.""" 

149 

150 job_state_counts: dict[WmsStates, int] = None 

151 """Job counts per state.""" 

152 

153 job_summary: dict[str, dict[WmsStates, int]] = None 

154 """Job counts per label and per state. 

155 """ 

156 

157 

class BaseWmsService:
    """Interface for interactions with a specific WMS.

    Every method except the constructor raises `NotImplementedError`;
    concrete WMS plugins are expected to override the ones they support.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration needed by the WMS service.
    """

    def __init__(self, config):
        self.config = config

    def prepare(self, config, generic_workflow, out_prefix=None):
        """Create submission for a generic workflow for a specific WMS.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic representation of a single workflow.
        out_prefix : `str`, optional
            Prefix for all WMS output files.

        Returns
        -------
        wms_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
            Prepared WMS Workflow to submit for execution.
        """
        raise NotImplementedError

    def submit(self, workflow):
        """Submit a single WMS workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
            Prepared WMS Workflow to submit for execution.
        """
        raise NotImplementedError

    def restart(self, wms_workflow_id):
        """Restart a workflow from the point of failure.

        Parameters
        ----------
        wms_workflow_id : `str`
            Id that can be used by WMS service to identify the workflow that
            needs to be restarted.

        Returns
        -------
        wms_id : `str`
            Id of the restarted workflow. If restart failed, it will be set
            to None.
        run_name : `str`
            Name of the restarted workflow. If restart failed, it will be set
            to None.
        message : `str`
            A message describing any issues encountered during the restart.
            If there were no issues, an empty string is returned.
        """
        raise NotImplementedError

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.
        is_global : `bool`, optional
            If set, all available job queues will be queried for job
            information.  Defaults to False which means that only a local job
            queue will be queried for information.

            Only applicable in the context of a WMS using distributed job
            queues (e.g., HTCondor). A WMS with a centralized job queue
            (e.g. PanDA) can safely ignore it.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions.  Typically
            this means top-level jobs (i.e., not children jobs).
        """
        raise NotImplementedError

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False):
        """Query WMS for status of submitted WMS workflows.

        Parameters
        ----------
        wms_workflow_id : `int` or `str`, optional
            Id that can be used by WMS service to look up status.
        user : `str`, optional
            Limit report to submissions by this particular user.
        hist : `int`, optional
            Number of days to expand report to include finished WMS workflows.
        pass_thru : `str`, optional
            Additional arguments to pass through to the specific WMS service.
        is_global : `bool`, optional
            If set, all available job queues will be queried for job
            information.  Defaults to False which means that only a local job
            queue will be queried for information.

            Only applicable in the context of a WMS using distributed job
            queues (e.g., HTCondor). A WMS with a centralized job queue
            (e.g. PanDA) can safely ignore it.

        Returns
        -------
        run_reports : `list` [`lsst.ctrl.bps.WmsRunReport`]
            Status information for submitted WMS workflows.
        message : `str`
            Message to user on how to find more status information specific to
            this particular WMS.
        """
        raise NotImplementedError

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            ID or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether successful deletion or not.  Currently, if any doubt or any
            individual jobs not deleted, return False.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        raise NotImplementedError

    def run_submission_checks(self):
        """Check to run at start if running WMS specific submission steps.

        Any exception other than NotImplementedError will halt submission.
        Submit directory may not yet exist when this is called.
        """
        raise NotImplementedError

    def ping(self, pass_thru=None):
        """Check whether WMS services are up, reachable, and can authenticate
        if authentication is required.

        The services to be checked are those needed for submit, report, cancel,
        restart, but ping cannot guarantee whether jobs would actually run
        successfully.

        Parameters
        ----------
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        status : `int`
            0 for success, non-zero for failure.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        raise NotImplementedError


class BaseWmsWorkflow(metaclass=ABCMeta):
    """Interface for single workflow specific to a WMS.

    Parameters
    ----------
    name : `str`
        Unique name of workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        Generic workflow config.
    """

    def __init__(self, name, config):
        self.name = name
        self.config = config
        # The remaining attributes start out unset; nothing in this base
        # class assigns them a value.
        self.service_class = None
        self.run_id = None
        self.submit_path = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        """Create a WMS-specific workflow from a GenericWorkflow.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            Configuration values needed for generating a WMS specific workflow.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow from which to create the WMS-specific one.
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.
        service_class : `str`
            Full module name of WMS service class that created this workflow.

        Returns
        -------
        wms_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
            A WMS specific workflow.
        """
        raise NotImplementedError

    def write(self, out_prefix):
        """Write WMS files for this particular workflow.

        Parameters
        ----------
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.
        """
        raise NotImplementedError