Coverage for python/lsst/ctrl/bps/wms_service.py: 85%

103 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-01-26 10:15 +0000

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Base classes for working with a specific WMS""" 

23 

24 

# Names exported by ``from <this module> import *``; each entry is a class
# defined in this module.
__all__ = [
    "BaseWmsService",
    "BaseWmsWorkflow",
    "WmsJobReport",
    "WmsRunReport",
    "WmsStates",
]

32 

33 

34import dataclasses 

35import logging 

36from abc import ABCMeta 

37from enum import Enum 

38 

# Module-level logger, named after this module's import path.
_LOG = logging.getLogger(__name__)

40 

41 

class WmsStates(Enum):
    """Execution states that a run or a job can be reported in."""

    UNKNOWN = 0
    """The state could not be determined."""

    MISFIT = 1
    """The state was determined, but it matches none of the other states."""

    UNREADY = 2
    """Parents have not finished yet."""

    READY = 3
    """Every parent has finished successfully."""

    PENDING = 4
    """Ready to run and visible in the batch queue."""

    RUNNING = 5
    """Running right now."""

    DELETED = 6
    """Being deleted, or already deleted."""

    HELD = 7
    """Placed in a hold state."""

    SUCCEEDED = 8
    """Completed with a success status."""

    FAILED = 9
    """Completed with a non-success status."""

    PRUNED = 10
    """One or more parents failed or cannot be run."""

77 

78 

79@dataclasses.dataclass(slots=True) 

80class WmsJobReport: 

81 """WMS job information to be included in detailed report output""" 

82 

83 wms_id: str 

84 """Job id assigned by the workflow management system.""" 

85 

86 name: str 

87 """A name assigned automatically by BPS.""" 

88 

89 label: str 

90 """A user-facing label for a job. Multiple jobs can have the same label.""" 

91 

92 state: WmsStates 

93 """Job's current execution state.""" 

94 

95 

96@dataclasses.dataclass(slots=True) 

97class WmsRunReport: 

98 """WMS run information to be included in detailed report output""" 

99 

100 wms_id: str = None 

101 """Id assigned to the run by the WMS. 

102 """ 

103 

104 global_wms_id: str = None 

105 """Global run identification number. 

106 

107 Only applicable in the context of a WMS using distributed job queues 

108 (e.g., HTCondor). 

109 """ 

110 

111 path: str = None 

112 """Path to the submit directory.""" 

113 

114 label: str = None 

115 """Run's label.""" 

116 

117 run: str = None 

118 """Run's name.""" 

119 

120 project: str = None 

121 """Name of the project run belongs to.""" 

122 

123 campaign: str = None 

124 """Name of the campaign the run belongs to.""" 

125 

126 payload: str = None 

127 """Name of the payload.""" 

128 

129 operator: str = None 

130 """Username of the operator who submitted the run.""" 

131 

132 run_summary: str = None 

133 """Job counts per label.""" 

134 

135 state: WmsStates = None 

136 """Run's execution state.""" 

137 

138 jobs: list[WmsJobReport] = None 

139 """Information about individual jobs in the run.""" 

140 

141 total_number_jobs: int = None 

142 """Total number of jobs in the run.""" 

143 

144 job_state_counts: dict[WmsStates, int] = None 

145 """Job counts per state.""" 

146 

147 job_summary: dict[str, dict[WmsStates, int]] = None 

148 """Job counts per label and per state. 

149 """ 

150 

151 

class BaseWmsService:
    """Interface for interactions with a specific WMS.

    This base class only defines the interface: apart from the constructor,
    every method raises `NotImplementedError`.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration needed by the WMS service.
    """

    def __init__(self, config):
        self.config = config

    def prepare(self, config, generic_workflow, out_prefix=None):
        """Create submission for a generic workflow for a specific WMS.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic representation of a single workflow.
        out_prefix : `str`, optional
            Prefix for all WMS output files.

        Returns
        -------
        wms_workflow : `BaseWmsWorkflow`
            Prepared WMS Workflow to submit for execution.
        """
        raise NotImplementedError

    def submit(self, workflow):
        """Submit a single WMS workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
            Prepared WMS Workflow to submit for execution.
        """
        raise NotImplementedError

    def restart(self, wms_workflow_id):
        """Restart a workflow from the point of failure.

        Parameters
        ----------
        wms_workflow_id : `str`
            Id that can be used by WMS service to identify the workflow that
            needs to be restarted.

        Returns
        -------
        wms_id : `str`
            Id of the restarted workflow. If restart failed, it will be set
            to None.
        run_name : `str`
            Name of the restarted workflow. If restart failed, it will be set
            to None.
        message : `str`
            A message describing any issues encountered during the restart.
            If there were no issues, an empty string is returned.
        """
        raise NotImplementedError

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.
        is_global : `bool`, optional
            If set, all available job queues will be queried for job
            information. Defaults to False which means that only a local job
            queue will be queried for information.

            Only applicable in the context of a WMS using distributed job
            queues (e.g., HTCondor). A WMS with a centralized job queue
            (e.g. PanDA) can safely ignore it.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions. Typically
            this means top-level jobs (i.e., not children jobs).
        """
        raise NotImplementedError

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False):
        """Query WMS for status of submitted WMS workflows.

        Parameters
        ----------
        wms_workflow_id : `int` or `str`, optional
            Id that can be used by WMS service to look up status.
        user : `str`, optional
            Limit report to submissions by this particular user.
        hist : `int`, optional
            Number of days to expand report to include finished WMS workflows.
        pass_thru : `str`, optional
            Additional arguments to pass through to the specific WMS service.
        is_global : `bool`, optional
            If set, all available job queues will be queried for job
            information. Defaults to False which means that only a local job
            queue will be queried for information.

            Only applicable in the context of a WMS using distributed job
            queues (e.g., HTCondor). A WMS with a centralized job queue
            (e.g. PanDA) can safely ignore it.

        Returns
        -------
        run_reports : `list` [`lsst.ctrl.bps.WmsRunReport`]
            Status information for submitted WMS workflows.
        message : `str`
            Message to user on how to find more status information specific to
            this particular WMS.
        """
        raise NotImplementedError

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            ID or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether successful deletion or not. Currently, if any doubt or any
            individual jobs not deleted, return False.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        raise NotImplementedError

    def run_submission_checks(self):
        """Checks to run at start if running WMS specific submission steps.

        Any exception other than NotImplementedError will halt submission.
        Submit directory may not yet exist when this is called.
        """
        raise NotImplementedError

    # ``pass_thru`` defaults to None so the signature matches its documented
    # "optional" status and the other methods that accept ``pass_thru``.
    def ping(self, pass_thru=None):
        """Checks whether WMS services are up, reachable, and can authenticate
        if authentication is required.

        The services to be checked are those needed for submit, report, cancel,
        restart, but ping cannot guarantee whether jobs would actually run
        successfully.

        Parameters
        ----------
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        status : `int`
            0 for success, non-zero for failure.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        raise NotImplementedError

330 

331 

class BaseWmsWorkflow(metaclass=ABCMeta):
    """Interface for a single workflow tied to one particular WMS.

    The constructor stores the given name and config and initializes
    ``service_class``, ``run_id`` and ``submit_path`` to `None`.

    Parameters
    ----------
    name : `str`
        Unique name of workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        Generic workflow config.
    """

    def __init__(self, name, config):
        self.name = name
        self.config = config
        # These three start out unset.
        self.service_class = None
        self.run_id = None
        self.submit_path = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        """Build a WMS-specific workflow out of a GenericWorkflow.

        This base implementation always raises `NotImplementedError`.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            Configuration values needed for generating a WMS specific workflow.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow from which to create the WMS-specific one.
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.
        service_class : `str`
            Full module name of WMS service class that created this workflow.

        Returns
        -------
        wms_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
            A WMS specific workflow.
        """
        raise NotImplementedError

    def write(self, out_prefix):
        """Write out the WMS files belonging to this workflow.

        This base implementation always raises `NotImplementedError`.

        Parameters
        ----------
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.
        """
        raise NotImplementedError