Coverage for python/lsst/ctrl/bps/panda/panda_service.py: 12%

163 statements  

« prev     ^ index     » next       coverage.py v7.3.3, created at 2023-12-20 17:41 +0000

1# This file is part of ctrl_bps_panda. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <https://www.gnu.org/licenses/>. 

27"""Interface between the generic workflow and the PanDA/iDDS workflow system. 

28""" 

29 

30 

31__all__ = ["PanDAService", "PandaBpsWmsWorkflow"] 

32 

33 

34import json 

35import logging 

36import os 

37import pickle 

38import re 

39 

40from idds.workflowv2.workflow import Workflow as IDDS_client_workflow 

41from lsst.ctrl.bps import BaseWmsService, BaseWmsWorkflow, WmsRunReport, WmsStates 

42from lsst.ctrl.bps.panda.constants import PANDA_DEFAULT_MAX_COPY_WORKERS 

43from lsst.ctrl.bps.panda.utils import ( 

44 add_final_idds_work, 

45 add_idds_work, 

46 copy_files_for_distribution, 

47 get_idds_client, 

48 get_idds_result, 

49) 

50 

51_LOG = logging.getLogger(__name__) 

52 

53 

class PanDAService(BaseWmsService):
    """PanDA version of WMS service.

    Every operation obtains an iDDS client from ``get_idds_client`` using the
    service configuration and delegates the request to it.
    """

    def prepare(self, config, generic_workflow, out_prefix=None):
        # Docstring inherited from BaseWmsService.prepare.
        _LOG.debug("out_prefix = '%s'", out_prefix)
        # The fully qualified name of this service class is handed to the
        # workflow factory as its ``service_class`` argument.
        workflow = PandaBpsWmsWorkflow.from_generic_workflow(
            config, generic_workflow, out_prefix, f"{self.__class__.__module__}.{self.__class__.__name__}"
        )
        workflow.write(out_prefix)
        return workflow

    def submit(self, workflow):
        # Docstring inherited from BaseWmsService.submit.
        _, max_copy_workers = self.config.search(
            "maxCopyWorkers", opt={"default": PANDA_DEFAULT_MAX_COPY_WORKERS}
        )
        file_distribution_uri = self.config["fileDistributionEndPoint"]
        lsst_temp = "LSST_RUN_TEMP_SPACE"
        # When the configured end point mentions LSST_RUN_TEMP_SPACE but that
        # variable is not set in the environment, use the default end point.
        if lsst_temp in file_distribution_uri and lsst_temp not in os.environ:
            file_distribution_uri = self.config["fileDistributionEndPointDefault"]

        copy_files_for_distribution(workflow.files_to_pre_stage, file_distribution_uri, max_copy_workers)

        idds_client = get_idds_client(self.config)
        ret = idds_client.submit(workflow.idds_client_workflow, username=None, use_dataset_name=False)
        _LOG.debug("iDDS client manager submit returned = %s", ret)

        # Check submission success
        status, result, error = get_idds_result(ret)
        if not status:
            raise RuntimeError(f"Error submitting to PanDA service: {error}")

        request_id = int(result)
        _LOG.info("Submitted into iDDs with request id=%s", request_id)
        workflow.run_id = request_id

    def restart(self, wms_workflow_id):
        # Docstring inherited from BaseWmsService.restart.
        idds_client = get_idds_client(self.config)
        ret = idds_client.retry(request_id=wms_workflow_id)
        _LOG.debug("Restart PanDA workflow returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            _LOG.info("Restarting PanDA workflow %s", result)
            return wms_workflow_id, None, json.dumps(result)

        return None, None, f"Error retry PanDA workflow: {str(error)}"

    @staticmethod
    def _get_nonzero_exit_codes(idds_client, wms_workflow_id, transform_workload_id):
        """Collect the non-zero exit codes of the jobs of a single task.

        Parameters
        ----------
        idds_client
            Client object providing ``get_contents_output_ext``.
        wms_workflow_id
            Id of the request the task belongs to.
        transform_workload_id
            Workload id of the task whose jobs are inspected.

        Returns
        -------
        exit_codes : `list`
            The ``trans_exit_code`` value of every job for which it is
            neither `None` nor zero.

        Raises
        ------
        RuntimeError
            Raised if the client reports a non-zero request status or if the
            returned job information does not contain exactly one entry.
        """
        new_ret = idds_client.get_contents_output_ext(
            request_id=wms_workflow_id, workload_id=transform_workload_id
        )
        request_status = new_ret[0]
        if request_status != 0:
            raise RuntimeError(f"Error to get workflow status: {new_ret} for id: {wms_workflow_id}")

        # task_info is a dictionary of len 1 that contains a list
        # of dicts containing panda job info
        task_info = new_ret[1][1]
        if len(task_info) != 1:
            raise RuntimeError(
                f"Unexpected job return from PanDA: {task_info} for id: {transform_workload_id}"
            )
        (wmsjobs,) = task_info.values()

        return [
            wmsjob["trans_exit_code"]
            for wmsjob in wmsjobs
            if wmsjob["trans_exit_code"] is not None and int(wmsjob["trans_exit_code"]) != 0
        ]

    def report(
        self,
        wms_workflow_id=None,
        user=None,
        hist=0,
        pass_thru=None,
        is_global=False,
        return_exit_codes=False,
    ):
        # Docstring inherited from BaseWmsService.report.
        #
        # NOTE(review): ``user``, ``hist``, ``pass_thru``, ``is_global`` and
        # ``return_exit_codes`` are not read here; exit codes are gathered
        # for every SubFinished/Failed task regardless of the flag.
        message = ""
        run_reports = []

        if not wms_workflow_id:
            message = "Run summary not implemented yet, use 'bps report --id <workflow_id>' instead"
            return run_reports, message

        idds_client = get_idds_client(self.config)
        ret = idds_client.get_requests(request_id=wms_workflow_id, with_detail=True)
        _LOG.debug("PanDA get workflow status returned = %s", str(ret))

        request_status = ret[0]
        if request_status != 0:
            raise RuntimeError(f"Error to get workflow status: {ret} for id: {wms_workflow_id}")

        tasks = ret[1][1]
        if not tasks:
            message = f"No records found for workflow id '{wms_workflow_id}'. Hint: double check the id"
            return run_reports, message

        # Request-level fields are taken from the first returned record.
        head = tasks[0]
        wms_report = WmsRunReport(
            wms_id=str(head["request_id"]),
            operator=head["username"],
            project="",
            campaign="",
            payload="",
            run=head["name"],
            state=WmsStates.UNKNOWN,
            total_number_jobs=0,
            job_state_counts={state: 0 for state in WmsStates},
            job_summary={},
            run_summary="",
            exit_code_summary={},
        )

        # The status of a task is taken from the first item of state_map.
        # The workflow is in status WmsStates.FAILED when:
        #      All tasks have failed.
        # SubFinished tasks has jobs in
        #      output_processed_files: Finished
        #      output_failed_files: Failed
        #      output_missing_files: Missing
        state_map = {
            "Finished": [WmsStates.SUCCEEDED],
            "SubFinished": [
                WmsStates.SUCCEEDED,
                WmsStates.FAILED,
                WmsStates.PRUNED,
            ],
            "Transforming": [
                WmsStates.RUNNING,
                WmsStates.SUCCEEDED,
                WmsStates.FAILED,
                WmsStates.UNREADY,
                WmsStates.PRUNED,
            ],
            "Failed": [WmsStates.FAILED, WmsStates.PRUNED],
        }

        # Task record field holding the job count for each reported state.
        file_map = {
            WmsStates.SUCCEEDED: "output_processed_files",
            WmsStates.RUNNING: "output_processing_files",
            WmsStates.FAILED: "output_failed_files",
            WmsStates.UNREADY: "input_new_files",
            WmsStates.PRUNED: "output_missing_files",
        }

        # workflow status to report as SUCCEEDED
        wf_status = ["Finished", "SubFinished", "Transforming"]

        wf_succeed = False

        tasks.sort(key=lambda x: x["transform_workload_id"])

        # The run name is literal text, not a pattern: escape it so that
        # regex metacharacters in the name cannot change what is stripped.
        run_prefix = re.compile(re.escape(wms_report.run + "_"))

        exit_codes_all = {}
        expected_jobs = []
        # Loop over all tasks data returned by idds_client
        for task in tasks:
            exit_codes = []
            totaljobs = task["output_total_files"]
            wms_report.total_number_jobs += totaljobs
            tasklabel = run_prefix.sub("", task["transform_name"])
            status = task["transform_status"]["attributes"]["_name_"]
            taskstatus = {}

            # if the state is failed, gather exit code information
            if status in ["SubFinished", "Failed"]:
                exit_codes = self._get_nonzero_exit_codes(
                    idds_client, wms_workflow_id, task["transform_workload_id"]
                )
            # Every task gets an entry, empty when no exit codes were gathered.
            exit_codes_all[tasklabel] = exit_codes

            # A status without a mapping contributes no job counts instead
            # of aborting the whole report with a KeyError.
            mapped_states = state_map.get(status)
            if mapped_states is None:
                _LOG.warning(
                    "Unrecognized iDDS status '%s' for task '%s'; no jobs counted for it", status, tasklabel
                )
                mapped_states = []

            # Fill number of jobs in all WmsStates
            for state in WmsStates:
                njobs = 0
                # Only states both mapped to the iDDS status and backed by a
                # task record field carry a job count.
                if state in file_map and state in mapped_states:
                    count = task[file_map[state]]
                    if count is not None:
                        njobs = count
                    if state == WmsStates.RUNNING:
                        njobs += task["output_new_files"] - task["input_new_files"]
                wms_report.job_state_counts[state] += njobs
                taskstatus[state] = njobs
            wms_report.job_summary[tasklabel] = taskstatus

            # To fill the EXPECTED column
            expected_jobs.append(f"{tasklabel}:{str(totaljobs)}")

            if status in wf_status:
                wf_succeed = True
                wms_report.state = state_map[status][0]

        wms_report.run_summary = ";".join(expected_jobs)

        # All tasks have failed, set the workflow FAILED
        if not wf_succeed:
            wms_report.state = WmsStates.FAILED
        wms_report.exit_code_summary = exit_codes_all
        run_reports.append(wms_report)

        return run_reports, message

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
        # Docstring inherited from BaseWmsService.list_submitted_jobs.
        if wms_id is None and user is not None:
            raise RuntimeError(
                "Error to get workflow status report: wms_id is required"
                " and filtering workflows with 'user' is not supported."
            )

        idds_client = get_idds_client(self.config)
        ret = idds_client.get_requests(request_id=wms_id)
        _LOG.debug("PanDA get workflows returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if not status:
            raise RuntimeError(f"Error list PanDA workflow requests: {error}")

        return [req["request_id"] for req in result]

    def cancel(self, wms_id, pass_thru=None):
        # Docstring inherited from BaseWmsService.cancel.
        idds_client = get_idds_client(self.config)
        ret = idds_client.abort(request_id=wms_id)
        _LOG.debug("Abort PanDA workflow returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            _LOG.info("Aborting PanDA workflow %s", result)
            return True, json.dumps(result)

        return False, f"Error abort PanDA workflow: {str(error)}"

    def ping(self, pass_thru=None):
        # Docstring inherited from BaseWmsService.ping.
        idds_client = get_idds_client(self.config)
        ret = idds_client.ping()
        _LOG.debug("Ping PanDA service returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if not status:
            return -1, f"Error ping PanDA service: {str(error)}"

        # The call itself succeeded; the service must also answer "OK".
        if "Status" in result and result["Status"] == "OK":
            return 0, None

        return -1, f"Error ping PanDA service: {str(result)}"

    def run_submission_checks(self):
        # Docstring inherited from BaseWmsService.run_submission_checks.
        for key in ["PANDA_URL"]:
            if key not in os.environ:
                raise OSError(f"Missing environment variable {key}")

        status, message = self.ping()
        if status != 0:
            raise RuntimeError(message)

315 

316 

class PandaBpsWmsWorkflow(BaseWmsWorkflow):
    """Workflow wrapper pairing an iDDS client workflow with staged files.

    Parameters
    ----------
    name : `str`
        Unique name for Workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """

    def __init__(self, name, config=None):
        super().__init__(name, config)
        # Files to copy before submission; presumably source -> destination
        # (the original note reads "src, dest").
        self.files_to_pre_stage = {}
        self.idds_client_workflow = IDDS_client_workflow(name=name)

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited from BaseWmsWorkflow.from_generic_workflow.
        #
        # ``out_prefix`` and ``service_class`` are accepted for interface
        # compatibility but are not used in this method.
        panda_workflow = cls(generic_workflow.name, config)
        idds_workflow = panda_workflow.idds_client_workflow

        # Regular work first; it also reports the sink work and task count
        # that the final work is built from.
        staged, dag_sink_work, task_count = add_idds_work(config, generic_workflow, idds_workflow)
        panda_workflow.files_to_pre_stage.update(staged)

        final_staged = add_final_idds_work(
            config, generic_workflow, idds_workflow, dag_sink_work, task_count + 1, 1
        )
        panda_workflow.files_to_pre_stage.update(final_staged)

        return panda_workflow

    def write(self, out_prefix):
        # Docstring inherited from BaseWmsWorkflow.write.
        #
        # The whole workflow object is pickled into a fixed file name under
        # ``out_prefix``.
        pickle_path = os.path.join(out_prefix, "panda_workflow.pickle")
        with open(pickle_path, "wb") as stream:
            pickle.dump(self, stream)