Coverage for python/lsst/ctrl/bps/panda/panda_service.py: 10%

191 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-04-20 02:54 -0700

1# This file is part of ctrl_bps_panda. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <https://www.gnu.org/licenses/>. 

27"""Interface between generic workflow and PanDA/iDDS workflow system.

28""" 

29 

30 

# Public names of this module: the two classes defined below.
31__all__ = ["PanDAService", "PandaBpsWmsWorkflow"] 

32 

33 

34import json 

35import logging 

36import os 

37import pickle 

38import re 

39 

40from idds.workflowv2.workflow import Workflow as IDDS_client_workflow 

41from lsst.ctrl.bps import BaseWmsService, BaseWmsWorkflow, WmsRunReport, WmsStates 

42from lsst.ctrl.bps.panda.constants import PANDA_DEFAULT_MAX_COPY_WORKERS 

43from lsst.ctrl.bps.panda.utils import ( 

44 add_final_idds_work, 

45 add_idds_work, 

46 copy_files_for_distribution, 

47 create_idds_build_workflow, 

48 get_idds_client, 

49 get_idds_result, 

50) 

51 

# Module-level logger, named after this module through ``__name__``.
52_LOG = logging.getLogger(__name__) 

53 

54 

class PanDAService(BaseWmsService):
    """PanDA version of WMS service.

    Methods that contact the service do so through an iDDS client created
    from ``self.config`` by ``get_idds_client``.
    """

    # Workflow state reported for each iDDS request status name. Any status
    # name that is not listed here is reported as WmsStates.RUNNING.
    _WORKFLOW_STATES = {
        "Finished": WmsStates.SUCCEEDED,
        "SubFinished": WmsStates.SUCCEEDED,
        "Failed": WmsStates.FAILED,
        "Expired": WmsStates.FAILED,
        "Cancelled": WmsStates.DELETED,
        "Suspended": WmsStates.HELD,
    }

    # States in which the jobs of a task are counted, keyed by the iDDS
    # status name of the task. Tasks with any other status contribute zero
    # jobs to every state.
    # The workflow is in status WmsStates.FAILED when:
    #      All tasks have failed.
    # SubFinished tasks has jobs in
    #      output_processed_files: Finished
    #      output_failed_files: Failed
    #      output_missing_files: Missing
    _TASK_STATES = {
        "Finished": (WmsStates.SUCCEEDED,),
        "SubFinished": (
            WmsStates.SUCCEEDED,
            WmsStates.FAILED,
            WmsStates.PRUNED,
        ),
        "Transforming": (
            WmsStates.RUNNING,
            WmsStates.SUCCEEDED,
            WmsStates.FAILED,
            WmsStates.UNREADY,
            WmsStates.PRUNED,
        ),
        "Failed": (WmsStates.FAILED, WmsStates.PRUNED),
    }

    # Key of the iDDS task record that holds the job count of each state.
    _STATE_COUNT_KEYS = {
        WmsStates.SUCCEEDED: "output_processed_files",
        WmsStates.RUNNING: "output_processing_files",
        WmsStates.FAILED: "output_failed_files",
        WmsStates.UNREADY: "input_new_files",
        WmsStates.PRUNED: "output_missing_files",
    }

    @staticmethod
    def _get_request_id(ret):
        """Extract the request id from an iDDS submission response.

        Parameters
        ----------
        ret : `object`
            Value returned by the iDDS client submit call; it is handed
            unchanged to ``get_idds_result``.

        Returns
        -------
        request_id : `int`
            Request id reported for the submission.

        Raises
        ------
        RuntimeError
            Raised if the response reports that the submission failed.
        """
        status, result, error = get_idds_result(ret)
        if not status:
            raise RuntimeError(f"Error submitting to PanDA service: {error}")
        return int(result)

    @staticmethod
    def _get_exit_codes(idds_client, wms_workflow_id, task):
        """Retrieve the non-zero exit codes of the jobs of one task.

        Parameters
        ----------
        idds_client : `object`
            iDDS client used to query the job details.
        wms_workflow_id : `int` or `str`
            Id of the workflow (iDDS request) the task belongs to.
        task : `dict`
            Task record as returned by the iDDS client; only its
            ``"transform_workload_id"`` entry is read here.

        Returns
        -------
        exit_codes : `list`
            Values of ``"trans_exit_code"`` of all jobs for which that
            value is neither `None` nor equal to zero.

        Raises
        ------
        RuntimeError
            Raised if the query fails or if the returned job information
            does not consist of exactly one entry.
        """
        transform_workload_id = task["transform_workload_id"]
        new_ret = idds_client.get_contents_output_ext(
            request_id=wms_workflow_id, workload_id=transform_workload_id
        )
        _LOG.debug("PanDA get task %s detail returned = %s", transform_workload_id, str(new_ret))

        request_status = new_ret[0]
        if request_status != 0:
            raise RuntimeError(f"Error to get workflow status: {new_ret} for id: {wms_workflow_id}")

        # task_info is a dictionary of len 1 that contains
        # a list of dicts containing panda job info
        task_info = new_ret[1][1]
        if len(task_info) != 1:
            err_msg = "Unexpected job return from PanDA: "
            err_msg += f"{task_info} for id: {transform_workload_id}"
            raise RuntimeError(err_msg)
        (wmsjobs,) = task_info.values()

        return [
            wmsjob["trans_exit_code"]
            for wmsjob in wmsjobs
            if wmsjob["trans_exit_code"] is not None and int(wmsjob["trans_exit_code"]) != 0
        ]

    def prepare(self, config, generic_workflow, out_prefix=None):
        # Docstring inherited from BaseWmsService.prepare.
        _LOG.debug("out_prefix = '%s'", out_prefix)
        workflow = PandaBpsWmsWorkflow.from_generic_workflow(
            config, generic_workflow, out_prefix, f"{self.__class__.__module__}.{self.__class__.__name__}"
        )
        workflow.write(out_prefix)
        return workflow

    def submit(self, workflow, **kwargs):
        # Docstring inherited from BaseWmsService.submit.
        config = kwargs.get("config")
        remote_build = kwargs.get("remote_build")

        if config and remote_build:
            # Remote build: a build workflow created from the keyword
            # arguments is submitted instead of ``workflow`` and it is that
            # build workflow which is returned to the caller.
            _LOG.info("remote build")

            idds_build_workflow = create_idds_build_workflow(**kwargs)
            idds_client = get_idds_client(self.config)
            ret = idds_client.submit_build(idds_build_workflow, username=None, use_dataset_name=False)
            _LOG.debug("iDDS client manager submit returned = %s", ret)

            # Check submission success
            request_id = self._get_request_id(ret)

            _LOG.info("Submitted into iDDs with request id=%s", request_id)
            idds_build_workflow.run_id = request_id
            return idds_build_workflow

        _, max_copy_workers = self.config.search(
            "maxCopyWorkers", opt={"default": PANDA_DEFAULT_MAX_COPY_WORKERS}
        )
        file_distribution_uri = self.config["fileDistributionEndPoint"]
        # Use the default end point if the configured one refers to
        # LSST_RUN_TEMP_SPACE while that environment variable is not set.
        lsst_temp = "LSST_RUN_TEMP_SPACE"
        if lsst_temp in file_distribution_uri and lsst_temp not in os.environ:
            file_distribution_uri = self.config["fileDistributionEndPointDefault"]
        copy_files_for_distribution(workflow.files_to_pre_stage, file_distribution_uri, max_copy_workers)

        idds_client = get_idds_client(self.config)
        ret = idds_client.submit(workflow.idds_client_workflow, username=None, use_dataset_name=False)
        _LOG.debug("iDDS client manager submit returned = %s", ret)

        # Check submission success
        request_id = self._get_request_id(ret)

        _LOG.info("Submitted into iDDs with request id=%s", request_id)
        workflow.run_id = request_id

    def restart(self, wms_workflow_id):
        # Docstring inherited from BaseWmsService.restart.
        idds_client = get_idds_client(self.config)
        ret = idds_client.retry(request_id=wms_workflow_id)
        _LOG.debug("Restart PanDA workflow returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            _LOG.info("Restarting PanDA workflow %s", result)
            return wms_workflow_id, None, json.dumps(result)

        return None, None, f"Error retry PanDA workflow: {error!s}"

    def report(
        self,
        wms_workflow_id=None,
        user=None,
        hist=0,
        pass_thru=None,
        is_global=False,
        return_exit_codes=False,
    ):
        # Docstring inherited from BaseWmsService.report.
        message = ""
        run_reports = []

        if not wms_workflow_id:
            message = "Run summary not implemented yet, use 'bps report --id <workflow_id>' instead"
            return run_reports, message

        idds_client = get_idds_client(self.config)
        ret = idds_client.get_requests(request_id=wms_workflow_id, with_detail=True)
        _LOG.debug("PanDA get workflow status returned = %s", str(ret))

        request_status = ret[0]
        if request_status != 0:
            raise RuntimeError(f"Error to get workflow status: {ret} for id: {wms_workflow_id}")

        tasks = ret[1][1]
        if not tasks:
            message = f"No records found for workflow id '{wms_workflow_id}'. Hint: double check the id"
            return run_reports, message

        # Workflow-level information is taken from the first record.
        head = tasks[0]
        wms_report = WmsRunReport(
            wms_id=str(head["request_id"]),
            operator=head["username"],
            project="",
            campaign="",
            payload="",
            run=head["name"],
            state=WmsStates.UNKNOWN,
            total_number_jobs=0,
            job_state_counts={state: 0 for state in WmsStates},
            job_summary={},
            run_summary="",
            # Replaced by the per-task dictionary once all tasks are processed.
            exit_code_summary={},
        )

        workflow_status = head["status"]["attributes"]["_name_"]
        wms_report.state = self._WORKFLOW_STATES.get(workflow_status, WmsStates.RUNNING)

        # Order tasks by workload id; fall back to the transform id when the
        # workload ids are missing or cannot be compared with each other.
        try:
            tasks.sort(key=lambda x: x["transform_workload_id"])
        except (KeyError, TypeError):
            tasks.sort(key=lambda x: x["transform_id"])

        # The run name is matched literally: escaping keeps characters that
        # are special in regular expressions from altering the match.
        run_prefix = re.compile(re.escape(f"{wms_report.run}_"))

        exit_codes_all = {}
        # Loop over all tasks data returned by idds_client
        for task in tasks:
            if task["transform_id"] is None:
                # Not created task (It happens because of an outer join
                # between requests table and transforms table).
                continue

            totaljobs = task["output_total_files"]
            wms_report.total_number_jobs += totaljobs
            tasklabel = run_prefix.sub("", task["transform_name"])
            status = task["transform_status"]["attributes"]["_name_"]

            # if the state is failed, gather exit code information
            # (tasks whose name starts with "build_task" are not queried).
            exit_codes = []
            if status in ("SubFinished", "Failed"):
                transform_name = task["transform_name"]
                if not (transform_name and transform_name.startswith("build_task")):
                    exit_codes = self._get_exit_codes(idds_client, wms_workflow_id, task)
            # NOTE(review): the reviewed listing had lost its indentation;
            # recording an entry for every reported task (possibly an empty
            # list) is the assumed original nesting - confirm.
            exit_codes_all[tasklabel] = exit_codes

            # Fill number of jobs in all WmsStates
            taskstatus = {}
            mapped_states = self._TASK_STATES.get(status, ())
            for state in WmsStates:
                njobs = 0
                if state in mapped_states and state in self._STATE_COUNT_KEYS:
                    count = task[self._STATE_COUNT_KEYS[state]]
                    if count is not None:
                        njobs = count
                    if state == WmsStates.RUNNING:
                        # RUNNING additionally includes the difference
                        # between new output files and new input files.
                        njobs += task["output_new_files"] - task["input_new_files"]
                wms_report.job_state_counts[state] += njobs
                taskstatus[state] = njobs
            wms_report.job_summary[tasklabel] = taskstatus

            # To fill the EXPECTED column
            if wms_report.run_summary:
                wms_report.run_summary += ";"
            wms_report.run_summary += f"{tasklabel}:{totaljobs!s}"

        wms_report.exit_code_summary = exit_codes_all
        run_reports.append(wms_report)

        return run_reports, message

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
        # Docstring inherited from BaseWmsService.list_submitted_jobs.
        if wms_id is None and user is not None:
            raise RuntimeError(
                "Error to get workflow status report: wms_id is required"
                " and filtering workflows with 'user' is not supported."
            )

        idds_client = get_idds_client(self.config)
        ret = idds_client.get_requests(request_id=wms_id)
        _LOG.debug("PanDA get workflows returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            return [req["request_id"] for req in result]

        raise RuntimeError(f"Error list PanDA workflow requests: {error}")

    def cancel(self, wms_id, pass_thru=None):
        # Docstring inherited from BaseWmsService.cancel.
        idds_client = get_idds_client(self.config)
        ret = idds_client.abort(request_id=wms_id)
        _LOG.debug("Abort PanDA workflow returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            _LOG.info("Aborting PanDA workflow %s", result)
            return True, json.dumps(result)

        return False, f"Error abort PanDA workflow: {error!s}"

    def ping(self, pass_thru=None):
        # Docstring inherited from BaseWmsService.ping.
        idds_client = get_idds_client(self.config)
        ret = idds_client.ping()
        _LOG.debug("Ping PanDA service returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if not status:
            return -1, f"Error ping PanDA service: {error!s}"

        # A successful call still counts as a failed ping unless the
        # response carries the status "OK".
        if "Status" in result and result["Status"] == "OK":
            return 0, None

        return -1, f"Error ping PanDA service: {result!s}"

    def run_submission_checks(self):
        # Docstring inherited from BaseWmsService.run_submission_checks.
        for key in ["PANDA_URL"]:
            if key not in os.environ:
                raise OSError(f"Missing environment variable {key}")

        status, message = self.ping()
        if status != 0:
            raise RuntimeError(message)

351 

352 

class PandaBpsWmsWorkflow(BaseWmsWorkflow):
    """Workflow representation used by the PanDA plugin.

    It bundles an iDDS client workflow with the files that have to be
    copied before the workflow is submitted.

    Parameters
    ----------
    name : `str`
        Unique name for Workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """

    def __init__(self, name, config=None):
        super().__init__(name, config)
        # iDDS-side workflow object carrying the same name.
        self.idds_client_workflow = IDDS_client_workflow(name=name)
        # Files to copy before submission (src, dest).
        self.files_to_pre_stage = {}

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited from BaseWmsWorkflow.from_generic_workflow.
        panda_workflow = cls(generic_workflow.name, config)
        idds_workflow = panda_workflow.idds_client_workflow

        # Add the regular work and remember the files it needs staged.
        staged_files, dag_sink_work, task_count = add_idds_work(config, generic_workflow, idds_workflow)
        panda_workflow.files_to_pre_stage.update(staged_files)

        # Add the final work, numbered right after the last regular task,
        # and remember its files as well.
        final_files = add_final_idds_work(
            config, generic_workflow, idds_workflow, dag_sink_work, task_count + 1, 1
        )
        panda_workflow.files_to_pre_stage.update(final_files)

        return panda_workflow

    def write(self, out_prefix):
        # Docstring inherited from BaseWmsWorkflow.write.
        # The whole workflow object is pickled into the given directory.
        pickle_path = os.path.join(out_prefix, "panda_workflow.pickle")
        with open(pickle_path, "wb") as stream:
            pickle.dump(self, stream)