Coverage for python/lsst/ctrl/bps/panda/panda_service.py: 14%

147 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-11-29 09:35 +0000

1# This file is part of ctrl_bps_panda. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <https://www.gnu.org/licenses/>. 

27"""Interface between generic workflow to PanDA/iDDS workflow system. 

28""" 

29 

30 

31__all__ = ["PanDAService", "PandaBpsWmsWorkflow"] 

32 

33 

34import json 

35import logging 

36import os 

37import pickle 

38import re 

39 

40from idds.workflowv2.workflow import Workflow as IDDS_client_workflow 

41from lsst.ctrl.bps import BaseWmsService, BaseWmsWorkflow, WmsRunReport, WmsStates 

42from lsst.ctrl.bps.panda.constants import PANDA_DEFAULT_MAX_COPY_WORKERS 

43from lsst.ctrl.bps.panda.utils import ( 

44 add_final_idds_work, 

45 add_idds_work, 

46 copy_files_for_distribution, 

47 get_idds_client, 

48 get_idds_result, 

49) 

50 

51_LOG = logging.getLogger(__name__) 

52 

53 

class PanDAService(BaseWmsService):
    """PanDA version of WMS service.

    Every operation obtains an iDDS client via ``get_idds_client`` using this
    service's configuration and forwards the request to it.
    """

    def prepare(self, config, generic_workflow, out_prefix=None):
        # Docstring inherited from BaseWmsService.prepare.
        _LOG.debug("out_prefix = '%s'", out_prefix)
        workflow = PandaBpsWmsWorkflow.from_generic_workflow(
            config, generic_workflow, out_prefix, f"{self.__class__.__module__}.{self.__class__.__name__}"
        )
        workflow.write(out_prefix)
        return workflow

    def submit(self, workflow):
        # Docstring inherited from BaseWmsService.submit.
        _, max_copy_workers = self.config.search(
            "maxCopyWorkers", opt={"default": PANDA_DEFAULT_MAX_COPY_WORKERS}
        )
        file_distribution_uri = self.config["fileDistributionEndPoint"]
        lsst_temp = "LSST_RUN_TEMP_SPACE"
        # The configured end point mentions LSST_RUN_TEMP_SPACE but that
        # variable is not set in the environment: use the configured default.
        if lsst_temp in file_distribution_uri and lsst_temp not in os.environ:
            file_distribution_uri = self.config["fileDistributionEndPointDefault"]

        copy_files_for_distribution(workflow.files_to_pre_stage, file_distribution_uri, max_copy_workers)

        idds_client = get_idds_client(self.config)
        ret = idds_client.submit(workflow.idds_client_workflow, username=None, use_dataset_name=False)
        _LOG.debug("iDDS client manager submit returned = %s", ret)

        # Check submission success
        status, result, error = get_idds_result(ret)
        if not status:
            raise RuntimeError(f"Error submitting to PanDA service: {error}")
        request_id = int(result)

        _LOG.info("Submitted into iDDs with request id=%s", request_id)
        workflow.run_id = request_id

    def restart(self, wms_workflow_id):
        # Docstring inherited from BaseWmsService.restart.
        idds_client = get_idds_client(self.config)
        ret = idds_client.retry(request_id=wms_workflow_id)
        _LOG.debug("Restart PanDA workflow returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            _LOG.info("Restarting PanDA workflow %s", result)
            return wms_workflow_id, None, json.dumps(result)

        return None, None, f"Error retry PanDA workflow: {str(error)}"

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False):
        # Docstring inherited from BaseWmsService.report.
        message = ""
        run_reports = []

        if not wms_workflow_id:
            message = "Run summary not implemented yet, use 'bps report --id <workflow_id>' instead"
            return run_reports, message

        idds_client = get_idds_client(self.config)
        ret = idds_client.get_requests(request_id=wms_workflow_id, with_detail=True)
        _LOG.debug("PanDA get workflow status returned = %s", str(ret))

        request_status = ret[0]
        if request_status != 0:
            raise RuntimeError(f"Error to get workflow status: {ret} for id: {wms_workflow_id}")

        # NOTE(review): the task records are read from ret[1][1]; the exact
        # layout of the client reply is not visible here - confirm against
        # the iDDS client.
        tasks = ret[1][1]
        if not tasks:
            message = f"No records found for workflow id '{wms_workflow_id}'. Hint: double check the id"
            return run_reports, message

        head = tasks[0]
        wms_report = WmsRunReport(
            wms_id=str(head["request_id"]),
            operator=head["username"],
            project="",
            campaign="",
            payload="",
            run=head["name"],
            state=WmsStates.UNKNOWN,
            total_number_jobs=0,
            job_state_counts={state: 0 for state in WmsStates},
            job_summary={},
            run_summary="",
        )

        # The status of a task is taken from the first item of state_map.
        # The workflow is in status WmsStates.FAILED when:
        #      All tasks have failed.
        # SubFinished tasks has jobs in
        #      output_processed_files: Finished
        #      output_failed_files: Failed
        #      output_missing_files: Missing
        state_map = {
            "Finished": [WmsStates.SUCCEEDED],
            "SubFinished": [
                WmsStates.SUCCEEDED,
                WmsStates.FAILED,
                WmsStates.PRUNED,
            ],
            "Transforming": [
                WmsStates.RUNNING,
                WmsStates.SUCCEEDED,
                WmsStates.FAILED,
                WmsStates.UNREADY,
                WmsStates.PRUNED,
            ],
            "Failed": [WmsStates.FAILED, WmsStates.PRUNED],
        }

        # Task record field holding the job count for each reported state.
        file_map = {
            WmsStates.SUCCEEDED: "output_processed_files",
            WmsStates.RUNNING: "output_processing_files",
            WmsStates.FAILED: "output_failed_files",
            WmsStates.UNREADY: "input_new_files",
            WmsStates.PRUNED: "output_missing_files",
        }

        # workflow status to report as SUCCEEDED
        wf_status = ["Finished", "SubFinished", "Transforming"]

        wf_succeed = False
        # Set when a task carries a status missing from state_map; such a
        # task is neither a success nor a known failure.
        unmapped_status_seen = False

        # The run name is literal text, not a pattern: escape it so that
        # regex metacharacters in the name cannot alter or break the match.
        run_prefix = re.escape(wms_report.run + "_")

        # Parts of the EXPECTED column, joined with ";" after the loop.
        summary_parts = []

        tasks.sort(key=lambda x: x["transform_workload_id"])

        # Loop over all tasks data returned by idds_client
        for task in tasks:
            totaljobs = task["output_total_files"]
            wms_report.total_number_jobs += totaljobs
            tasklabel = re.sub(run_prefix, "", task["transform_name"])
            status = task["transform_status"]["attributes"]["_name_"]

            # A status outside state_map contributes zero jobs to every state
            # instead of aborting the whole report with a KeyError.
            mapped_states = state_map.get(status)
            if mapped_states is None:
                _LOG.debug("Transform status '%s' has no WmsStates mapping", status)
                unmapped_status_seen = True
                mapped_states = []

            taskstatus = {}
            # Fill number of jobs in all WmsStates
            for state in WmsStates:
                njobs = 0
                if state in mapped_states and state in file_map:
                    count = task[file_map[state]]
                    if count is not None:
                        njobs = count
                    if state == WmsStates.RUNNING:
                        njobs += task["output_new_files"] - task["input_new_files"]
                wms_report.job_state_counts[state] += njobs
                taskstatus[state] = njobs
            wms_report.job_summary[tasklabel] = taskstatus

            # To fill the EXPECTED column
            summary_parts.append(f"{tasklabel}:{str(totaljobs)}")

            if status in wf_status:
                wf_succeed = True
                wms_report.state = state_map[status][0]

        wms_report.run_summary = ";".join(summary_parts)

        # All tasks have failed, set the workflow FAILED.  When a status
        # without a mapping was seen this cannot be concluded, so the state
        # is left as WmsStates.UNKNOWN.
        if not wf_succeed and not unmapped_status_seen:
            wms_report.state = WmsStates.FAILED

        run_reports.append(wms_report)

        return run_reports, message

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
        # Docstring inherited from BaseWmsService.list_submitted_jobs.
        if wms_id is None and user is not None:
            raise RuntimeError(
                "Error to get workflow status report: wms_id is required"
                " and filtering workflows with 'user' is not supported."
            )

        idds_client = get_idds_client(self.config)
        ret = idds_client.get_requests(request_id=wms_id)
        _LOG.debug("PanDA get workflows returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            req_ids = [req["request_id"] for req in result]
            return req_ids

        raise RuntimeError(f"Error list PanDA workflow requests: {error}")

    def cancel(self, wms_id, pass_thru=None):
        # Docstring inherited from BaseWmsService.cancel.
        idds_client = get_idds_client(self.config)
        ret = idds_client.abort(request_id=wms_id)
        _LOG.debug("Abort PanDA workflow returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            _LOG.info("Aborting PanDA workflow %s", result)
            return True, json.dumps(result)

        return False, f"Error abort PanDA workflow: {str(error)}"

    def ping(self, pass_thru=None):
        # Docstring inherited from BaseWmsService.ping.
        idds_client = get_idds_client(self.config)
        ret = idds_client.ping()
        _LOG.debug("Ping PanDA service returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            # Only a reply whose "Status" entry is "OK" counts as success.
            if "Status" in result and result["Status"] == "OK":
                return 0, None

            return -1, f"Error ping PanDA service: {str(result)}"

        return -1, f"Error ping PanDA service: {str(error)}"

    def run_submission_checks(self):
        # Docstring inherited from BaseWmsService.run_submission_checks.
        for key in ["PANDA_URL"]:
            if key not in os.environ:
                raise OSError(f"Missing environment variable {key}")

        # A non-zero ping status means the service is not usable.
        status, message = self.ping()
        if status != 0:
            raise RuntimeError(message)

276 

277 

class PandaBpsWmsWorkflow(BaseWmsWorkflow):
    """Workflow representation for a single PanDA submission.

    Parameters
    ----------
    name : `str`
        Unique name for the workflow; also used to name the iDDS client
        workflow created alongside it.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """

    def __init__(self, name, config=None):
        super().__init__(name, config)
        # src, dest pairs of the files to copy before submission.
        self.files_to_pre_stage = {}
        self.idds_client_workflow = IDDS_client_workflow(name=name)

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited from BaseWmsWorkflow.from_generic_workflow.
        instance = cls(generic_workflow.name, config)
        idds_workflow = instance.idds_client_workflow
        pre_stage = instance.files_to_pre_stage

        # Add the regular work first; it reports the files to stage, the
        # DAG sink work and the number of tasks it created.
        work_files, sink_work, n_tasks = add_idds_work(config, generic_workflow, idds_workflow)
        pre_stage.update(work_files)

        # Then add the final work, which is given the sink work and the
        # next task number.
        final_files = add_final_idds_work(config, generic_workflow, idds_workflow, sink_work, n_tasks + 1, 1)
        pre_stage.update(final_files)

        return instance

    def write(self, out_prefix):
        # Docstring inherited from BaseWmsWorkflow.write.
        pickle_path = os.path.join(out_prefix, "panda_workflow.pickle")
        with open(pickle_path, "wb") as stream:
            pickle.dump(self, stream)