Coverage for python/lsst/ctrl/bps/panda/panda_service.py: 13%

143 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-04-04 02:17 -0700

1# This file is part of ctrl_bps_panda. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21"""Interface between generic workflow and PanDA/iDDS workflow system. 

22""" 

23 

24 

# Public names exported by this module.
25__all__ = ["PanDAService", "PandaBpsWmsWorkflow"] 

26 

27 

28import json 

29import logging 

30import os 

31import pickle 

32import re 

33 

34from idds.workflowv2.workflow import Workflow as IDDS_client_workflow 

35from lsst.ctrl.bps.panda.constants import PANDA_DEFAULT_MAX_COPY_WORKERS 

36from lsst.ctrl.bps.panda.utils import ( 

37 add_final_idds_work, 

38 add_idds_work, 

39 copy_files_for_distribution, 

40 get_idds_client, 

41 get_idds_result, 

42) 

43from lsst.ctrl.bps.wms_service import BaseWmsService, BaseWmsWorkflow, WmsRunReport, WmsStates 

44 

# Module-level logger, named after this module.
45_LOG = logging.getLogger(__name__) 

46 

47 

class PanDAService(BaseWmsService):
    """PanDA version of WMS service."""

    def prepare(self, config, generic_workflow, out_prefix=None):
        # Docstring inherited from BaseWmsService.prepare.
        _LOG.debug("out_prefix = '%s'", out_prefix)
        # The fully qualified name of this service class is handed to the
        # workflow factory as its ``service_class`` argument.
        workflow = PandaBpsWmsWorkflow.from_generic_workflow(
            config, generic_workflow, out_prefix, f"{self.__class__.__module__}.{self.__class__.__name__}"
        )
        workflow.write(out_prefix)
        return workflow

    def submit(self, workflow):
        # Docstring inherited from BaseWmsService.submit.
        _, max_copy_workers = self.config.search(
            "maxCopyWorkers", opt={"default": PANDA_DEFAULT_MAX_COPY_WORKERS}
        )
        # Pre-stage the workflow's files before the request is submitted.
        copy_files_for_distribution(
            workflow.files_to_pre_stage, self.config["fileDistributionEndPoint"], max_copy_workers
        )

        idds_client = get_idds_client(self.config)
        ret = idds_client.submit(workflow.idds_client_workflow, username=None, use_dataset_name=False)
        _LOG.debug("iDDS client manager submit returned = %s", ret)

        # Check submission success
        status, result, error = get_idds_result(ret)
        if not status:
            raise RuntimeError(f"Error submitting to PanDA service: {error}")
        request_id = int(result)

        _LOG.info("Submitted into iDDs with request id=%s", request_id)
        workflow.run_id = request_id

    def restart(self, wms_workflow_id):
        # Docstring inherited from BaseWmsService.restart.
        idds_client = get_idds_client(self.config)
        ret = idds_client.retry(request_id=wms_workflow_id)
        _LOG.debug("Restart PanDA workflow returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            _LOG.info("Restarting PanDA workflow %s", result)
            return wms_workflow_id, None, json.dumps(result)

        return None, None, f"Error retry PanDA workflow: {str(error)}"

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False):
        # Docstring inherited from BaseWmsService.report.
        message = ""
        run_reports = []

        if not wms_workflow_id:
            message = "Run summary not implemented yet, use 'bps report --id <workflow_id>' instead"
            return run_reports, message

        idds_client = get_idds_client(self.config)
        ret = idds_client.get_requests(request_id=wms_workflow_id, with_detail=True)
        _LOG.debug("PanDA get workflow status returned = %s", str(ret))

        request_status = ret[0]
        if request_status != 0:
            raise RuntimeError(f"Error to get workflow status: {ret} for id: {wms_workflow_id}")

        tasks = ret[1][1]
        if not tasks:
            message = f"No records found for workflow id '{wms_workflow_id}'. Hint: double check the id"
            return run_reports, message

        # Run-level fields are taken from the first returned record.
        head = tasks[0]
        wms_report = WmsRunReport(
            wms_id=str(head["request_id"]),
            operator=head["username"],
            project="",
            campaign="",
            payload="",
            run=head["name"],
            state=WmsStates.UNKNOWN,
            total_number_jobs=0,
            job_state_counts={state: 0 for state in WmsStates},
            job_summary={},
            run_summary="",
        )

        # The status of a task is taken from the first item of state_map.
        # The workflow is in status WmsStates.FAILED when:
        #      All tasks have failed.
        # SubFinished tasks has jobs in
        #      output_processed_files: Finished
        #      output_failed_files: Failed
        #      output_missing_files: Missing
        state_map = {
            "Finished": [WmsStates.SUCCEEDED],
            "SubFinished": [
                WmsStates.SUCCEEDED,
                WmsStates.FAILED,
                WmsStates.PRUNED,
            ],
            "Transforming": [
                WmsStates.RUNNING,
                WmsStates.SUCCEEDED,
                WmsStates.FAILED,
                WmsStates.UNREADY,
                WmsStates.PRUNED,
            ],
            "Failed": [WmsStates.FAILED, WmsStates.PRUNED],
        }

        # Task record field holding the job count for each WmsStates value.
        file_map = {
            WmsStates.SUCCEEDED: "output_processed_files",
            WmsStates.RUNNING: "output_processing_files",
            WmsStates.FAILED: "output_failed_files",
            WmsStates.UNREADY: "input_new_files",
            WmsStates.PRUNED: "output_missing_files",
        }

        # workflow status to report as SUCCEEDED
        wf_status = ["Finished", "SubFinished", "Transforming"]

        wf_succeed = False

        tasks.sort(key=lambda x: x["transform_workload_id"])

        # The run name is literal text, not a regular expression: escape it
        # so metacharacters in the name cannot alter what gets stripped from
        # the task labels.
        run_prefix = re.escape(wms_report.run + "_")

        # Loop over all tasks data returned by idds_client
        for task in tasks:
            totaljobs = task["output_total_files"]
            wms_report.total_number_jobs += totaljobs
            tasklabel = re.sub(run_prefix, "", task["transform_name"])
            status = task["transform_status"]["attributes"]["_name_"]

            # A status with no entry in state_map contributes zero jobs to
            # every state instead of aborting the whole report with KeyError.
            mapped_states = state_map.get(status, [])

            # Fill number of jobs in all WmsStates
            taskstatus = {}
            for state in WmsStates:
                njobs = 0
                if state in mapped_states and state in file_map:
                    count = task[file_map[state]]
                    if count is not None:
                        njobs = count
                    if state == WmsStates.RUNNING:
                        njobs += task["output_new_files"] - task["input_new_files"]
                wms_report.job_state_counts[state] += njobs
                taskstatus[state] = njobs
            wms_report.job_summary[tasklabel] = taskstatus

            # To fill the EXPECTED column
            if wms_report.run_summary:
                wms_report.run_summary += ";"
            wms_report.run_summary += f"{tasklabel}:{totaljobs}"

            # The run state follows the last task (in sorted order) whose
            # status is one of wf_status.
            if status in wf_status:
                wf_succeed = True
                wms_report.state = state_map[status][0]

        # No task had a status listed in wf_status, set the workflow FAILED
        if not wf_succeed:
            wms_report.state = WmsStates.FAILED

        run_reports.append(wms_report)

        return run_reports, message

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
        # Docstring inherited from BaseWmsService.list_submitted_jobs.
        if wms_id is None and user is not None:
            raise RuntimeError(
                "Error to get workflow status report: wms_id is required"
                " and filtering workflows with 'user' is not supported."
            )

        idds_client = get_idds_client(self.config)
        ret = idds_client.get_requests(request_id=wms_id)
        _LOG.debug("PanDA get workflows returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            req_ids = [req["request_id"] for req in result]
            return req_ids

        raise RuntimeError(f"Error list PanDA workflow requests: {error}")

    def cancel(self, wms_id, pass_thru=None):
        # Docstring inherited from BaseWmsService.cancel.
        idds_client = get_idds_client(self.config)
        ret = idds_client.abort(request_id=wms_id)
        _LOG.debug("Abort PanDA workflow returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            _LOG.info("Aborting PanDA workflow %s", result)
            return True, json.dumps(result)

        return False, f"Error abort PanDA workflow: {str(error)}"

    def ping(self, pass_thru=None):
        # Docstring inherited from BaseWmsService.ping.
        idds_client = get_idds_client(self.config)
        ret = idds_client.ping()
        _LOG.debug("Ping PanDA service returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            # A successful call still counts as a failed ping unless the
            # payload reports "Status" == "OK".
            if "Status" in result and result["Status"] == "OK":
                return 0, None

            return -1, f"Error ping PanDA service: {str(result)}"

        return -1, f"Error ping PanDA service: {str(error)}"

    def run_submission_checks(self):
        # Docstring inherited from BaseWmsService.run_submission_checks.
        for key in ["PANDA_URL"]:
            if key not in os.environ:
                raise OSError(f"Missing environment variable {key}")

        # Ping the service and raise if it does not answer OK.
        status, message = self.ping()
        if status != 0:
            raise RuntimeError(message)

267 

268 

class PandaBpsWmsWorkflow(BaseWmsWorkflow):
    """A single Panda based workflow

    Parameters
    ----------
    name : `str`
        Unique name for Workflow
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information
    """

    def __init__(self, name, config=None):
        super().__init__(name, config)
        # Mapping of files to copy before submission (src, dest).
        self.files_to_pre_stage = {}
        # iDDS client-side workflow object carrying the same name.
        self.idds_client_workflow = IDDS_client_workflow(name=name)

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited from BaseWmsWorkflow.from_generic_workflow.
        instance = cls(generic_workflow.name, config)
        idds_workflow = instance.idds_client_workflow

        # Add the regular work; this also yields the sink work and the task
        # count that the final work is attached to.
        staged, dag_sink_work, task_count = add_idds_work(config, generic_workflow, idds_workflow)
        instance.files_to_pre_stage.update(staged)

        # Add the final work, numbered one past the regular tasks.
        final_staged = add_final_idds_work(
            config, generic_workflow, idds_workflow, dag_sink_work, task_count + 1, 1
        )
        instance.files_to_pre_stage.update(final_staged)

        return instance

    def write(self, out_prefix):
        # Docstring inherited from BaseWmsWorkflow.write.
        pickle_path = os.path.join(out_prefix, "panda_workflow.pickle")
        # Pickle this whole workflow object into the output directory.
        with open(pickle_path, "wb") as stream:
            pickle.dump(self, stream)