Coverage for python/lsst/ctrl/bps/panda/panda_service.py: 14%

147 statements  

« prev     ^ index     » next       coverage.py v7.3.0, created at 2023-09-02 09:51 +0000

1# This file is part of ctrl_bps_panda. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21"""Interface between generic workflow to PanDA/iDDS workflow system. 

22""" 

23 

24 

25__all__ = ["PanDAService", "PandaBpsWmsWorkflow"] 

26 

27 

28import json 

29import logging 

30import os 

31import pickle 

32import re 

33 

34from idds.workflowv2.workflow import Workflow as IDDS_client_workflow 

35from lsst.ctrl.bps import BaseWmsService, BaseWmsWorkflow, WmsRunReport, WmsStates 

36from lsst.ctrl.bps.panda.constants import PANDA_DEFAULT_MAX_COPY_WORKERS 

37from lsst.ctrl.bps.panda.utils import ( 

38 add_final_idds_work, 

39 add_idds_work, 

40 copy_files_for_distribution, 

41 get_idds_client, 

42 get_idds_result, 

43) 

44 

45_LOG = logging.getLogger(__name__) 

46 

47 

class PanDAService(BaseWmsService):
    """PanDA version of WMS service."""

    def prepare(self, config, generic_workflow, out_prefix=None):
        # Docstring inherited from BaseWmsService.prepare.
        _LOG.debug("out_prefix = '%s'", out_prefix)
        # The fully qualified name of this service class is handed to the
        # workflow factory as its ``service_class`` argument.
        service_class = f"{self.__class__.__module__}.{self.__class__.__name__}"
        workflow = PandaBpsWmsWorkflow.from_generic_workflow(
            config, generic_workflow, out_prefix, service_class
        )
        workflow.write(out_prefix)
        return workflow

    def submit(self, workflow):
        # Docstring inherited from BaseWmsService.submit.
        _, max_copy_workers = self.config.search(
            "maxCopyWorkers", opt={"default": PANDA_DEFAULT_MAX_COPY_WORKERS}
        )
        file_distribution_uri = self.config["fileDistributionEndPoint"]
        lsst_temp = "LSST_RUN_TEMP_SPACE"
        # Use the default end point when the configured one refers to
        # LSST_RUN_TEMP_SPACE but that environment variable is not set.
        if lsst_temp in file_distribution_uri and lsst_temp not in os.environ:
            file_distribution_uri = self.config["fileDistributionEndPointDefault"]

        # Files are copied to the distribution end point before the
        # request is submitted to iDDS.
        copy_files_for_distribution(workflow.files_to_pre_stage, file_distribution_uri, max_copy_workers)

        idds_client = get_idds_client(self.config)
        ret = idds_client.submit(workflow.idds_client_workflow, username=None, use_dataset_name=False)
        _LOG.debug("iDDS client manager submit returned = %s", ret)

        # Check submission success
        status, result, error = get_idds_result(ret)
        if not status:
            raise RuntimeError(f"Error submitting to PanDA service: {error}")
        request_id = int(result)

        _LOG.info("Submitted into iDDs with request id=%s", request_id)
        workflow.run_id = request_id

    def restart(self, wms_workflow_id):
        # Docstring inherited from BaseWmsService.restart.
        idds_client = get_idds_client(self.config)
        ret = idds_client.retry(request_id=wms_workflow_id)
        _LOG.debug("Restart PanDA workflow returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            _LOG.info("Restarting PanDA workflow %s", result)
            return wms_workflow_id, None, json.dumps(result)

        # On failure no id is returned; the message carries the error.
        return None, None, f"Error retry PanDA workflow: {error!s}"

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False):
        # Docstring inherited from BaseWmsService.report.
        message = ""
        run_reports = []

        if not wms_workflow_id:
            message = "Run summary not implemented yet, use 'bps report --id <workflow_id>' instead"
            return run_reports, message

        idds_client = get_idds_client(self.config)
        ret = idds_client.get_requests(request_id=wms_workflow_id, with_detail=True)
        _LOG.debug("PanDA get workflow status returned = %s", str(ret))

        # The first element of the reply is treated as a status code;
        # anything other than 0 is an error.
        request_status = ret[0]
        if request_status != 0:
            raise RuntimeError(f"Error to get workflow status: {ret} for id: {wms_workflow_id}")

        tasks = ret[1][1]
        if not tasks:
            message = f"No records found for workflow id '{wms_workflow_id}'. Hint: double check the id"
            return run_reports, message

        # Run-level fields are taken from the first task record.
        head = tasks[0]
        wms_report = WmsRunReport(
            wms_id=str(head["request_id"]),
            operator=head["username"],
            project="",
            campaign="",
            payload="",
            run=head["name"],
            state=WmsStates.UNKNOWN,
            total_number_jobs=0,
            job_state_counts={state: 0 for state in WmsStates},
            job_summary={},
            run_summary="",
        )

        # The status of a task is taken from the first item of state_map.
        # The workflow is in status WmsStates.FAILED when:
        #      All tasks have failed.
        # SubFinished tasks has jobs in
        #      output_processed_files: Finished
        #      output_failed_files: Failed
        #      output_missing_files: Missing
        state_map = {
            "Finished": [WmsStates.SUCCEEDED],
            "SubFinished": [
                WmsStates.SUCCEEDED,
                WmsStates.FAILED,
                WmsStates.PRUNED,
            ],
            "Transforming": [
                WmsStates.RUNNING,
                WmsStates.SUCCEEDED,
                WmsStates.FAILED,
                WmsStates.UNREADY,
                WmsStates.PRUNED,
            ],
            "Failed": [WmsStates.FAILED, WmsStates.PRUNED],
        }

        # Task record field holding the job count for each WMS state.
        file_map = {
            WmsStates.SUCCEEDED: "output_processed_files",
            WmsStates.RUNNING: "output_processing_files",
            WmsStates.FAILED: "output_failed_files",
            WmsStates.UNREADY: "input_new_files",
            WmsStates.PRUNED: "output_missing_files",
        }

        # workflow status to report as SUCCEEDED
        wf_status = ["Finished", "SubFinished", "Transforming"]

        wf_succeed = False

        tasks.sort(key=lambda x: x["transform_workload_id"])

        # The run name is matched literally when stripping it from task
        # names, so regex metacharacters in it are escaped.
        run_prefix = re.escape(wms_report.run + "_")

        # Loop over all tasks data returned by idds_client
        for task in tasks:
            totaljobs = task["output_total_files"]
            wms_report.total_number_jobs += totaljobs
            tasklabel = re.sub(run_prefix, "", task["transform_name"])
            status = task["transform_status"]["attributes"]["_name_"]

            # A task status without an entry in state_map contributes
            # zero jobs to every state instead of raising KeyError.
            mapped_states = state_map.get(status, ())

            taskstatus = {}
            # Fill number of jobs in all WmsStates
            for state in WmsStates:
                njobs = 0
                if state in mapped_states and state in file_map:
                    count = task[file_map[state]]
                    if count is not None:
                        njobs = count
                    if state == WmsStates.RUNNING:
                        njobs += task["output_new_files"] - task["input_new_files"]
                wms_report.job_state_counts[state] += njobs
                taskstatus[state] = njobs
            wms_report.job_summary[tasklabel] = taskstatus

            # To fill the EXPECTED column
            if wms_report.run_summary:
                wms_report.run_summary += ";"
            wms_report.run_summary += f"{tasklabel}:{totaljobs!s}"

            # The workflow state follows the tasks whose status is listed
            # in wf_status; the last such task in sorted order wins.
            if status in wf_status:
                wf_succeed = True
                wms_report.state = state_map[status][0]

        # All tasks have failed, set the workflow FAILED
        if not wf_succeed:
            wms_report.state = WmsStates.FAILED

        run_reports.append(wms_report)

        return run_reports, message

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
        # Docstring inherited from BaseWmsService.list_submitted_jobs.
        if wms_id is None and user is not None:
            raise RuntimeError(
                "Error to get workflow status report: wms_id is required"
                " and filtering workflows with 'user' is not supported."
            )

        idds_client = get_idds_client(self.config)
        ret = idds_client.get_requests(request_id=wms_id)
        _LOG.debug("PanDA get workflows returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if not status:
            raise RuntimeError(f"Error list PanDA workflow requests: {error}")

        return [req["request_id"] for req in result]

    def cancel(self, wms_id, pass_thru=None):
        # Docstring inherited from BaseWmsService.cancel.
        idds_client = get_idds_client(self.config)
        ret = idds_client.abort(request_id=wms_id)
        _LOG.debug("Abort PanDA workflow returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            _LOG.info("Aborting PanDA workflow %s", result)
            return True, json.dumps(result)

        return False, f"Error abort PanDA workflow: {error!s}"

    def ping(self, pass_thru=None):
        # Docstring inherited from BaseWmsService.ping.
        idds_client = get_idds_client(self.config)
        ret = idds_client.ping()
        _LOG.debug("Ping PanDA service returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            # Only a reply whose "Status" entry is "OK" counts as success.
            if "Status" in result and result["Status"] == "OK":
                return 0, None

            return -1, f"Error ping PanDA service: {result!s}"

        return -1, f"Error ping PanDA service: {error!s}"

    def run_submission_checks(self):
        # Docstring inherited from BaseWmsService.run_submission_checks.
        for key in ["PANDA_URL"]:
            if key not in os.environ:
                raise OSError(f"Missing environment variable {key}")

        # A non-zero ping status is turned into an exception.
        status, message = self.ping()
        if status != 0:
            raise RuntimeError(message)

270 

271 

class PandaBpsWmsWorkflow(BaseWmsWorkflow):
    """Workflow wrapper pairing a BPS workflow with an iDDS client workflow.

    Parameters
    ----------
    name : `str`
        Unique name for Workflow
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information
    """

    def __init__(self, name, config=None):
        super().__init__(name, config)
        # iDDS workflow object that the service later submits.
        self.idds_client_workflow = IDDS_client_workflow(name=name)
        # Mapping of files to copy before submission (src, dest).
        self.files_to_pre_stage = {}

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited from BaseWmsWorkflow.from_generic_workflow.
        instance = cls(generic_workflow.name, config)
        idds_workflow = instance.idds_client_workflow

        # First pass: add the work for the generic workflow and collect
        # the files it reports.
        staged, dag_sink_work, task_count = add_idds_work(config, generic_workflow, idds_workflow)
        instance.files_to_pre_stage.update(staged)

        # Second pass: add the final work, chained after the sink work
        # returned by the first pass.
        final_staged = add_final_idds_work(
            config, generic_workflow, idds_workflow, dag_sink_work, task_count + 1, 1
        )
        instance.files_to_pre_stage.update(final_staged)

        return instance

    def write(self, out_prefix):
        # Docstring inherited from BaseWmsWorkflow.write.
        pickle_path = os.path.join(out_prefix, "panda_workflow.pickle")
        with open(pickle_path, "wb") as stream:
            pickle.dump(self, stream)

308 pickle.dump(self, fh)