Coverage for python/lsst/ctrl/bps/panda/panda_service.py: 12%
163 statements
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-06 12:41 +0000
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-06 12:41 +0000
1# This file is part of ctrl_bps_panda.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <https://www.gnu.org/licenses/>.
27"""Interface between the generic workflow and the PanDA/iDDS workflow system.
28"""
31__all__ = ["PanDAService", "PandaBpsWmsWorkflow"]
34import json
35import logging
36import os
37import pickle
38import re
40from idds.workflowv2.workflow import Workflow as IDDS_client_workflow
41from lsst.ctrl.bps import BaseWmsService, BaseWmsWorkflow, WmsRunReport, WmsStates
42from lsst.ctrl.bps.panda.constants import PANDA_DEFAULT_MAX_COPY_WORKERS
43from lsst.ctrl.bps.panda.utils import (
44 add_final_idds_work,
45 add_idds_work,
46 copy_files_for_distribution,
47 get_idds_client,
48 get_idds_result,
49)
51_LOG = logging.getLogger(__name__)
class PanDAService(BaseWmsService):
    """PanDA version of WMS service.

    Every operation obtains an iDDS client from ``get_idds_client`` using
    this service's configuration and decodes the client's return value
    either with ``get_idds_result`` or, in `report`, by indexing the raw
    return value directly.
    """

    def prepare(self, config, generic_workflow, out_prefix=None):
        # Docstring inherited from BaseWmsService.prepare.
        #
        # Builds a PandaBpsWmsWorkflow from the generic workflow, writes it
        # under ``out_prefix`` and returns it.
        _LOG.debug("out_prefix = '%s'", out_prefix)
        workflow = PandaBpsWmsWorkflow.from_generic_workflow(
            config, generic_workflow, out_prefix, f"{self.__class__.__module__}.{self.__class__.__name__}"
        )
        workflow.write(out_prefix)
        return workflow

    def submit(self, workflow):
        # Docstring inherited from BaseWmsService.submit.
        #
        # Copies the workflow's pre-stage files to the distribution end
        # point, submits the iDDS workflow and stores the returned request
        # id in ``workflow.run_id``.  Raises RuntimeError when the iDDS
        # result reports a failed submission.
        _, max_copy_workers = self.config.search(
            "maxCopyWorkers", opt={"default": PANDA_DEFAULT_MAX_COPY_WORKERS}
        )

        # Use the default end point when the configured URI mentions
        # LSST_RUN_TEMP_SPACE but that variable is not set in the
        # environment.
        file_distribution_uri = self.config["fileDistributionEndPoint"]
        lsst_temp = "LSST_RUN_TEMP_SPACE"
        if lsst_temp in file_distribution_uri and lsst_temp not in os.environ:
            file_distribution_uri = self.config["fileDistributionEndPointDefault"]

        copy_files_for_distribution(workflow.files_to_pre_stage, file_distribution_uri, max_copy_workers)

        idds_client = get_idds_client(self.config)
        ret = idds_client.submit(workflow.idds_client_workflow, username=None, use_dataset_name=False)
        _LOG.debug("iDDS client manager submit returned = %s", ret)

        # Check submission success
        status, result, error = get_idds_result(ret)
        if not status:
            raise RuntimeError(f"Error submitting to PanDA service: {error}")
        request_id = int(result)

        _LOG.info("Submitted into iDDs with request id=%s", request_id)
        workflow.run_id = request_id

    def restart(self, wms_workflow_id):
        # Docstring inherited from BaseWmsService.restart.
        #
        # Returns ``(wms_workflow_id, None, <JSON result>)`` on success and
        # ``(None, None, <error message>)`` otherwise; it does not raise on
        # a failed retry.
        idds_client = get_idds_client(self.config)
        ret = idds_client.retry(request_id=wms_workflow_id)
        _LOG.debug("Restart PanDA workflow returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            _LOG.info("Restarting PanDA workflow %s", result)
            return wms_workflow_id, None, json.dumps(result)

        return None, None, f"Error retry PanDA workflow: {str(error)}"

    def report(
        self,
        wms_workflow_id=None,
        user=None,
        hist=0,
        pass_thru=None,
        is_global=False,
        return_exit_codes=False,
    ):
        # Docstring inherited from BaseWmsService.report.
        #
        # Only ``wms_workflow_id`` is used by this implementation; the other
        # arguments are accepted for interface compatibility and ignored.
        # Returns ``(run_reports, message)`` where ``run_reports`` holds at
        # most one WmsRunReport.  Raises RuntimeError when an iDDS query
        # returns a non-zero status or an unexpected job listing.
        message = ""
        run_reports = []

        if not wms_workflow_id:
            message = "Run summary not implemented yet, use 'bps report --id <workflow_id>' instead"
            return run_reports, message

        idds_client = get_idds_client(self.config)
        ret = idds_client.get_requests(request_id=wms_workflow_id, with_detail=True)
        _LOG.debug("PanDA get workflow status returned = %s", str(ret))

        request_status = ret[0]
        if request_status != 0:
            raise RuntimeError(f"Error to get workflow status: {ret} for id: {wms_workflow_id}")

        tasks = ret[1][1]
        if not tasks:
            message = f"No records found for workflow id '{wms_workflow_id}'. Hint: double check the id"
            return run_reports, message

        # Run-level fields are taken from the first returned record.
        head = tasks[0]
        wms_report = WmsRunReport(
            wms_id=str(head["request_id"]),
            operator=head["username"],
            project="",
            campaign="",
            payload="",
            run=head["name"],
            state=WmsStates.UNKNOWN,
            total_number_jobs=0,
            job_state_counts={state: 0 for state in WmsStates},
            job_summary={},
            run_summary="",
            exit_code_summary=[],
        )

        # Maps a transform status name to the WmsStates that are counted for
        # a task in that status.  The first entry of each list is also the
        # state reported for the workflow.
        # The workflow is reported as WmsStates.FAILED when no task has a
        # status listed in ``wf_status`` below.
        # SubFinished tasks have their jobs counted from:
        #     output_processed_files: Finished
        #     output_failed_files: Failed
        #     output_missing_files: Missing
        state_map = {
            "Finished": [WmsStates.SUCCEEDED],
            "SubFinished": [
                WmsStates.SUCCEEDED,
                WmsStates.FAILED,
                WmsStates.PRUNED,
            ],
            "Transforming": [
                WmsStates.RUNNING,
                WmsStates.SUCCEEDED,
                WmsStates.FAILED,
                WmsStates.UNREADY,
                WmsStates.PRUNED,
            ],
            "Failed": [WmsStates.FAILED, WmsStates.PRUNED],
        }

        # Task record key holding the job count for each WmsStates value.
        file_map = {
            WmsStates.SUCCEEDED: "output_processed_files",
            WmsStates.RUNNING: "output_processing_files",
            WmsStates.FAILED: "output_failed_files",
            WmsStates.UNREADY: "input_new_files",
            WmsStates.PRUNED: "output_missing_files",
        }

        # Task statuses that keep the workflow from being reported as FAILED.
        wf_status = ["Finished", "SubFinished", "Transforming"]

        wf_succeed = False

        tasks.sort(key=lambda x: x["transform_workload_id"])

        # The run name is data, not a pattern: escape it so that regex
        # metacharacters in the name are matched literally.
        run_prefix = re.escape(wms_report.run + "_")

        exit_codes_all = {}
        # Loop over all tasks data returned by idds_client
        for task in tasks:
            exit_codes = []
            totaljobs = task["output_total_files"]
            wms_report.total_number_jobs += totaljobs
            tasklabel = re.sub(run_prefix, "", task["transform_name"])
            status = task["transform_status"]["attributes"]["_name_"]
            taskstatus = {}

            # For tasks with failures, gather the non-zero exit codes.
            if status in ["SubFinished", "Failed"]:
                transform_workload_id = task["transform_workload_id"]
                new_ret = idds_client.get_contents_output_ext(
                    request_id=wms_workflow_id, workload_id=transform_workload_id
                )
                request_status = new_ret[0]
                if request_status != 0:
                    raise RuntimeError(f"Error to get workflow status: {new_ret} for id: {wms_workflow_id}")
                # task_info is expected to be a dictionary of len 1 whose
                # only value is a list of dicts containing panda job info.
                task_info = new_ret[1][1]
                if len(task_info) != 1:
                    raise RuntimeError(
                        f"Unexpected job return from PanDA: {task_info} for id: {transform_workload_id}"
                    )
                wmsjobs = next(iter(task_info.values()))
                exit_codes = [
                    wmsjob["trans_exit_code"]
                    for wmsjob in wmsjobs
                    if wmsjob["trans_exit_code"] is not None and int(wmsjob["trans_exit_code"]) != 0
                ]
            exit_codes_all[tasklabel] = exit_codes

            # A status missing from state_map contributes zero jobs to every
            # state instead of aborting the whole report with a KeyError.
            mapped_states = state_map.get(status, ())

            # Fill number of jobs in all WmsStates
            for state in WmsStates:
                njobs = 0
                if state in mapped_states and state in file_map:
                    count = task[file_map[state]]
                    if count is not None:
                        njobs = count
                    if state == WmsStates.RUNNING:
                        njobs += task["output_new_files"] - task["input_new_files"]
                wms_report.job_state_counts[state] += njobs
                taskstatus[state] = njobs
            wms_report.job_summary[tasklabel] = taskstatus

            # To fill the EXPECTED column
            if wms_report.run_summary:
                wms_report.run_summary += ";"
            wms_report.run_summary += f"{tasklabel}:{str(totaljobs)}"

            # The workflow state follows the last task (in sorted order)
            # whose status is listed in wf_status.
            if status in wf_status:
                wf_succeed = True
                wms_report.state = state_map[status][0]

        # No task had a status listed in wf_status: set the workflow FAILED
        if not wf_succeed:
            wms_report.state = WmsStates.FAILED
        wms_report.exit_code_summary = exit_codes_all
        run_reports.append(wms_report)

        return run_reports, message

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
        # Docstring inherited from BaseWmsService.list_submitted_jobs.
        #
        # Returns the ``request_id`` of every request in the iDDS result.
        # Filtering by ``user`` without a ``wms_id`` is rejected, and a
        # failed query raises RuntimeError.
        if wms_id is None and user is not None:
            raise RuntimeError(
                "Error to get workflow status report: wms_id is required"
                " and filtering workflows with 'user' is not supported."
            )

        idds_client = get_idds_client(self.config)
        ret = idds_client.get_requests(request_id=wms_id)
        _LOG.debug("PanDA get workflows returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            return [req["request_id"] for req in result]

        raise RuntimeError(f"Error list PanDA workflow requests: {error}")

    def cancel(self, wms_id, pass_thru=None):
        # Docstring inherited from BaseWmsService.cancel.
        #
        # Returns ``(True, <JSON result>)`` on success and
        # ``(False, <error message>)`` otherwise; it does not raise on a
        # failed abort.
        idds_client = get_idds_client(self.config)
        ret = idds_client.abort(request_id=wms_id)
        _LOG.debug("Abort PanDA workflow returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            _LOG.info("Aborting PanDA workflow %s", result)
            return True, json.dumps(result)

        return False, f"Error abort PanDA workflow: {str(error)}"

    def ping(self, pass_thru=None):
        # Docstring inherited from BaseWmsService.ping.
        #
        # Returns ``(0, None)`` only when the decoded result contains
        # ``"Status": "OK"``; every other outcome returns ``-1`` with a
        # message describing the result or the error.
        idds_client = get_idds_client(self.config)
        ret = idds_client.ping()
        _LOG.debug("Ping PanDA service returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            if "Status" in result and result["Status"] == "OK":
                return 0, None

            return -1, f"Error ping PanDA service: {str(result)}"

        return -1, f"Error ping PanDA service: {str(error)}"

    def run_submission_checks(self):
        # Docstring inherited from BaseWmsService.run_submission_checks.
        #
        # Raises OSError when a required environment variable is missing and
        # RuntimeError when the service ping does not return status 0.
        for key in ["PANDA_URL"]:
            if key not in os.environ:
                raise OSError(f"Missing environment variable {key}")

        status, message = self.ping()
        if status != 0:
            raise RuntimeError(message)
class PandaBpsWmsWorkflow(BaseWmsWorkflow):
    """A single Panda based workflow.

    Parameters
    ----------
    name : `str`
        Unique name for Workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """

    def __init__(self, name, config=None):
        super().__init__(name, config)
        # Files to copy before submission, keyed src -> dest.
        self.files_to_pre_stage = dict()
        # iDDS client workflow that the generic workflow is translated into.
        self.idds_client_workflow = IDDS_client_workflow(name=name)

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited from BaseWmsWorkflow.from_generic_workflow.
        #
        # ``out_prefix`` and ``service_class`` are not used here.
        instance = cls(generic_workflow.name, config)
        idds_workflow = instance.idds_client_workflow
        staged = instance.files_to_pre_stage

        # Add the work for the generic workflow and record the files it
        # reports.
        main_files, dag_sink_work, task_count = add_idds_work(config, generic_workflow, idds_workflow)
        staged.update(main_files)

        # Add the final work after the sink work, numbered one past the
        # task count returned above.
        final_files = add_final_idds_work(
            config, generic_workflow, idds_workflow, dag_sink_work, task_count + 1, 1
        )
        staged.update(final_files)

        return instance

    def write(self, out_prefix):
        # Docstring inherited from BaseWmsWorkflow.write.
        #
        # Pickles this workflow object to "panda_workflow.pickle" inside
        # ``out_prefix``.
        pickle_path = os.path.join(out_prefix, "panda_workflow.pickle")
        with open(pickle_path, "wb") as stream:
            pickle.dump(self, stream)