Coverage for python/lsst/ctrl/bps/panda/panda_service.py: 10%
191 statements
« prev ^ index » next coverage.py v7.5.0, created at 2024-04-24 03:05 -0700
« prev ^ index » next coverage.py v7.5.0, created at 2024-04-24 03:05 -0700
1# This file is part of ctrl_bps_panda.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <https://www.gnu.org/licenses/>.
27"""Interface between generic workflow to PanDA/iDDS workflow system.
28"""
31__all__ = ["PanDAService", "PandaBpsWmsWorkflow"]
34import json
35import logging
36import os
37import pickle
38import re
40from idds.workflowv2.workflow import Workflow as IDDS_client_workflow
41from lsst.ctrl.bps import BaseWmsService, BaseWmsWorkflow, WmsRunReport, WmsStates
42from lsst.ctrl.bps.panda.constants import PANDA_DEFAULT_MAX_COPY_WORKERS
43from lsst.ctrl.bps.panda.utils import (
44 add_final_idds_work,
45 add_idds_work,
46 copy_files_for_distribution,
47 create_idds_build_workflow,
48 get_idds_client,
49 get_idds_result,
50)
52_LOG = logging.getLogger(__name__)
class PanDAService(BaseWmsService):
    """PanDA version of WMS service.

    All communication with PanDA goes through the iDDS client returned by
    ``get_idds_client``; replies are decoded with ``get_idds_result``.
    """

    # Overall iDDS request status -> state reported for the whole run.
    # Any status not listed here is reported as WmsStates.RUNNING.
    _WORKFLOW_STATES = {
        "Finished": WmsStates.SUCCEEDED,
        "SubFinished": WmsStates.SUCCEEDED,
        "Failed": WmsStates.FAILED,
        "Expired": WmsStates.FAILED,
        "Cancelled": WmsStates.DELETED,
        "Suspended": WmsStates.HELD,
    }

    # iDDS transform (task) status -> WmsStates whose job counters are
    # reported for a task in that status. States not listed for a status
    # are reported with a count of zero.
    _TASK_STATES = {
        "Finished": [WmsStates.SUCCEEDED],
        "SubFinished": [
            WmsStates.SUCCEEDED,
            WmsStates.FAILED,
            WmsStates.PRUNED,
        ],
        "Transforming": [
            WmsStates.RUNNING,
            WmsStates.SUCCEEDED,
            WmsStates.FAILED,
            WmsStates.UNREADY,
            WmsStates.PRUNED,
        ],
        "Failed": [WmsStates.FAILED, WmsStates.PRUNED],
    }

    # WmsStates -> name of the task record field holding the job count.
    _STATE_COUNTERS = {
        WmsStates.SUCCEEDED: "output_processed_files",
        WmsStates.RUNNING: "output_processing_files",
        WmsStates.FAILED: "output_failed_files",
        WmsStates.UNREADY: "input_new_files",
        WmsStates.PRUNED: "output_missing_files",
    }

    def prepare(self, config, generic_workflow, out_prefix=None):
        # Docstring inherited from BaseWmsService.prepare.
        _LOG.debug("out_prefix = '%s'", out_prefix)
        service_class = f"{self.__class__.__module__}.{self.__class__.__name__}"
        workflow = PandaBpsWmsWorkflow.from_generic_workflow(
            config, generic_workflow, out_prefix, service_class
        )
        workflow.write(out_prefix)
        return workflow

    def submit(self, workflow, **kwargs):
        # Docstring inherited from BaseWmsService.submit.
        config = kwargs.get("config")
        remote_build = kwargs.get("remote_build")

        if config and remote_build:
            # Remote build: submit a build workflow created from the keyword
            # arguments and hand it back to the caller with its run id set.
            _LOG.info("remote build")

            idds_build_workflow = create_idds_build_workflow(**kwargs)
            idds_client = get_idds_client(self.config)
            ret = idds_client.submit_build(idds_build_workflow, username=None, use_dataset_name=False)
            idds_build_workflow.run_id = self._get_submitted_request_id(ret)
            return idds_build_workflow

        _, max_copy_workers = self.config.search(
            "maxCopyWorkers", opt={"default": PANDA_DEFAULT_MAX_COPY_WORKERS}
        )
        file_distribution_uri = self.config["fileDistributionEndPoint"]
        # When the configured end point refers to LSST_RUN_TEMP_SPACE but that
        # environment variable is not set, use the default end point instead.
        lsst_temp = "LSST_RUN_TEMP_SPACE"
        if lsst_temp in file_distribution_uri and lsst_temp not in os.environ:
            file_distribution_uri = self.config["fileDistributionEndPointDefault"]
        copy_files_for_distribution(workflow.files_to_pre_stage, file_distribution_uri, max_copy_workers)

        idds_client = get_idds_client(self.config)
        ret = idds_client.submit(workflow.idds_client_workflow, username=None, use_dataset_name=False)
        workflow.run_id = self._get_submitted_request_id(ret)

    @staticmethod
    def _get_submitted_request_id(ret):
        """Extract the request id from the reply to a submission.

        Parameters
        ----------
        ret : `object`
            Raw value returned by the iDDS client submit call.

        Returns
        -------
        request_id : `int`
            Id of the newly created request.

        Raises
        ------
        RuntimeError
            Raised if the reply reports that the submission failed.
        """
        _LOG.debug("iDDS client manager submit returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if not status:
            raise RuntimeError(f"Error submitting to PanDA service: {error}")

        request_id = int(result)
        _LOG.info("Submitted into iDDs with request id=%s", request_id)
        return request_id

    def restart(self, wms_workflow_id):
        # Docstring inherited from BaseWmsService.restart.
        idds_client = get_idds_client(self.config)
        ret = idds_client.retry(request_id=wms_workflow_id)
        _LOG.debug("Restart PanDA workflow returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            _LOG.info("Restarting PanDA workflow %s", result)
            return wms_workflow_id, None, json.dumps(result)

        return None, None, f"Error retry PanDA workflow: {error!s}"

    def report(
        self,
        wms_workflow_id=None,
        user=None,
        hist=0,
        pass_thru=None,
        is_global=False,
        return_exit_codes=False,
    ):
        # Docstring inherited from BaseWmsService.report.
        message = ""
        run_reports = []

        if not wms_workflow_id:
            message = "Run summary not implemented yet, use 'bps report --id <workflow_id>' instead"
            return run_reports, message

        idds_client = get_idds_client(self.config)
        ret = idds_client.get_requests(request_id=wms_workflow_id, with_detail=True)
        _LOG.debug("PanDA get workflow status returned = %s", ret)

        request_status = ret[0]
        if request_status != 0:
            raise RuntimeError(f"Error to get workflow status: {ret} for id: {wms_workflow_id}")

        tasks = ret[1][1]
        if not tasks:
            message = f"No records found for workflow id '{wms_workflow_id}'. Hint: double check the id"
            return run_reports, message

        # Request-level information is repeated in every record, so it is
        # taken from the first one.
        head = tasks[0]
        workflow_status = head["status"]["attributes"]["_name_"]
        wms_report = WmsRunReport(
            wms_id=str(head["request_id"]),
            operator=head["username"],
            project="",
            campaign="",
            payload="",
            run=head["name"],
            state=self._WORKFLOW_STATES.get(workflow_status, WmsStates.RUNNING),
            total_number_jobs=0,
            job_state_counts=dict.fromkeys(WmsStates, 0),
            job_summary={},
            run_summary="",
            exit_code_summary=[],
        )

        # Drop records of tasks that were not created (they appear because
        # of an outer join between requests table and transforms table).
        # Doing it before sorting keeps their empty fields out of the sort.
        tasks = [task for task in tasks if task["transform_id"] is not None]
        try:
            tasks.sort(key=lambda x: x["transform_workload_id"])
        except (KeyError, TypeError):
            # Workload ids missing or not comparable with each other.
            tasks.sort(key=lambda x: x["transform_id"])

        # The run name is escaped because it is data, not a regular
        # expression; it may contain characters such as '.' or '+'.
        run_prefix = re.escape(wms_report.run + "_")

        exit_codes_all = {}
        expected = []
        # Loop over all tasks data returned by idds_client
        for task in tasks:
            totaljobs = task["output_total_files"]
            wms_report.total_number_jobs += totaljobs
            tasklabel = re.sub(run_prefix, "", task["transform_name"])
            status = task["transform_status"]["attributes"]["_name_"]

            # If the state is failed, gather exit code information.
            exit_codes = []
            if status in ("SubFinished", "Failed"):
                exit_codes = self._get_task_exit_codes(idds_client, wms_workflow_id, task)
            exit_codes_all[tasklabel] = exit_codes

            # Fill number of jobs in all WmsStates
            taskstatus = self._count_jobs_by_state(task, status)
            for state, njobs in taskstatus.items():
                wms_report.job_state_counts[state] += njobs
            wms_report.job_summary[tasklabel] = taskstatus

            # To fill the EXPECTED column
            expected.append(f"{tasklabel}:{totaljobs}")

        wms_report.run_summary = ";".join(expected)
        wms_report.exit_code_summary = exit_codes_all
        run_reports.append(wms_report)

        return run_reports, message

    @staticmethod
    def _get_task_exit_codes(idds_client, wms_workflow_id, task):
        """Retrieve the non-zero exit codes of the jobs of one task.

        Parameters
        ----------
        idds_client : `object`
            Client used to query the service; only its
            ``get_contents_output_ext`` method is called.
        wms_workflow_id : `int` or `str`
            Id of the request the task belongs to.
        task : `dict`
            Task record as returned by ``get_requests``.

        Returns
        -------
        exit_codes : `list`
            The ``trans_exit_code`` of every job for which it is set and
            non-zero. Always empty for tasks whose name starts with
            ``build_task``, which are not queried.

        Raises
        ------
        RuntimeError
            Raised if the query fails or if the reply does not contain
            exactly one entry.
        """
        transform_workload_id = task["transform_workload_id"]
        transform_name = task["transform_name"]
        if transform_name and transform_name.startswith("build_task"):
            return []

        ret = idds_client.get_contents_output_ext(
            request_id=wms_workflow_id, workload_id=transform_workload_id
        )
        _LOG.debug("PanDA get task %s detail returned = %s", transform_workload_id, ret)

        if ret[0] != 0:
            raise RuntimeError(f"Error to get workflow status: {ret} for id: {wms_workflow_id}")

        # task_info is a dictionary of len 1 that contains
        # a list of dicts containing panda job info
        task_info = ret[1][1]
        if len(task_info) != 1:
            raise RuntimeError(
                f"Unexpected job return from PanDA: {task_info} for id: {transform_workload_id}"
            )
        wmsjobs = next(iter(task_info.values()))

        return [
            wmsjob["trans_exit_code"]
            for wmsjob in wmsjobs
            if wmsjob["trans_exit_code"] is not None and int(wmsjob["trans_exit_code"]) != 0
        ]

    @classmethod
    def _count_jobs_by_state(cls, task, status):
        """Count the jobs of one task in every WMS state.

        Parameters
        ----------
        task : `dict`
            Task record as returned by ``get_requests``.
        status : `str`
            Name of the iDDS status of the task.

        Returns
        -------
        counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
            Number of jobs for every member of ``WmsStates``; zero for the
            states that are not reported for ``status``.
        """
        reported_states = cls._TASK_STATES.get(status, ())
        counts = {}
        for state in WmsStates:
            njobs = 0
            if state in reported_states and state in cls._STATE_COUNTERS:
                counter = task[cls._STATE_COUNTERS[state]]
                if counter is not None:
                    njobs = counter
                if state == WmsStates.RUNNING:
                    njobs += task["output_new_files"] - task["input_new_files"]
            counts[state] = njobs
        return counts

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
        # Docstring inherited from BaseWmsService.list_submitted_jobs.
        if wms_id is None and user is not None:
            raise RuntimeError(
                "Error to get workflow status report: wms_id is required"
                " and filtering workflows with 'user' is not supported."
            )

        idds_client = get_idds_client(self.config)
        ret = idds_client.get_requests(request_id=wms_id)
        _LOG.debug("PanDA get workflows returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if not status:
            raise RuntimeError(f"Error list PanDA workflow requests: {error}")

        return [req["request_id"] for req in result]

    def cancel(self, wms_id, pass_thru=None):
        # Docstring inherited from BaseWmsService.cancel.
        idds_client = get_idds_client(self.config)
        ret = idds_client.abort(request_id=wms_id)
        _LOG.debug("Abort PanDA workflow returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if status:
            _LOG.info("Aborting PanDA workflow %s", result)
            return True, json.dumps(result)

        return False, f"Error abort PanDA workflow: {error!s}"

    def ping(self, pass_thru=None):
        # Docstring inherited from BaseWmsService.ping.
        idds_client = get_idds_client(self.config)
        ret = idds_client.ping()
        _LOG.debug("Ping PanDA service returned = %s", ret)

        status, result, error = get_idds_result(ret)
        if not status:
            return -1, f"Error ping PanDA service: {error!s}"

        # A decoded reply only counts as success if it reports Status "OK".
        if "Status" in result and result["Status"] == "OK":
            return 0, None

        return -1, f"Error ping PanDA service: {result!s}"

    def run_submission_checks(self):
        # Docstring inherited from BaseWmsService.run_submission_checks.
        for key in ["PANDA_URL"]:
            if key not in os.environ:
                raise OSError(f"Missing environment variable {key}")

        status, message = self.ping()
        if status != 0:
            raise RuntimeError(message)
class PandaBpsWmsWorkflow(BaseWmsWorkflow):
    """A single Panda based workflow.

    Parameters
    ----------
    name : `str`
        Unique name for Workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """

    def __init__(self, name, config=None):
        super().__init__(name, config)
        # Files to copy before submission, keyed as src -> dest.
        self.files_to_pre_stage = {}
        # iDDS workflow that the tasks of this workflow are added to.
        self.idds_client_workflow = IDDS_client_workflow(name=name)

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited from BaseWmsWorkflow.from_generic_workflow.
        panda_workflow = cls(generic_workflow.name, config)
        idds_workflow = panda_workflow.idds_client_workflow
        pre_stage = panda_workflow.files_to_pre_stage

        # Add the work for the generic workflow itself and remember which
        # files it needs staged.
        staged, dag_sink_work, task_count = add_idds_work(config, generic_workflow, idds_workflow)
        pre_stage.update(staged)

        # Add the final work after it, numbered right after the last task.
        staged = add_final_idds_work(
            config, generic_workflow, idds_workflow, dag_sink_work, task_count + 1, 1
        )
        pre_stage.update(staged)

        return panda_workflow

    def write(self, out_prefix):
        # Docstring inherited from BaseWmsWorkflow.write.
        pickle_path = os.path.join(out_prefix, "panda_workflow.pickle")
        with open(pickle_path, "wb") as stream:
            pickle.dump(self, stream)