Coverage for python/lsst/ctrl/bps/panda/panda_service.py: 11%
230 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-01-10 11:13 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2023-01-10 11:13 +0000
1# This file is part of ctrl_bps_panda.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
23__all__ = ["PanDAService", "PandaBpsWmsWorkflow"]
26import binascii
27import concurrent.futures
28import json
29import logging
30import os
31import re
33import idds.common.utils as idds_utils
34import pandaclient.idds_api
35from idds.doma.workflowv2.domapandawork import DomaPanDAWork
36from idds.workflowv2.workflow import AndCondition
37from idds.workflowv2.workflow import Workflow as IDDS_client_workflow
38from lsst.ctrl.bps.bps_config import BpsConfig
39from lsst.ctrl.bps.panda.idds_tasks import IDDSWorkflowGenerator
40from lsst.ctrl.bps.wms_service import BaseWmsService, BaseWmsWorkflow, WmsRunReport, WmsStates
41from lsst.resources import ResourcePath
43_LOG = logging.getLogger(__name__)
class PanDAService(BaseWmsService):
    """PanDA version of WMS service"""

    def prepare(self, config, generic_workflow, out_prefix=None):
        """Convert generic workflow to an PanDA iDDS ready for submission

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow to convert.
        out_prefix : `str`, optional
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        workflow : `lsst.ctrl.bps.panda.panda_service.PandaBpsWmsWorkflow`
            PanDA workflow ready to be run.
        """
        _LOG.debug("out_prefix = '%s'", out_prefix)
        service_class = f"{self.__class__.__module__}.{self.__class__.__name__}"
        panda_workflow = PandaBpsWmsWorkflow.from_generic_workflow(
            config, generic_workflow, out_prefix, service_class
        )
        panda_workflow.write(out_prefix)
        return panda_workflow
73 def convert_exec_string_to_hex(self, cmdline):
74 """Convert the command line into hex representation.
76 This step is currently involved because large blocks of command lines
77 including special symbols passed to the pilot/container. To make sure
78 the 1 to 1 matching and pass by the special symbol stripping
79 performed by the Pilot we applied the hexing.
81 Parameters
82 ----------
83 cmdline : `str`
84 UTF-8 command line string
86 Returns
87 -------
88 hex : `str`
89 Hex representation of string
90 """
91 return binascii.hexlify(cmdline.encode()).decode("utf-8")
93 def add_decoder_prefix(self, cmd_line, distribution_path, files):
94 """
95 Compose the command line sent to the pilot from the functional part
96 (the actual SW running) and the middleware part (containers invocation)
98 Parameters
99 ----------
100 cmd_line : `str`
101 UTF-8 based functional part of the command line
102 distribution_path : `str`
103 URI of path where all files are located for distribution
104 files `list` [`str`]
105 File names needed for a task
107 Returns
108 -------
109 decoder_prefix : `str`
110 Full command line to be executed on the edge node
111 """
113 cmdline_hex = self.convert_exec_string_to_hex(cmd_line)
114 _, decoder_prefix = self.config.search(
115 "runnerCommand", opt={"replaceEnvVars": False, "expandEnvVars": False}
116 )
117 decoder_prefix = decoder_prefix.replace(
118 "_cmd_line_",
119 str(cmdline_hex)
120 + " ${IN/L} "
121 + distribution_path
122 + " "
123 + "+".join(f"{k}:{v}" for k, v in files[0].items())
124 + " "
125 + "+".join(files[1]),
126 )
127 return decoder_prefix
129 def submit(self, workflow):
130 """Submit a single PanDA iDDS workflow
132 Parameters
133 ----------
134 workflow : `lsst.ctrl.bps.BaseWorkflow`
135 A single PanDA iDDS workflow to submit
136 """
137 idds_client_workflow = IDDS_client_workflow(name=workflow.name)
138 files = self.copy_files_for_distribution(
139 workflow.generated_tasks, self.config["fileDistributionEndPoint"]
140 )
141 DAG_end_work = []
142 DAG_final_work = None
144 _, processing_type = self.config.search("processing_type", opt={"default": None})
145 _, task_type = self.config.search("task_type", opt={"default": "test"})
146 _, prod_source_label = self.config.search("prodSourceLabel", opt={"default": None})
147 _, vo = self.config.search("vo", opt={"default": "wlcg"})
149 for idx, task in enumerate(workflow.generated_tasks):
150 work = DomaPanDAWork(
151 executable=self.add_decoder_prefix(
152 task.executable, self.config["fileDistributionEndPoint"], files
153 ),
154 primary_input_collection={
155 "scope": "pseudo_dataset",
156 "name": "pseudo_input_collection#" + str(idx),
157 },
158 output_collections=[
159 {"scope": "pseudo_dataset", "name": "pseudo_output_collection#" + str(idx)}
160 ],
161 log_collections=[],
162 dependency_map=task.dependencies,
163 task_name=task.name,
164 task_queue=task.queue,
165 task_log={
166 "destination": "local",
167 "value": "log.tgz",
168 "dataset": "PandaJob_#{pandaid}/",
169 "token": "local",
170 "param_type": "log",
171 "type": "template",
172 },
173 encode_command_line=True,
174 task_rss=task.max_rss,
175 task_cloud=task.cloud,
176 task_site=task.site,
177 task_priority=int(task.priority) if task.priority else 900,
178 core_count=task.core_count,
179 working_group=task.working_group,
180 processing_type=processing_type,
181 task_type=task_type,
182 prodSourceLabel=prod_source_label if prod_source_label else task.prod_source_label,
183 vo=vo,
184 maxattempt=task.max_attempt,
185 maxwalltime=task.max_walltime if task.max_walltime else 90000,
186 )
188 idds_client_workflow.add_work(work)
189 if task.is_final:
190 DAG_final_work = work
191 if task.is_dag_end:
192 DAG_end_work.append(work)
194 if DAG_final_work:
195 conditions = []
196 for work in DAG_end_work:
197 conditions.append(work.is_terminated)
198 and_cond = AndCondition(conditions=conditions, true_works=[DAG_final_work])
199 idds_client_workflow.add_condition(and_cond)
200 idds_client = self.get_idds_client()
201 ret = idds_client.submit(idds_client_workflow, username=None, use_dataset_name=False)
202 _LOG.debug("iDDS client manager submit returned = %s", ret)
204 # Check submission success
205 status, result, error = self.get_idds_result(ret)
206 if status:
207 request_id = int(result)
208 else:
209 raise RuntimeError(f"Error submitting to PanDA service: {error}")
211 _LOG.info("Submitted into iDDs with request id=%s", request_id)
212 workflow.run_id = request_id
214 @staticmethod
215 def copy_files_for_distribution(tasks, file_distribution_uri):
216 """
217 Brings locally generated files into Cloud for further
218 utilization them on the edge nodes.
220 Parameters
221 ----------
222 local_pfns: `list` of `tasks`
223 Tasks that input files needs to be placed for
224 distribution
225 file_distribution_uri: `str`
226 Path on the edge node accessed storage,
227 including access protocol, bucket name to place files
229 Returns
230 -------
231 files_plc_hldr, direct_IO_files : `dict` [`str`, `str`], `set` of `str`
232 First parameters is key values pairs
233 of file placeholder - file name
234 Second parameter is set of files which will be directly accessed.
235 """
236 local_pfns = {}
237 direct_IO_files = set()
238 for task in tasks:
239 for file in task.files_used_by_task:
240 if not file.delivered:
241 local_pfns[file.name] = file.submission_url
242 if file.direct_IO:
243 direct_IO_files.add(file.name)
245 files_to_copy = {}
247 # In case there are folders we iterate over its content
248 for local_pfn in local_pfns.values():
249 folder_name = os.path.basename(local_pfn)
250 if os.path.isdir(local_pfn):
251 files_in_folder = ResourcePath.findFileResources([local_pfn])
252 for file in files_in_folder:
253 file_name = file.basename()
254 files_to_copy[file] = ResourcePath(
255 os.path.join(file_distribution_uri, folder_name, file_name)
256 )
257 else:
258 files_to_copy[ResourcePath(local_pfn)] = ResourcePath(
259 os.path.join(file_distribution_uri, folder_name)
260 )
262 copy_executor = concurrent.futures.ThreadPoolExecutor(max_workers=10)
263 future_file_copy = []
264 for src, trgt in files_to_copy.items():
266 # S3 clients explicitly instantiate here to overpass this
267 # https://stackoverflow.com/questions/52820971/is-boto3-client-thread-safe
268 trgt.exists()
269 future_file_copy.append(copy_executor.submit(trgt.transfer_from, src, transfer="copy"))
270 for future in concurrent.futures.as_completed(future_file_copy):
271 if not future.result() is None:
272 raise RuntimeError("Error of placing files to the distribution point")
274 if len(direct_IO_files) == 0:
275 direct_IO_files.add("cmdlineplaceholder")
277 files_plc_hldr = {}
278 for file_placeholder, src_path in local_pfns.items():
279 files_plc_hldr[file_placeholder] = os.path.basename(src_path)
280 if os.path.isdir(src_path):
281 # this is needed to make isdir function working
282 # properly in ButlerURL instance on the egde node
283 files_plc_hldr[file_placeholder] += "/"
285 return files_plc_hldr, direct_IO_files
287 def get_idds_client(self):
288 """Get the idds client
290 Returns
291 -------
292 idds_client: `idds.client.clientmanager.ClientManager`
293 iDDS ClientManager object.
294 """
295 idds_server = None
296 if isinstance(self.config, BpsConfig):
297 _, idds_server = self.config.search("iddsServer", opt={"default": None})
298 elif isinstance(self.config, dict) and "iddsServer" in self.config:
299 idds_server = self.config["iddsServer"]
300 # if idds_server is None, a default value on the panda relay service
301 # will be used
302 idds_client = pandaclient.idds_api.get_api(
303 idds_utils.json_dumps, idds_host=idds_server, compress=True, manager=True
304 )
305 return idds_client
307 def get_idds_result(self, ret):
308 """Parse the results returned from iDDS.
310 Parameters
311 ----------
312 ret: `tuple` of (`int`, (`bool`, payload)).
313 The first part ret[0] is the status of PanDA relay service.
314 The part of ret[1][0] is the status of iDDS service.
315 The part of ret[1][1] is the returned payload.
316 If ret[1][0] is False, ret[1][1] can be error messages.
318 Returns
319 -------
320 status: `bool`
321 The status of iDDS calls.
322 result: `int` or `list` or `dict`
323 The result returned from iDDS.
324 error: `str`
325 Error messages.
326 """
327 # https://panda-wms.readthedocs.io/en/latest/client/rest_idds.html
328 if not (isinstance(ret, tuple) or isinstance(ret, list)) or ret[0] != 0:
329 # Something wrong with the PanDA relay service.
330 # The call may not be delivered to iDDS.
331 status = False
332 result = None
333 error = "PanDA relay service returns errors: %s" % str(ret)
334 else:
335 if ret[1][0]:
336 status = True
337 result = ret[1][1]
338 error = None
339 if isinstance(result, str) and "Authentication no permission" in result:
340 status = False
341 result = None
342 error = result
343 else:
344 # iDDS returns errors
345 status = False
346 result = None
347 error = "iDDS returns errors: %s" % str(ret[1][1])
348 return status, result, error
350 def restart(self, wms_workflow_id):
351 """Restart a workflow from the point of failure.
353 Parameters
354 ----------
355 wms_workflow_id : `str`
356 Id that can be used by WMS service to identify workflow that
357 need to be restarted.
359 Returns
360 -------
361 wms_id : `str`
362 Id of the restarted workflow. If restart failed, it will be set
363 to `None`.
364 run_name : `str`
365 Name of the restarted workflow. If restart failed, it will be set
366 to `None`.
367 message : `str`
368 A message describing any issues encountered during the restart.
369 If there were no issue, an empty string is returned.
370 """
371 idds_client = self.get_idds_client()
372 ret = idds_client.retry(request_id=wms_workflow_id)
373 _LOG.debug("Restart PanDA workflow returned = %s", ret)
375 status, result, error = self.get_idds_result(ret)
376 if status:
377 _LOG.info("Restarting PanDA workflow %s", result)
378 return wms_workflow_id, None, json.dumps(result)
379 else:
380 return None, None, "Error retry PanDA workflow: %s" % str(error)
382 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False):
383 """Stub for future implementation of the report method
384 Expected to return run information based upon given constraints.
386 Parameters
387 ----------
388 wms_workflow_id : `int` or `str`
389 Limit to specific run based on id.
390 user : `str`
391 Limit results to runs for this user.
392 hist : `float`
393 Limit history search to this many days.
394 pass_thru : `str`
395 Constraints to pass through to HTCondor.
396 is_global : `bool`, optional
397 If set, all available job queues will be queried for job
398 information. Defaults to False which means that only a local job
399 queue will be queried for information.
401 Returns
402 -------
403 runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
404 Information about runs from given job information.
405 message : `str`
406 Extra message for report command to print. This could be
407 pointers to documentation or to WMS specific commands.
408 """
409 message = ""
410 run_reports = []
412 if not wms_workflow_id:
413 message = "Run summary not implemented yet, use 'bps report --id <workflow_id>' instead"
414 return run_reports, message
416 idds_client = self.get_idds_client()
417 ret = idds_client.get_requests(request_id=wms_workflow_id, with_detail=True)
418 _LOG.debug("PanDA get workflow status returned = %s", str(ret))
420 request_status = ret[0]
421 if request_status != 0:
422 raise RuntimeError(f"Error to get workflow status: {ret} for id: {wms_workflow_id}")
424 tasks = ret[1][1]
425 if not tasks:
426 message = f"No records found for workflow id '{wms_workflow_id}'. Hint: double check the id"
427 else:
428 head = tasks[0]
429 wms_report = WmsRunReport(
430 wms_id=str(head["request_id"]),
431 operator=head["username"],
432 project="",
433 campaign="",
434 payload="",
435 run=head["name"],
436 state=WmsStates.UNKNOWN,
437 total_number_jobs=0,
438 job_state_counts={state: 0 for state in WmsStates},
439 job_summary={},
440 run_summary="",
441 )
443 # The status of a task is taken from the first item of state_map.
444 # The workflow is in status WmsStates.FAILED when:
445 # All tasks have failed.
446 # SubFinished tasks has jobs in
447 # output_processed_files: Finished
448 # output_failed_files: Failed
449 # output_missing_files: Missing
450 state_map = {
451 "Finished": [WmsStates.SUCCEEDED],
452 "SubFinished": [
453 WmsStates.SUCCEEDED,
454 WmsStates.FAILED,
455 WmsStates.PRUNED,
456 ],
457 "Transforming": [
458 WmsStates.RUNNING,
459 WmsStates.SUCCEEDED,
460 WmsStates.FAILED,
461 WmsStates.UNREADY,
462 WmsStates.PRUNED,
463 ],
464 "Failed": [WmsStates.FAILED, WmsStates.PRUNED],
465 }
467 file_map = {
468 WmsStates.SUCCEEDED: "output_processed_files",
469 WmsStates.RUNNING: "output_processing_files",
470 WmsStates.FAILED: "output_failed_files",
471 WmsStates.UNREADY: "input_new_files",
472 WmsStates.PRUNED: "output_missing_files",
473 }
475 # workflow status to report as SUCCEEDED
476 wf_status = ["Finished", "SubFinished", "Transforming"]
478 wf_succeed = False
480 tasks.sort(key=lambda x: x["transform_workload_id"])
482 # Loop over all tasks data returned by idds_client
483 for task in tasks:
484 totaljobs = task["output_total_files"]
485 wms_report.total_number_jobs += totaljobs
486 tasklabel = task["transform_name"]
487 tasklabel = re.sub(wms_report.run + "_", "", tasklabel)
488 status = task["transform_status"]["attributes"]["_name_"]
489 taskstatus = {}
490 # Fill number of jobs in all WmsStates
491 for state in WmsStates:
492 njobs = 0
493 # Each WmsState have many iDDS status mapped to it.
494 for mappedstate in state_map[status]:
495 if state in file_map and mappedstate == state:
496 if task[file_map[mappedstate]] is not None:
497 njobs = task[file_map[mappedstate]]
498 if state == WmsStates.RUNNING:
499 njobs += task["output_new_files"] - task["input_new_files"]
500 break
501 wms_report.job_state_counts[state] += njobs
502 taskstatus[state] = njobs
503 wms_report.job_summary[tasklabel] = taskstatus
505 # To fill the EXPECTED column
506 if wms_report.run_summary:
507 wms_report.run_summary += ";"
508 wms_report.run_summary += "%s:%s" % (tasklabel, str(totaljobs))
510 if status in wf_status:
511 wf_succeed = True
512 wms_report.state = state_map[status][0]
514 # All tasks have failed, set the workflow FAILED
515 if not wf_succeed:
516 wms_report.state = WmsStates.FAILED
518 run_reports.append(wms_report)
520 return run_reports, message
522 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
523 """Query WMS for list of submitted WMS workflows/jobs.
525 This should be a quick lookup function to create list of jobs for
526 other functions.
528 Parameters
529 ----------
530 wms_id : `int` or `str`, optional
531 Id or path that can be used by WMS service to look up job.
532 user : `str`, optional
533 User whose submitted jobs should be listed.
534 require_bps : `bool`, optional
535 Whether to require jobs returned in list to be bps-submitted jobs.
536 pass_thru : `str`, optional
537 Information to pass through to WMS.
538 is_global : `bool`, optional
539 If set, all available job queues will be queried for job
540 information. Defaults to False which means that only a local job
541 queue will be queried for information.
543 Only applicable in the context of a WMS using distributed job
544 queues (e.g., HTCondor). A WMS with a centralized job queue
545 (e.g. PanDA) can safely ignore it.
547 Returns
548 -------
549 req_ids : `list` [`Any`]
550 Only job ids to be used by cancel and other functions. Typically
551 this means top-level jobs (i.e., not children jobs).
552 """
553 if wms_id is None and user is not None:
554 raise RuntimeError(
555 "Error to get workflow status report: wms_id is required"
556 " and filtering workflows with 'user' is not supported."
557 )
559 idds_client = self.get_idds_client()
560 ret = idds_client.get_requests(request_id=wms_id)
561 _LOG.debug("PanDA get workflows returned = %s", ret)
563 status, result, error = self.get_idds_result(ret)
564 if status:
565 req_ids = [req["request_id"] for req in result]
566 return req_ids
567 else:
568 raise RuntimeError(f"Error list PanDA workflow requests: {error}")
570 def cancel(self, wms_id, pass_thru=None):
571 """Cancel submitted workflows/jobs.
573 Parameters
574 ----------
575 wms_id : `str`
576 ID or path of job that should be canceled.
577 pass_thru : `str`, optional
578 Information to pass through to WMS.
580 Returns
581 -------
582 deleted : `bool`
583 Whether successful deletion or not. Currently, if any doubt or any
584 individual jobs not deleted, return False.
585 message : `str`
586 Any message from WMS (e.g., error details).
587 """
588 idds_client = self.get_idds_client()
589 ret = idds_client.abort(request_id=wms_id)
590 _LOG.debug("Abort PanDA workflow returned = %s", ret)
592 status, result, error = self.get_idds_result(ret)
593 if status:
594 _LOG.info("Aborting PanDA workflow %s", result)
595 return True, json.dumps(result)
596 else:
597 return False, "Error abort PanDA workflow: %s" % str(error)
599 def ping(self, pass_thru=None):
600 """Checks whether PanDA WMS services are up, reachable,
601 and can authenticate if authentication is required.
603 The services to be checked are those needed for submit, report, cancel,
604 restart, but ping cannot guarantee whether jobs would actually run
605 successfully. Any messages should be sent directly to the logger.
607 Parameters
608 ----------
609 pass_thru : `str`, optional
610 Information to pass through to WMS.
612 Returns
613 -------
614 status : `int`
615 0 for success, non-zero for failure
616 message : `str`
617 Any message from WMS (e.g., error details).
618 """
619 idds_client = self.get_idds_client()
620 ret = idds_client.ping()
621 _LOG.debug("Ping PanDA service returned = %s", ret)
623 status, result, error = self.get_idds_result(ret)
624 if status:
625 if "Status" in result and result["Status"] == "OK":
626 return 0, None
627 else:
628 return -1, "Error ping PanDA service: %s" % str(result)
629 else:
630 return -1, "Error ping PanDA service: %s" % str(error)
632 def run_submission_checks(self):
633 """Checks to run at start if running WMS specific submission steps.
635 Any exception other than NotImplementedError will halt submission.
636 Submit directory may not yet exist when this is called.
637 """
638 for key in ["PANDA_URL"]:
639 if key not in os.environ:
640 raise OSError(f"Missing environment variable {key}")
642 status, message = self.ping()
643 if status != 0:
644 raise RuntimeError(message)
class PandaBpsWmsWorkflow(BaseWmsWorkflow):
    """A single Panda based workflow

    Parameters
    ----------
    name : `str`
        Unique name for Workflow
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information
    """

    def __init__(self, name, config=None):
        super().__init__(name, config)
        # Filled in later by from_generic_workflow().
        self.generated_tasks = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited from parent class
        idds_workflow = cls(generic_workflow.name, config)
        generator = IDDSWorkflowGenerator(generic_workflow, config)
        idds_workflow.generated_tasks = generator.define_tasks()
        _LOG.debug("panda dag attribs %s", generic_workflow.run_attrs)
        return idds_workflow
671 def write(self, out_prefix):
672 """Not yet implemented"""