Coverage for python/lsst/ctrl/bps/panda/panda_service.py: 14%
180 statements
« prev ^ index » next coverage.py v7.2.1, created at 2023-03-12 21:11 -0700
1# This file is part of ctrl_bps_panda.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
# Public API of this module.
__all__ = ["PanDAService", "PandaBpsWmsWorkflow"]
26import binascii
27import concurrent.futures
28import json
29import logging
30import os
32import idds.common.utils as idds_utils
33import pandaclient.idds_api
34from idds.doma.workflowv2.domapandawork import DomaPanDAWork
35from idds.workflowv2.workflow import AndCondition
36from idds.workflowv2.workflow import Workflow as IDDS_client_workflow
37from lsst.ctrl.bps.bps_config import BpsConfig
38from lsst.ctrl.bps.panda.idds_tasks import IDDSWorkflowGenerator
39from lsst.ctrl.bps.wms_service import BaseWmsService, BaseWmsWorkflow
40from lsst.resources import ResourcePath
# Module-level logger, named after this module per lsst.ctrl.bps convention.
_LOG = logging.getLogger(__name__)
class PanDAService(BaseWmsService):
    """PanDA version of WMS service"""

    def prepare(self, config, generic_workflow, out_prefix=None):
        """Convert a generic workflow into a PanDA iDDS workflow ready
        for submission.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The workflow graph to convert.
        out_prefix : `str`, optional
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        workflow : `lsst.ctrl.bps.panda.panda_service.PandaBpsWmsWorkflow`
            PanDA workflow ready to be run.
        """
        _LOG.debug("out_prefix = '%s'", out_prefix)
        # Fully-qualified name of this service class, recorded in the
        # generated workflow for provenance.
        service_class = f"{self.__class__.__module__}.{self.__class__.__name__}"
        wms_workflow = PandaBpsWmsWorkflow.from_generic_workflow(
            config, generic_workflow, out_prefix, service_class
        )
        wms_workflow.write(out_prefix)
        return wms_workflow
72 def convert_exec_string_to_hex(self, cmdline):
73 """Convert the command line into hex representation.
75 This step is currently involved because large blocks of command lines
76 including special symbols passed to the pilot/container. To make sure
77 the 1 to 1 matching and pass by the special symbol stripping
78 performed by the Pilot we applied the hexing.
80 Parameters
81 ----------
82 cmdline : `str`
83 UTF-8 command line string
85 Returns
86 -------
87 hex : `str`
88 Hex representation of string
89 """
90 return binascii.hexlify(cmdline.encode()).decode("utf-8")
92 def add_decoder_prefix(self, cmd_line, distribution_path, files):
93 """
94 Compose the command line sent to the pilot from the functional part
95 (the actual SW running) and the middleware part (containers invocation)
97 Parameters
98 ----------
99 cmd_line : `str`
100 UTF-8 based functional part of the command line
101 distribution_path : `str`
102 URI of path where all files are located for distribution
103 files `list` [`str`]
104 File names needed for a task
106 Returns
107 -------
108 decoder_prefix : `str`
109 Full command line to be executed on the edge node
110 """
112 cmdline_hex = self.convert_exec_string_to_hex(cmd_line)
113 _, decoder_prefix = self.config.search(
114 "runnerCommand", opt={"replaceEnvVars": False, "expandEnvVars": False}
115 )
116 decoder_prefix = decoder_prefix.replace(
117 "_cmd_line_",
118 str(cmdline_hex)
119 + " ${IN/L} "
120 + distribution_path
121 + " "
122 + "+".join(f"{k}:{v}" for k, v in files[0].items())
123 + " "
124 + "+".join(files[1]),
125 )
126 return decoder_prefix
128 def submit(self, workflow):
129 """Submit a single PanDA iDDS workflow
131 Parameters
132 ----------
133 workflow : `lsst.ctrl.bps.BaseWorkflow`
134 A single PanDA iDDS workflow to submit
135 """
136 idds_client_workflow = IDDS_client_workflow(name=workflow.name)
137 files = self.copy_files_for_distribution(
138 workflow.generated_tasks, self.config["fileDistributionEndPoint"]
139 )
140 DAG_end_work = []
141 DAG_final_work = None
143 _, processing_type = self.config.search("processing_type", opt={"default": None})
144 _, task_type = self.config.search("task_type", opt={"default": "test"})
145 _, prod_source_label = self.config.search("prodSourceLabel", opt={"default": None})
146 _, vo = self.config.search("vo", opt={"default": "wlcg"})
148 for idx, task in enumerate(workflow.generated_tasks):
149 work = DomaPanDAWork(
150 executable=self.add_decoder_prefix(
151 task.executable, self.config["fileDistributionEndPoint"], files
152 ),
153 primary_input_collection={
154 "scope": "pseudo_dataset",
155 "name": "pseudo_input_collection#" + str(idx),
156 },
157 output_collections=[
158 {"scope": "pseudo_dataset", "name": "pseudo_output_collection#" + str(idx)}
159 ],
160 log_collections=[],
161 dependency_map=task.dependencies,
162 task_name=task.name,
163 task_queue=task.queue,
164 task_log={
165 "destination": "local",
166 "value": "log.tgz",
167 "dataset": "PandaJob_#{pandaid}/",
168 "token": "local",
169 "param_type": "log",
170 "type": "template",
171 },
172 encode_command_line=True,
173 task_rss=task.max_rss,
174 task_cloud=task.cloud,
175 task_site=task.site,
176 task_priority=int(task.priority) if task.priority else 900,
177 core_count=task.core_count,
178 working_group=task.working_group,
179 processing_type=processing_type,
180 task_type=task_type,
181 prodSourceLabel=prod_source_label if prod_source_label else task.prod_source_label,
182 vo=vo,
183 maxattempt=task.max_attempt,
184 maxwalltime=task.max_walltime if task.max_walltime else 90000,
185 )
187 idds_client_workflow.add_work(work)
188 if task.is_final:
189 DAG_final_work = work
190 if task.is_dag_end:
191 DAG_end_work.append(work)
193 if DAG_final_work:
194 conditions = []
195 for work in DAG_end_work:
196 conditions.append(work.is_terminated)
197 and_cond = AndCondition(conditions=conditions, true_works=[DAG_final_work])
198 idds_client_workflow.add_condition(and_cond)
199 idds_client = self.get_idds_client()
200 ret = idds_client.submit(idds_client_workflow, username=None, use_dataset_name=False)
201 _LOG.debug("iDDS client manager submit returned = %s", ret)
203 # Check submission success
204 status, result, error = self.get_idds_result(ret)
205 if status:
206 request_id = int(result)
207 else:
208 raise RuntimeError(f"Error submitting to PanDA service: {error}")
210 _LOG.info("Submitted into iDDs with request id=%s", request_id)
211 workflow.run_id = request_id
213 @staticmethod
214 def copy_files_for_distribution(tasks, file_distribution_uri):
215 """
216 Brings locally generated files into Cloud for further
217 utilization them on the edge nodes.
219 Parameters
220 ----------
221 local_pfns: `list` of `tasks`
222 Tasks that input files needs to be placed for
223 distribution
224 file_distribution_uri: `str`
225 Path on the edge node accessed storage,
226 including access protocol, bucket name to place files
228 Returns
229 -------
230 files_plc_hldr, direct_IO_files : `dict` [`str`, `str`], `set` of `str`
231 First parameters is key values pairs
232 of file placeholder - file name
233 Second parameter is set of files which will be directly accessed.
234 """
235 local_pfns = {}
236 direct_IO_files = set()
237 for task in tasks:
238 for file in task.files_used_by_task:
239 if not file.delivered:
240 local_pfns[file.name] = file.submission_url
241 if file.direct_IO:
242 direct_IO_files.add(file.name)
244 files_to_copy = {}
246 # In case there are folders we iterate over its content
247 for local_pfn in local_pfns.values():
248 folder_name = os.path.basename(local_pfn)
249 if os.path.isdir(local_pfn):
250 files_in_folder = ResourcePath.findFileResources([local_pfn])
251 for file in files_in_folder:
252 file_name = file.basename()
253 files_to_copy[file] = ResourcePath(
254 os.path.join(file_distribution_uri, folder_name, file_name)
255 )
256 else:
257 files_to_copy[ResourcePath(local_pfn)] = ResourcePath(
258 os.path.join(file_distribution_uri, folder_name)
259 )
261 copy_executor = concurrent.futures.ThreadPoolExecutor(max_workers=10)
262 future_file_copy = []
263 for src, trgt in files_to_copy.items():
264 # S3 clients explicitly instantiate here to overpass this
265 # https://stackoverflow.com/questions/52820971/is-boto3-client-thread-safe
266 trgt.exists()
267 future_file_copy.append(copy_executor.submit(trgt.transfer_from, src, transfer="copy"))
268 for future in concurrent.futures.as_completed(future_file_copy):
269 if not future.result() is None:
270 raise RuntimeError("Error of placing files to the distribution point")
272 if len(direct_IO_files) == 0:
273 direct_IO_files.add("cmdlineplaceholder")
275 files_plc_hldr = {}
276 for file_placeholder, src_path in local_pfns.items():
277 files_plc_hldr[file_placeholder] = os.path.basename(src_path)
278 if os.path.isdir(src_path):
279 # this is needed to make isdir function working
280 # properly in ButlerURL instance on the egde node
281 files_plc_hldr[file_placeholder] += "/"
283 return files_plc_hldr, direct_IO_files
285 def get_idds_client(self):
286 """Get the idds client
288 Returns
289 -------
290 idds_client: `idds.client.clientmanager.ClientManager`
291 iDDS ClientManager object.
292 """
293 idds_server = None
294 if isinstance(self.config, BpsConfig):
295 _, idds_server = self.config.search("iddsServer", opt={"default": None})
296 elif isinstance(self.config, dict) and "iddsServer" in self.config:
297 idds_server = self.config["iddsServer"]
298 # if idds_server is None, a default value on the panda relay service
299 # will be used
300 idds_client = pandaclient.idds_api.get_api(
301 idds_utils.json_dumps, idds_host=idds_server, compress=True, manager=True
302 )
303 return idds_client
305 def get_idds_result(self, ret):
306 """Parse the results returned from iDDS.
308 Parameters
309 ----------
310 ret: `tuple` of (`int`, (`bool`, payload)).
311 The first part ret[0] is the status of PanDA relay service.
312 The part of ret[1][0] is the status of iDDS service.
313 The part of ret[1][1] is the returned payload.
314 If ret[1][0] is False, ret[1][1] can be error messages.
316 Returns
317 -------
318 status: `bool`
319 The status of iDDS calls.
320 result: `int` or `list` or `dict`
321 The result returned from iDDS.
322 error: `str`
323 Error messages.
324 """
325 # https://panda-wms.readthedocs.io/en/latest/client/rest_idds.html
326 if not (isinstance(ret, tuple) or isinstance(ret, list)) or ret[0] != 0:
327 # Something wrong with the PanDA relay service.
328 # The call may not be delivered to iDDS.
329 status = False
330 result = None
331 error = "PanDA relay service returns errors: %s" % str(ret)
332 else:
333 if ret[1][0]:
334 status = True
335 result = ret[1][1]
336 error = None
337 if isinstance(result, str) and "Authentication no permission" in result:
338 status = False
339 result = None
340 error = result
341 else:
342 # iDDS returns errors
343 status = False
344 result = None
345 error = "iDDS returns errors: %s" % str(ret[1][1])
346 return status, result, error
348 def restart(self, wms_workflow_id):
349 """Restart a workflow from the point of failure.
351 Parameters
352 ----------
353 wms_workflow_id : `str`
354 Id that can be used by WMS service to identify workflow that
355 need to be restarted.
357 Returns
358 -------
359 wms_id : `str`
360 Id of the restarted workflow. If restart failed, it will be set
361 to `None`.
362 run_name : `str`
363 Name of the restarted workflow. If restart failed, it will be set
364 to `None`.
365 message : `str`
366 A message describing any issues encountered during the restart.
367 If there were no issue, an empty string is returned.
368 """
369 idds_client = self.get_idds_client()
370 ret = idds_client.retry(request_id=wms_workflow_id)
371 _LOG.debug("Restart PanDA workflow returned = %s", ret)
373 status, result, error = self.get_idds_result(ret)
374 if status:
375 _LOG.info("Restarting PanDA workflow %s", result)
376 return wms_workflow_id, None, json.dumps(result)
377 else:
378 return None, None, "Error retry PanDA workflow: %s" % str(error)
380 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False):
381 """Stub for future implementation of the report method
382 Expected to return run information based upon given constraints.
384 Parameters
385 ----------
386 wms_workflow_id : `int` or `str`
387 Limit to specific run based on id.
388 user : `str`
389 Limit results to runs for this user.
390 hist : `float`
391 Limit history search to this many days.
392 pass_thru : `str`
393 Constraints to pass through to HTCondor.
394 is_global : `bool`, optional
395 If set, all available job queues will be queried for job
396 information. Defaults to False which means that only a local job
397 queue will be queried for information.
399 Returns
400 -------
401 runs : `list` [`lsst.ctrl.bps.WmsRunReport`]
402 Information about runs from given job information.
403 message : `str`
404 Extra message for report command to print. This could be
405 pointers to documentation or to WMS specific commands.
406 """
407 raise NotImplementedError
409 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False):
410 """Query WMS for list of submitted WMS workflows/jobs.
412 This should be a quick lookup function to create list of jobs for
413 other functions.
415 Parameters
416 ----------
417 wms_id : `int` or `str`, optional
418 Id or path that can be used by WMS service to look up job.
419 user : `str`, optional
420 User whose submitted jobs should be listed.
421 require_bps : `bool`, optional
422 Whether to require jobs returned in list to be bps-submitted jobs.
423 pass_thru : `str`, optional
424 Information to pass through to WMS.
425 is_global : `bool`, optional
426 If set, all available job queues will be queried for job
427 information. Defaults to False which means that only a local job
428 queue will be queried for information.
430 Only applicable in the context of a WMS using distributed job
431 queues (e.g., HTCondor). A WMS with a centralized job queue
432 (e.g. PanDA) can safely ignore it.
434 Returns
435 -------
436 req_ids : `list` [`Any`]
437 Only job ids to be used by cancel and other functions. Typically
438 this means top-level jobs (i.e., not children jobs).
439 """
440 if wms_id is None and user is not None:
441 raise RuntimeError(
442 "Error to get workflow status report: wms_id is required"
443 " and filtering workflows with 'user' is not supported."
444 )
446 idds_client = self.get_idds_client()
447 ret = idds_client.get_requests(request_id=wms_id)
448 _LOG.debug("PanDA get workflows returned = %s", ret)
450 status, result, error = self.get_idds_result(ret)
451 if status:
452 req_ids = [req["request_id"] for req in result]
453 return req_ids
454 else:
455 raise RuntimeError(f"Error list PanDA workflow requests: {error}")
457 def cancel(self, wms_id, pass_thru=None):
458 """Cancel submitted workflows/jobs.
460 Parameters
461 ----------
462 wms_id : `str`
463 ID or path of job that should be canceled.
464 pass_thru : `str`, optional
465 Information to pass through to WMS.
467 Returns
468 -------
469 deleted : `bool`
470 Whether successful deletion or not. Currently, if any doubt or any
471 individual jobs not deleted, return False.
472 message : `str`
473 Any message from WMS (e.g., error details).
474 """
475 idds_client = self.get_idds_client()
476 ret = idds_client.abort(request_id=wms_id)
477 _LOG.debug("Abort PanDA workflow returned = %s", ret)
479 status, result, error = self.get_idds_result(ret)
480 if status:
481 _LOG.info("Aborting PanDA workflow %s", result)
482 return True, json.dumps(result)
483 else:
484 return False, "Error abort PanDA workflow: %s" % str(error)
486 def ping(self, pass_thru=None):
487 """Checks whether PanDA WMS services are up, reachable,
488 and can authenticate if authentication is required.
490 The services to be checked are those needed for submit, report, cancel,
491 restart, but ping cannot guarantee whether jobs would actually run
492 successfully. Any messages should be sent directly to the logger.
494 Parameters
495 ----------
496 pass_thru : `str`, optional
497 Information to pass through to WMS.
499 Returns
500 -------
501 status : `int`
502 0 for success, non-zero for failure
503 message : `str`
504 Any message from WMS (e.g., error details).
505 """
506 idds_client = self.get_idds_client()
507 ret = idds_client.ping()
508 _LOG.debug("Ping PanDA service returned = %s", ret)
510 status, result, error = self.get_idds_result(ret)
511 if status:
512 if "Status" in result and result["Status"] == "OK":
513 return 0, None
514 else:
515 return -1, "Error ping PanDA service: %s" % str(result)
516 else:
517 return -1, "Error ping PanDA service: %s" % str(error)
519 def run_submission_checks(self):
520 """Checks to run at start if running WMS specific submission steps.
522 Any exception other than NotImplementedError will halt submission.
523 Submit directory may not yet exist when this is called.
524 """
525 for key in ["PANDA_URL"]:
526 if key not in os.environ:
527 raise OSError(f"Missing environment variable {key}")
529 status, message = self.ping()
530 if status != 0:
531 raise RuntimeError(message)
class PandaBpsWmsWorkflow(BaseWmsWorkflow):
    """A single Panda based workflow.

    Parameters
    ----------
    name : `str`
        Unique name for Workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """

    def __init__(self, name, config=None):
        super().__init__(name, config)
        # Populated by from_generic_workflow(); None until then.
        self.generated_tasks = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited from parent class
        wms_workflow = cls(generic_workflow.name, config)
        generator = IDDSWorkflowGenerator(generic_workflow, config)
        wms_workflow.generated_tasks = generator.define_tasks()
        _LOG.debug("panda dag attribs %s", generic_workflow.run_attrs)
        return wms_workflow

    def write(self, out_prefix):
        """Not yet implemented"""