Coverage for python/lsst/ctrl/bps/panda/utils.py: 9% of 195 statements
# This file is part of ctrl_bps_panda.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

"""Utilities for bps PanDA plugin."""

__all__ = [
    "copy_files_for_distribution",
    "get_idds_client",
    "get_idds_result",
    "convert_exec_string_to_hex",
    "add_decoder_prefix",
]

import binascii
import concurrent.futures
import logging
import os

import idds.common.utils as idds_utils
import pandaclient.idds_api
from idds.doma.workflowv2.domapandawork import DomaPanDAWork
from idds.workflowv2.workflow import AndCondition
from lsst.ctrl.bps import BpsConfig, GenericWorkflow, GenericWorkflowJob
from lsst.ctrl.bps.panda.cmd_line_embedder import CommandLineEmbedder
from lsst.ctrl.bps.panda.constants import (
    PANDA_DEFAULT_CLOUD,
    PANDA_DEFAULT_CORE_COUNT,
    PANDA_DEFAULT_MAX_ATTEMPTS,
    PANDA_DEFAULT_MAX_JOBS_PER_TASK,
    PANDA_DEFAULT_MAX_WALLTIME,
    PANDA_DEFAULT_PRIORITY,
    PANDA_DEFAULT_PROCESSING_TYPE,
    PANDA_DEFAULT_PROD_SOURCE_LABEL,
    PANDA_DEFAULT_RSS,
    PANDA_DEFAULT_RSS_MAX,
    PANDA_DEFAULT_TASK_TYPE,
    PANDA_DEFAULT_VO,
)
from lsst.resources import ResourcePath

_LOG = logging.getLogger(__name__)


def copy_files_for_distribution(files_to_stage, file_distribution_uri, max_copy_workers):
    """Bring locally generated files into the Cloud for further
    use on the edge nodes.

    Parameters
    ----------
    files_to_stage : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.
    file_distribution_uri : `str`
        Path to the storage accessible from the edge nodes, including the
        access protocol and the bucket name where the files are placed.
    max_copy_workers : `int`
        Maximum number of workers for copying files.

    Raises
    ------
    RuntimeError
        Raised when there is an error copying files to the distribution point.
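
    Examples
    --------
    A minimal usage sketch; the local path and bucket URI below are
    hypothetical and only illustrate the expected argument shapes::

        copy_files_for_distribution(
            {"pipeline_yaml": "/tmp/submit/pipeline.yaml"},
            "s3://some-bucket/staging",
            max_copy_workers=10,
        )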
    """
    files_to_copy = {}

    # In case there are folders, iterate over their contents.
    for local_pfn in files_to_stage.values():
        folder_name = os.path.basename(os.path.normpath(local_pfn))
        if os.path.isdir(local_pfn):
            files_in_folder = ResourcePath.findFileResources([local_pfn])
            for file in files_in_folder:
                file_name = file.basename()
                files_to_copy[file] = ResourcePath(
                    os.path.join(file_distribution_uri, folder_name, file_name)
                )
        else:
            files_to_copy[ResourcePath(local_pfn)] = ResourcePath(
                os.path.join(file_distribution_uri, folder_name)
            )

    copy_executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_copy_workers)
    future_file_copy = []
    for src, trgt in files_to_copy.items():
        _LOG.debug("Staging %s to %s", src, trgt)
        # Instantiate the S3 client explicitly here (via exists()) to work around
        # https://stackoverflow.com/questions/52820971/is-boto3-client-thread-safe
        trgt.exists()
        future_file_copy.append(copy_executor.submit(trgt.transfer_from, src, transfer="copy"))

    for future in concurrent.futures.as_completed(future_file_copy):
        if future.result() is not None:
            raise RuntimeError("Error placing files at the distribution point")


def get_idds_client(config):
    """Get the idds client.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.

    Returns
    -------
    idds_client : `idds.client.clientmanager.ClientManager`
        The iDDS ClientManager object.
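
    Examples
    --------
    A minimal sketch; the server URL below is hypothetical, and when no
    ``iddsServer`` entry is present a default on the PanDA relay service is
    used::

        idds_client = get_idds_client({"iddsServer": "https://idds.example.org:443/idds"})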
    """
    idds_server = None
    if isinstance(config, BpsConfig):
        _, idds_server = config.search("iddsServer", opt={"default": None})
    elif isinstance(config, dict) and "iddsServer" in config:
        idds_server = config["iddsServer"]
    # if idds_server is None, a default value on the panda relay service
    # will be used
    idds_client = pandaclient.idds_api.get_api(
        idds_utils.json_dumps, idds_host=idds_server, compress=True, manager=True
    )
    return idds_client


def get_idds_result(ret):
    """Parse the results returned from iDDS.

    Parameters
    ----------
    ret : `tuple` [`int`, `tuple` [`bool`, payload]]
        The first part, ``ret[0]``, is the status of the PanDA relay service.
        ``ret[1][0]`` is the status of the iDDS service.
        ``ret[1][1]`` is the returned payload.
        If ``ret[1][0]`` is `False`, ``ret[1][1]`` contains error messages.

    Returns
    -------
    status : `bool`
        The status of the iDDS call.
    result : `int` or `list` or `dict` or `None`
        The result returned from iDDS. `None` if in an error state.
    error : `str` or `None`
        Error messages. `None` if there is no error.
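
    Examples
    --------
    Behavior sketched with literal return tuples (the payload values are
    illustrative only)::

        status, result, error = get_idds_result((0, (True, {"workflow": 123})))
        # status is True, result is {"workflow": 123}, error is None

        status, result, error = get_idds_result((-1, "connection refused"))
        # status is False, result is None, error reports the relay failure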
    """
    # https://panda-wms.readthedocs.io/en/latest/client/rest_idds.html
    if not isinstance(ret, list | tuple) or ret[0] != 0:
        # Something wrong with the PanDA relay service.
        # The call may not be delivered to iDDS.
        status = False
        result = None
        error = f"PanDA relay service returns errors: {str(ret)}"
    else:
        if ret[1][0]:
            status = True
            result = ret[1][1]
            error = None
            if isinstance(result, str) and "Authentication no permission" in result:
                status = False
                error = result
                result = None
        else:
            # iDDS returns errors
            status = False
            result = None
            error = f"iDDS returns errors: {str(ret[1][1])}"
    return status, result, error


def _make_pseudo_filename(config, gwjob):
    """Make the job pseudo filename.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job for which to create the pseudo filename.

    Returns
    -------
    pseudo_filename : `str`
        The pseudo filename for the given job.
    """
    cmd_line_embedder = CommandLineEmbedder(config)
    _, pseudo_filename = cmd_line_embedder.substitute_command_line(
        gwjob.executable.src_uri + " " + gwjob.arguments, gwjob.cmdvals, gwjob.name, []
    )
    return pseudo_filename


def _make_doma_work(config, generic_workflow, gwjob, task_count, task_chunk):
    """Make the DOMA Work object for a PanDA task.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow containing the job.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job representing the jobs for the PanDA task.
    task_count : `int`
        Count of PanDA tasks used when making unique names.
    task_chunk : `int`
        Chunk number of the PanDA task used when making unique names.

    Returns
    -------
    work : `idds.doma.workflowv2.domapandawork.DomaPanDAWork`
        The client representation of a PanDA task.
    local_pfns : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.
    """
    _LOG.debug("Using gwjob %s to create new PanDA task (gwjob=%s)", gwjob.name, gwjob)
    cvals = {"curr_cluster": gwjob.label}
    _, site = config.search("computeSite", opt={"curvals": cvals, "required": True})
    cvals["curr_site"] = site
    _, processing_type = config.search(
        "processing_type", opt={"curvals": cvals, "default": PANDA_DEFAULT_PROCESSING_TYPE}
    )
    _, task_type = config.search("taskType", opt={"curvals": cvals, "default": PANDA_DEFAULT_TASK_TYPE})
    _, prod_source_label = config.search(
        "prodSourceLabel", opt={"curvals": cvals, "default": PANDA_DEFAULT_PROD_SOURCE_LABEL}
    )
    _, vo = config.search("vo", opt={"curvals": cvals, "default": PANDA_DEFAULT_VO})

    _, file_distribution_end_point = config.search(
        "fileDistributionEndPoint", opt={"curvals": cvals, "default": None}
    )

    _, file_distribution_end_point_default = config.search(
        "fileDistributionEndPointDefault", opt={"curvals": cvals, "default": None}
    )

    task_rss = gwjob.request_memory if gwjob.request_memory else PANDA_DEFAULT_RSS
    task_rss_retry_step = task_rss * gwjob.memory_multiplier if gwjob.memory_multiplier else 0
    task_rss_retry_offset = 0 if task_rss_retry_step else task_rss
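    # Worked example of the memory-retry arithmetic above (numbers are
    # hypothetical): request_memory=4096 with memory_multiplier=2 gives
    # task_rss=4096, task_rss_retry_step=8192 and task_rss_retry_offset=0;
    # without a multiplier the step is 0 and the offset falls back to
    # task_rss.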

    # Assume input files are the same across the task.
    local_pfns = {}
    direct_io_files = set()

    if gwjob.executable.transfer_executable:
        local_pfns["job_executable"] = gwjob.executable.src_uri
        job_executable = f"./{os.path.basename(gwjob.executable.src_uri)}"
    else:
        job_executable = gwjob.executable.src_uri
    cmd_line_embedder = CommandLineEmbedder(config)
    _LOG.debug(
        "job %s inputs = %s, outputs = %s",
        gwjob.name,
        generic_workflow.get_job_inputs(gwjob.name),
        generic_workflow.get_job_outputs(gwjob.name),
    )

    cmd_line, _ = cmd_line_embedder.substitute_command_line(
        job_executable + " " + gwjob.arguments,
        gwjob.cmdvals,
        gwjob.name,
        generic_workflow.get_job_inputs(gwjob.name) + generic_workflow.get_job_outputs(gwjob.name),
    )

    for gwfile in generic_workflow.get_job_inputs(gwjob.name, transfer_only=True):
        local_pfns[gwfile.name] = gwfile.src_uri
        if os.path.isdir(gwfile.src_uri):
            # A trailing slash is needed so that isdir works properly in the
            # ButlerURL instance on the edge node.
            local_pfns[gwfile.name] += "/"

        if gwfile.job_access_remote:
            direct_io_files.add(gwfile.name)

    if not direct_io_files:
        direct_io_files.add("cmdlineplaceholder")

    lsst_temp = "LSST_RUN_TEMP_SPACE"
    if lsst_temp in file_distribution_end_point and lsst_temp not in os.environ:
        file_distribution_end_point = file_distribution_end_point_default

    executable = add_decoder_prefix(
        config, cmd_line, file_distribution_end_point, (local_pfns, direct_io_files)
    )
    work = DomaPanDAWork(
        executable=executable,
        primary_input_collection={
            "scope": "pseudo_dataset",
            "name": f"pseudo_input_collection#{str(task_count)}",
        },
        output_collections=[
            {"scope": "pseudo_dataset", "name": f"pseudo_output_collection#{str(task_count)}"}
        ],
        log_collections=[],
        dependency_map=[],
        task_name=f"{generic_workflow.name}_{task_count:02d}_{gwjob.label}_{task_chunk:02d}",
        task_queue=gwjob.queue,
        task_log={
            "destination": "local",
            "value": "log.tgz",
            "dataset": "PandaJob_#{pandaid}/",
            "token": "local",
            "param_type": "log",
            "type": "template",
        },
        encode_command_line=True,
        task_rss=task_rss,
        task_rss_retry_offset=task_rss_retry_offset,
        task_rss_retry_step=task_rss_retry_step,
        task_rss_max=gwjob.request_memory_max if gwjob.request_memory_max else PANDA_DEFAULT_RSS_MAX,
        task_cloud=gwjob.compute_cloud if gwjob.compute_cloud else PANDA_DEFAULT_CLOUD,
        task_site=site,
        task_priority=int(gwjob.priority) if gwjob.priority else PANDA_DEFAULT_PRIORITY,
        core_count=gwjob.request_cpus if gwjob.request_cpus else PANDA_DEFAULT_CORE_COUNT,
        working_group=gwjob.accounting_group,
        processing_type=processing_type,
        task_type=task_type,
        prodSourceLabel=prod_source_label,
        vo=vo,
        maxattempt=gwjob.number_of_retries if gwjob.number_of_retries else PANDA_DEFAULT_MAX_ATTEMPTS,
        maxwalltime=gwjob.request_walltime if gwjob.request_walltime else PANDA_DEFAULT_MAX_WALLTIME,
    )
    return work, local_pfns


def add_final_idds_work(
    config, generic_workflow, idds_client_workflow, dag_sink_work, task_count, task_chunk
):
    """Add the special final PanDA task to the client workflow.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow in which to find the final job.
    idds_client_workflow : `idds.workflowv2.workflow.Workflow`
        The iDDS client representation of the workflow to which the final task
        is added.
    dag_sink_work : `list` [`idds.doma.workflowv2.domapandawork.DomaPanDAWork`]
        The work nodes in the client workflow which have no successors.
    task_count : `int`
        Count of PanDA tasks used when making unique names.
    task_chunk : `int`
        Chunk number of the PanDA task used when making unique names.

    Returns
    -------
    files : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.

    Raises
    ------
    NotImplementedError
        Raised if the final job in the GenericWorkflow is itself a workflow.
    TypeError
        Raised if the final job in the GenericWorkflow is of an invalid type.
    """
    files = {}

    # If final job exists in generic workflow, create DAG final job
    final = generic_workflow.get_final()
    if final:
        if isinstance(final, GenericWorkflow):
            raise NotImplementedError("PanDA plugin does not support a workflow as the final job")

        if not isinstance(final, GenericWorkflowJob):
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

        dag_final_work, files = _make_doma_work(
            config,
            generic_workflow,
            final,
            task_count,
            task_chunk,
        )
        pseudo_filename = "pure_pseudoinput+qgraphNodeId:+qgraphId:"
        dag_final_work.dependency_map.append(
            {"name": pseudo_filename, "submitted": False, "dependencies": []}
        )
        idds_client_workflow.add_work(dag_final_work)
        # Run the final task only after all of the sink tasks have terminated.
        conditions = []
        for work in dag_sink_work:
            conditions.append(work.is_terminated)
        and_cond = AndCondition(conditions=conditions, true_works=[dag_final_work])
        idds_client_workflow.add_condition(and_cond)
    else:
        _LOG.debug("No final job in GenericWorkflow")
    return files


def convert_exec_string_to_hex(cmdline):
    """Convert the command line into hex representation.

    This step is currently needed because large blocks of command line text,
    including special symbols, are passed to the pilot/container. Hex encoding
    guarantees a one-to-one match and bypasses the special-symbol stripping
    performed by the Pilot.

    Parameters
    ----------
    cmdline : `str`
        UTF-8 command line string.

    Returns
    -------
    hex : `str`
        Hex representation of string.
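
    Examples
    --------
    The encoding is plain hexadecimal over the UTF-8 bytes of the string, so
    it can be reversed with `binascii.unhexlify`:

    >>> convert_exec_string_to_hex("echo hello")
    '6563686f2068656c6c6f'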
    """
    return binascii.hexlify(cmdline.encode()).decode("utf-8")


def add_decoder_prefix(config, cmd_line, distribution_path, files):
    """Compose the command line sent to the pilot from the functional part
    (the actual SW being run) and the middleware part (container invocation).

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration information.
    cmd_line : `str`
        UTF-8 based functional part of the command line.
    distribution_path : `str`
        URI of the path where all files are located for distribution.
    files : `tuple` [`dict` [`str`, `str`], `list` [`str`]]
        File names needed for a task (copied locally, direct access).

    Returns
    -------
    decoder_prefix : `str`
        Full command line to be executed on the edge node.
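
    Examples
    --------
    A structural sketch only; the ``runnerCommand`` value comes from the BPS
    configuration and the arguments below are hypothetical::

        # With runnerCommand = "bash decoder.sh _cmd_line_", the placeholder
        # _cmd_line_ is replaced by the hex-encoded command line followed by
        # ${IN/L}, the distribution path, the "copied local" files as
        # key:basename pairs joined by "+", and the direct-access file names
        # joined by "+".
        prefix = add_decoder_prefix(
            config,
            "echo hello",
            "s3://some-bucket/staging",
            ({"pipeline_yaml": "/tmp/submit/pipeline.yaml"}, ["jobOutput"]),
        )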
    """
    # Manipulate file paths for placement on cmdline
    files_plc_hldr = {}
    for key, pfn in files[0].items():
        if pfn.endswith("/"):
            files_plc_hldr[key] = os.path.basename(pfn[:-1])
            isdir = True
        else:
            files_plc_hldr[key] = os.path.basename(pfn)
            _, extension = os.path.splitext(pfn)
            isdir = os.path.isdir(pfn) or (key == "butlerConfig" and extension != ".yaml")
        if isdir:
            # A trailing slash is needed so that isdir works properly in the
            # ButlerURL instance on the edge node.
            files_plc_hldr[key] += "/"
        _LOG.debug("files_plc_hldr[%s] = %s", key, files_plc_hldr[key])

    cmdline_hex = convert_exec_string_to_hex(cmd_line)
    _, runner_command = config.search("runnerCommand", opt={"replaceEnvVars": False, "expandEnvVars": False})
    runner_command = runner_command.replace("\n", " ")
    decoder_prefix = runner_command.replace(
        "_cmd_line_",
        str(cmdline_hex)
        + " ${IN/L} "
        + distribution_path
        + " "
        + "+".join(f"{k}:{v}" for k, v in files_plc_hldr.items())
        + " "
        + "+".join(files[1]),
    )
    return decoder_prefix


def add_idds_work(config, generic_workflow, idds_workflow):
    """Convert GenericWorkflowJobs to iDDS work and add them to the iDDS
    workflow.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow containing jobs to convert.
    idds_workflow : `idds.workflowv2.workflow.Workflow`
        The iDDS workflow to which the converted jobs should be added.

    Returns
    -------
    files_to_pre_stage : `dict` [`str`, `str`]
        Files that need to be copied to the staging area before submission.
    dag_sink_work : `list` [`idds.doma.workflowv2.domapandawork.DomaPanDAWork`]
        The work nodes in the client workflow which have no successors.
    task_count : `int`
        Number of tasks in the iDDS workflow, used for unique task names.

    Raises
    ------
    RuntimeError
        Raised if dependency issues cannot be resolved after a second pass
        through the workflow.
    """
    # Limit number of jobs in single PanDA task
    _, max_jobs_per_task = config.search("maxJobsPerTask", opt={"default": PANDA_DEFAULT_MAX_JOBS_PER_TASK})

    files_to_pre_stage = {}
    dag_sink_work = []  # Workflow sink nodes that need to be connected to the final task
    job_to_task = {}
    job_to_pseudo_filename = {}
    task_count = 0  # Task number/ID in the iDDS workflow, used for unique names

    # To avoid failing while keeping the number of passes through the
    # workflow low, catch dependency issues and loop through them again later.
    jobs_with_dependency_issues = {}

    # Assume jobs with the same label share config values
    for job_label in generic_workflow.labels:
        _LOG.debug("job_label = %s", job_label)
        # Add each job with a particular label to a corresponding PanDA task.
        # A PanDA task has a limit on the number of jobs, so break into
        # multiple PanDA tasks if needed.
        job_count = 0  # Number of jobs in the iDDS task, used for task chunking
        task_chunk = 1  # Task chunk number within the job label, used for unique names
        work = None

        # Instead of changing code to make chunks up front and round-robin
        # assign jobs to chunks, for now keep chunk creation in the loop
        # but use knowledge of how many chunks there will be to set a better
        # maximum number of jobs in a chunk for a more even distribution.
        jobs_by_label = generic_workflow.get_jobs_by_label(job_label)
        num_chunks = -(-len(jobs_by_label) // max_jobs_per_task)  # ceil
        max_jobs_per_task_this_label = -(-len(jobs_by_label) // num_chunks)
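        # Worked example of the ceiling-division chunking above (numbers are
        # hypothetical): 25 jobs with max_jobs_per_task = 10 gives
        # num_chunks = -(-25 // 10) = 3 and
        # max_jobs_per_task_this_label = -(-25 // 3) = 9, so the chunks hold
        # 9 + 9 + 7 jobs instead of 10 + 10 + 5.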
        _LOG.debug(
            "For job_label = %s, num jobs = %s, num_chunks = %s, max_jobs = %s",
            job_label,
            len(jobs_by_label),
            num_chunks,
            max_jobs_per_task_this_label,
        )
        for gwjob in jobs_by_label:
            job_count += 1
            if job_count > max_jobs_per_task_this_label:
                job_count = 1
                task_chunk += 1

            if job_count == 1:
                # Create new PanDA task object
                task_count += 1
                work, files = _make_doma_work(config, generic_workflow, gwjob, task_count, task_chunk)
                files_to_pre_stage.update(files)
                idds_workflow.add_work(work)
                if generic_workflow.out_degree(gwjob.name) == 0:
                    dag_sink_work.append(work)

            pseudo_filename = _make_pseudo_filename(config, gwjob)
            job_to_pseudo_filename[gwjob.name] = pseudo_filename
            job_to_task[gwjob.name] = work.get_work_name()
            deps = []
            missing_deps = False
            for parent_job_name in generic_workflow.predecessors(gwjob.name):
                if parent_job_name not in job_to_task:
                    _LOG.debug("job_to_task.keys() = %s", job_to_task.keys())
                    missing_deps = True
                    break
                else:
                    deps.append(
                        {
                            "task": job_to_task[parent_job_name],
                            "inputname": job_to_pseudo_filename[parent_job_name],
                            "available": False,
                        }
                    )
            if not missing_deps:
                work.dependency_map.append({"name": pseudo_filename, "dependencies": deps})
            else:
                jobs_with_dependency_issues[gwjob.name] = work

    # If there were any issues figuring out dependencies through earlier loop
    if jobs_with_dependency_issues:
        _LOG.warning("Could not prepare workflow in single pass. Please notify developers.")
        _LOG.info("Trying to recover...")
        for job_name, work in jobs_with_dependency_issues.items():
            deps = []
            for parent_job_name in generic_workflow.predecessors(job_name):
                if parent_job_name not in job_to_task:
                    _LOG.debug("job_to_task.keys() = %s", job_to_task.keys())
                    raise RuntimeError(
                        f"Could not recover from dependency issues ({job_name} missing {parent_job_name})."
                    )
                deps.append(
                    {
                        "task": job_to_task[parent_job_name],
                        "inputname": job_to_pseudo_filename[parent_job_name],
                        "available": False,
                    }
                )
            pseudo_filename = job_to_pseudo_filename[job_name]
            work.dependency_map.append({"name": pseudo_filename, "dependencies": deps})
        _LOG.info("Successfully recovered.")

    return files_to_pre_stage, dag_sink_work, task_count