Coverage for python/lsst/ctrl/bps/panda/utils.py: 9%
190 statements
# This file is part of ctrl_bps_panda.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

"""Utilities for bps PanDA plugin."""

__all__ = [
    "copy_files_for_distribution",
    "get_idds_client",
    "get_idds_result",
    "convert_exec_string_to_hex",
    "add_decoder_prefix",
]

import binascii
import concurrent.futures
import logging
import os

import idds.common.utils as idds_utils
import pandaclient.idds_api
from idds.doma.workflowv2.domapandawork import DomaPanDAWork
from idds.workflowv2.workflow import AndCondition
from lsst.ctrl.bps import BpsConfig, GenericWorkflow, GenericWorkflowJob
from lsst.ctrl.bps.panda.cmd_line_embedder import CommandLineEmbedder
from lsst.ctrl.bps.panda.constants import (
    PANDA_DEFAULT_CLOUD,
    PANDA_DEFAULT_CORE_COUNT,
    PANDA_DEFAULT_MAX_ATTEMPTS,
    PANDA_DEFAULT_MAX_JOBS_PER_TASK,
    PANDA_DEFAULT_MAX_WALLTIME,
    PANDA_DEFAULT_PRIORITY,
    PANDA_DEFAULT_PROCESSING_TYPE,
    PANDA_DEFAULT_PROD_SOURCE_LABEL,
    PANDA_DEFAULT_RSS,
    PANDA_DEFAULT_TASK_TYPE,
    PANDA_DEFAULT_VO,
)
from lsst.resources import ResourcePath

_LOG = logging.getLogger(__name__)


def copy_files_for_distribution(files_to_stage, file_distribution_uri, max_copy_workers):
    """Bring locally generated files into the cloud for further
    use on the edge nodes.

    Parameters
    ----------
    files_to_stage : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.
    file_distribution_uri : `str`
        Path on the storage accessible from the edge nodes, including the
        access protocol and bucket name, in which to place the files.
    max_copy_workers : `int`
        Maximum number of workers for copying files.

    Raises
    ------
    RuntimeError
        Raised when there is an error copying files to the distribution
        point.
    """
    files_to_copy = {}

    # In case there are folders, iterate over their contents.
    for local_pfn in files_to_stage.values():
        folder_name = os.path.basename(os.path.normpath(local_pfn))
        if os.path.isdir(local_pfn):
            files_in_folder = ResourcePath.findFileResources([local_pfn])
            for file in files_in_folder:
                file_name = file.basename()
                files_to_copy[file] = ResourcePath(
                    os.path.join(file_distribution_uri, folder_name, file_name)
                )
        else:
            files_to_copy[ResourcePath(local_pfn)] = ResourcePath(
                os.path.join(file_distribution_uri, folder_name)
            )

    copy_executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_copy_workers)
    future_file_copy = []
    for src, trgt in files_to_copy.items():
        # Instantiate the S3 client explicitly here to work around boto3
        # clients not being thread safe:
        # https://stackoverflow.com/questions/52820971/is-boto3-client-thread-safe
        trgt.exists()
        future_file_copy.append(copy_executor.submit(trgt.transfer_from, src, transfer="copy"))

    for future in concurrent.futures.as_completed(future_file_copy):
        if future.result() is not None:
            raise RuntimeError("Error placing files at the distribution point")
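
# Minimal usage sketch (the file mapping and bucket below are hypothetical,
# shown only for illustration):
#     copy_files_for_distribution(
#         {"butlerConfig": "/tmp/submit/butler.yaml"},
#         "s3://bucket/stage",
#         max_copy_workers=10,
#     )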


def get_idds_client(config):
    """Get the iDDS client.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.

    Returns
    -------
    idds_client : `idds.client.clientmanager.ClientManager`
        iDDS ClientManager object.
    """
    idds_server = None
    if isinstance(config, BpsConfig):
        _, idds_server = config.search("iddsServer", opt={"default": None})
    elif isinstance(config, dict) and "iddsServer" in config:
        idds_server = config["iddsServer"]
    # If idds_server is None, a default value from the PanDA relay service
    # will be used.
    idds_client = pandaclient.idds_api.get_api(
        idds_utils.json_dumps, idds_host=idds_server, compress=True, manager=True
    )
    return idds_client
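
# Usage sketch (illustrative; the exact ClientManager method and signature
# depend on the installed iDDS client version):
#     idds_client = get_idds_client(config)
#     ret = idds_client.get_requests(request_id=request_id, with_detail=False)
#     status, result, error = get_idds_result(ret)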


def get_idds_result(ret):
    """Parse the results returned from iDDS.

    Parameters
    ----------
    ret : `tuple` of (`int`, (`bool`, payload))
        The first element, ``ret[0]``, is the status of the PanDA relay
        service. ``ret[1][0]`` is the status of the iDDS service and
        ``ret[1][1]`` is the returned payload. If ``ret[1][0]`` is False,
        ``ret[1][1]`` can contain error messages.

    Returns
    -------
    status : `bool`
        The status of the iDDS call.
    result : `int` or `list` or `dict`
        The result returned from iDDS.
    error : `str`
        Error messages.
    """
    # https://panda-wms.readthedocs.io/en/latest/client/rest_idds.html
    if not isinstance(ret, list | tuple) or ret[0] != 0:
        # Something is wrong with the PanDA relay service.
        # The call may not have been delivered to iDDS.
        status = False
        result = None
        error = f"PanDA relay service returns errors: {str(ret)}"
    else:
        if ret[1][0]:
            status = True
            result = ret[1][1]
            error = None
            if isinstance(result, str) and "Authentication no permission" in result:
                status = False
                error = result
                result = None
        else:
            # iDDS returns errors.
            status = False
            result = None
            error = f"iDDS returns errors: {str(ret[1][1])}"
    return status, result, error
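
# Illustrative shapes only (not output from a live service): a successful
# call might return ret = (0, (True, [{"status": "Finished"}])), which parses
# to (True, [{"status": "Finished"}], None); a relay-level failure such as
# ret = (-1, "connection refused") parses to (False, None, "PanDA relay
# service returns errors: ...").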


def _make_pseudo_filename(config, gwjob):
    """Make the job pseudo filename.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job for which to create the pseudo filename.

    Returns
    -------
    pseudo_filename : `str`
        The pseudo filename for the given job.
    """
    cmd_line_embedder = CommandLineEmbedder(config)
    _, pseudo_filename = cmd_line_embedder.substitute_command_line(
        gwjob.executable.src_uri + " " + gwjob.arguments, gwjob.cmdvals, gwjob.name
    )
    return pseudo_filename
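
# Note: the pseudo filename is the job's command line with per-job values
# substituted in; it is used below as the pseudo "input file" name by which
# iDDS tracks dependencies between jobs (see the dependency_map entries built
# in add_idds_work).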


def _make_doma_work(config, generic_workflow, gwjob, task_count, task_chunk):
    """Make the DOMA Work object for a PanDA task.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow containing the job.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job representing the jobs for the PanDA task.
    task_count : `int`
        Count of PanDA tasks used when making unique names.
    task_chunk : `int`
        Chunk count within a PanDA task used when making unique names.

    Returns
    -------
    work : `idds.doma.workflowv2.domapandawork.DomaPanDAWork`
        The client representation of a PanDA task.
    local_pfns : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.
    """
    _LOG.debug("Using gwjob %s to create new PanDA task (gwjob=%s)", gwjob.name, gwjob)
    cvals = {"curr_cluster": gwjob.label}
    _, site = config.search("computeSite", opt={"curvals": cvals, "required": True})
    cvals["curr_site"] = site
    _, processing_type = config.search(
        "processing_type", opt={"curvals": cvals, "default": PANDA_DEFAULT_PROCESSING_TYPE}
    )
    _, task_type = config.search("taskType", opt={"curvals": cvals, "default": PANDA_DEFAULT_TASK_TYPE})
    _, prod_source_label = config.search(
        "prodSourceLabel", opt={"curvals": cvals, "default": PANDA_DEFAULT_PROD_SOURCE_LABEL}
    )
    _, vo = config.search("vo", opt={"curvals": cvals, "default": PANDA_DEFAULT_VO})

    _, file_distribution_end_point = config.search(
        "fileDistributionEndPoint", opt={"curvals": cvals, "default": None}
    )

    _, file_distribution_end_point_default = config.search(
        "fileDistributionEndPointDefault", opt={"curvals": cvals, "default": None}
    )

    # Assume input files are the same across the task.
    local_pfns = {}
    direct_io_files = set()

    if gwjob.executable.transfer_executable:
        local_pfns["job_executable"] = gwjob.executable.src_uri
        job_executable = f"./{os.path.basename(gwjob.executable.src_uri)}"
    else:
        job_executable = gwjob.executable.src_uri
    cmd_line_embedder = CommandLineEmbedder(config)
    cmd_line, _ = cmd_line_embedder.substitute_command_line(
        job_executable + " " + gwjob.arguments, gwjob.cmdvals, gwjob.name
    )

    for gwfile in generic_workflow.get_job_inputs(gwjob.name, transfer_only=True):
        local_pfns[gwfile.name] = gwfile.src_uri
        if os.path.isdir(gwfile.src_uri):
            # This is needed to make the isdir function work properly
            # in the ButlerURL instance on the edge node.
            local_pfns[gwfile.name] += "/"

        if gwfile.job_access_remote:
            direct_io_files.add(gwfile.name)

    if not direct_io_files:
        direct_io_files.add("cmdlineplaceholder")

    lsst_temp = "LSST_RUN_TEMP_SPACE"
    if lsst_temp in file_distribution_end_point and lsst_temp not in os.environ:
        file_distribution_end_point = file_distribution_end_point_default

    executable = add_decoder_prefix(
        config, cmd_line, file_distribution_end_point, (local_pfns, direct_io_files)
    )
    work = DomaPanDAWork(
        executable=executable,
        primary_input_collection={
            "scope": "pseudo_dataset",
            "name": f"pseudo_input_collection#{str(task_count)}",
        },
        output_collections=[
            {"scope": "pseudo_dataset", "name": f"pseudo_output_collection#{str(task_count)}"}
        ],
        log_collections=[],
        dependency_map=[],
        task_name=f"{generic_workflow.name}_{task_count:02d}_{gwjob.label}_{task_chunk:02d}",
        task_queue=gwjob.queue,
        task_log={
            "destination": "local",
            "value": "log.tgz",
            "dataset": "PandaJob_#{pandaid}/",
            "token": "local",
            "param_type": "log",
            "type": "template",
        },
        encode_command_line=True,
        task_rss=gwjob.request_memory if gwjob.request_memory else PANDA_DEFAULT_RSS,
        task_cloud=gwjob.compute_cloud if gwjob.compute_cloud else PANDA_DEFAULT_CLOUD,
        task_site=site,
        task_priority=int(gwjob.priority) if gwjob.priority else PANDA_DEFAULT_PRIORITY,
        core_count=gwjob.request_cpus if gwjob.request_cpus else PANDA_DEFAULT_CORE_COUNT,
        working_group=gwjob.accounting_group,
        processing_type=processing_type,
        task_type=task_type,
        prodSourceLabel=prod_source_label,
        vo=vo,
        maxattempt=gwjob.number_of_retries if gwjob.number_of_retries else PANDA_DEFAULT_MAX_ATTEMPTS,
        maxwalltime=gwjob.request_walltime if gwjob.request_walltime else PANDA_DEFAULT_MAX_WALLTIME,
    )
    return work, local_pfns


def add_final_idds_work(
    config, generic_workflow, idds_client_workflow, dag_sink_work, task_count, task_chunk
):
    """Add the special final PanDA task to the client workflow.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow in which to find the final job.
    idds_client_workflow : `idds.workflowv2.workflow.Workflow`
        iDDS client representation of the workflow to which the final task
        is added.
    dag_sink_work : `list` [`idds.doma.workflowv2.domapandawork.DomaPanDAWork`]
        The work nodes in the client workflow which have no successors.
    task_count : `int`
        Count of PanDA tasks used when making unique names.
    task_chunk : `int`
        Chunk count within a PanDA task used when making unique names.

    Returns
    -------
    files : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.

    Raises
    ------
    NotImplementedError
        Raised if the final job in the GenericWorkflow is itself a workflow.
    TypeError
        Raised if the final job in the GenericWorkflow is an invalid type.
    """
    files = {}

    # If a final job exists in the generic workflow, create the DAG final job.
    final = generic_workflow.get_final()
    if final:
        if isinstance(final, GenericWorkflow):
            raise NotImplementedError("PanDA plugin does not support a workflow as the final job")

        if not isinstance(final, GenericWorkflowJob):
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

        dag_final_work, files = _make_doma_work(
            config,
            generic_workflow,
            final,
            task_count,
            task_chunk,
        )
        pseudo_filename = "pure_pseudoinput+qgraphNodeId:+qgraphId:"
        dag_final_work.dependency_map.append(
            {"name": pseudo_filename, "submitted": False, "dependencies": []}
        )
        idds_client_workflow.add_work(dag_final_work)
        conditions = []
        for work in dag_sink_work:
            conditions.append(work.is_terminated)
        and_cond = AndCondition(conditions=conditions, true_works=[dag_final_work])
        idds_client_workflow.add_condition(and_cond)
    else:
        _LOG.debug("No final job in GenericWorkflow")
    return files


def convert_exec_string_to_hex(cmdline):
    """Convert the command line into its hex representation.

    This step is currently needed because large blocks of command lines,
    including special symbols, are passed to the pilot/container. Hexing
    guarantees a one-to-one match and bypasses the special-symbol stripping
    performed by the Pilot.

    Parameters
    ----------
    cmdline : `str`
        UTF-8 command line string.

    Returns
    -------
    hex : `str`
        Hex representation of the string.
    """
    return binascii.hexlify(cmdline.encode()).decode("utf-8")
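
# For example (worked out by hand), convert_exec_string_to_hex("ls -l")
# returns "6c73202d6c"; the decoder on the edge node reverses the operation
# with binascii.unhexlify.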


def add_decoder_prefix(config, cmd_line, distribution_path, files):
    """Compose the command line sent to the pilot from the functional part
    (the actual software being run) and the middleware part (container
    invocation).

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration information.
    cmd_line : `str`
        UTF-8 based functional part of the command line.
    distribution_path : `str`
        URI of the path where all files are located for distribution.
    files : `tuple` [`dict` [`str`, `str`], `list` [`str`]]
        File names needed for a task (local copies, direct access).

    Returns
    -------
    decoder_prefix : `str`
        Full command line to be executed on the edge node.
    """
    # Manipulate file paths for placement on the command line.
    files_plc_hldr = {}
    for key, pfn in files[0].items():
        if pfn.endswith("/"):
            files_plc_hldr[key] = os.path.basename(pfn[:-1])
            isdir = True
        else:
            files_plc_hldr[key] = os.path.basename(pfn)
            _, extension = os.path.splitext(pfn)
            isdir = os.path.isdir(pfn) or (key == "butlerConfig" and extension != ".yaml")
        if isdir:
            # This is needed to make the isdir function work properly
            # in the ButlerURL instance on the edge node.
            files_plc_hldr[key] += "/"
        _LOG.debug("files_plc_hldr[%s] = %s", key, files_plc_hldr[key])

    cmdline_hex = convert_exec_string_to_hex(cmd_line)
    _, runner_command = config.search("runnerCommand", opt={"replaceEnvVars": False, "expandEnvVars": False})
    runner_command = runner_command.replace("\n", " ")
    decoder_prefix = runner_command.replace(
        "_cmd_line_",
        str(cmdline_hex)
        + " ${IN/L} "
        + distribution_path
        + " "
        + "+".join(f"{k}:{v}" for k, v in files_plc_hldr.items())
        + " "
        + "+".join(files[1]),
    )
    return decoder_prefix
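
# Illustration (the runnerCommand value below is hypothetical): given a config
# entry such as
#     runnerCommand: "... decoder_script.sh _cmd_line_"
# the "_cmd_line_" token is replaced by
#     "<hex cmdline> ${IN/L} <distribution_path> <key:file pairs joined by +> <direct-io names joined by +>"
# which the decoder on the edge node unpacks back into the original command.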


def add_idds_work(config, generic_workflow, idds_workflow):
    """Convert GenericWorkflowJobs to iDDS work and add them to the iDDS
    workflow.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow containing jobs to convert.
    idds_workflow : `idds.workflowv2.workflow.Workflow`
        iDDS workflow to which the converted jobs should be added.

    Returns
    -------
    files_to_pre_stage : `dict` [`str`, `str`]
        Files that need to be copied to the staging area before submission.
    dag_sink_work : `list` [`idds.doma.workflowv2.domapandawork.DomaPanDAWork`]
        The work nodes in the client workflow which have no successors.
    task_count : `int`
        Number of tasks in the iDDS workflow, used for unique task names.

    Raises
    ------
    RuntimeError
        Raised if the dependency issues cannot be resolved after a second
        pass through the workflow.
    """
    # Limit the number of jobs in a single PanDA task.
    _, max_jobs_per_task = config.search("maxJobsPerTask", opt={"default": PANDA_DEFAULT_MAX_JOBS_PER_TASK})

    files_to_pre_stage = {}
    dag_sink_work = []  # Workflow sink nodes that need to be connected to final task
    job_to_task = {}
    job_to_pseudo_filename = {}
    task_count = 0  # Task number/ID in idds workflow used for unique name

    # To avoid dying due to optimizing number of times through workflow,
    # catch dependency issues to loop through again later.
    jobs_with_dependency_issues = {}

    # Assume jobs with the same label share config values.
    for job_label in generic_workflow.labels:
        _LOG.debug("job_label = %s", job_label)
        # Add each job with a particular label to a corresponding PanDA task.
        # A PanDA task has a limit on the number of jobs, so break into
        # multiple PanDA tasks if needed.
        job_count = 0  # Number of jobs in idds task used for task chunking
        task_chunk = 1  # Task chunk number within job label used for unique name
        work = None

        # Instead of changing code to make chunks up front and round-robin
        # assign jobs to chunks, for now keep chunk creation in the loop
        # but use knowledge of how many chunks there will be to set a better
        # maximum number of jobs in a chunk for more even distribution.
        jobs_by_label = generic_workflow.get_jobs_by_label(job_label)
        num_chunks = -(-len(jobs_by_label) // max_jobs_per_task)  # ceil
        max_jobs_per_task_this_label = -(-len(jobs_by_label) // num_chunks)
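        # Worked example (hypothetical numbers): with 2500 jobs for this label
        # and maxJobsPerTask = 1000, num_chunks = 3 and
        # max_jobs_per_task_this_label = 834, spreading the jobs roughly
        # evenly across chunks (834/834/832) instead of 1000/1000/500.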
        _LOG.debug(
            "For job_label = %s, num jobs = %s, num_chunks = %s, max_jobs = %s",
            job_label,
            len(jobs_by_label),
            num_chunks,
            max_jobs_per_task_this_label,
        )
        for gwjob in jobs_by_label:
            job_count += 1
            if job_count > max_jobs_per_task_this_label:
                job_count = 1
                task_chunk += 1

            if job_count == 1:
                # Create new PanDA task object.
                task_count += 1
                work, files = _make_doma_work(config, generic_workflow, gwjob, task_count, task_chunk)
                files_to_pre_stage.update(files)
                idds_workflow.add_work(work)
                if generic_workflow.out_degree(gwjob.name) == 0:
                    dag_sink_work.append(work)

            pseudo_filename = _make_pseudo_filename(config, gwjob)
            job_to_pseudo_filename[gwjob.name] = pseudo_filename
            job_to_task[gwjob.name] = work.get_work_name()
            deps = []
            missing_deps = False
            for parent_job_name in generic_workflow.predecessors(gwjob.name):
                if parent_job_name not in job_to_task:
                    _LOG.debug("job_to_task.keys() = %s", job_to_task.keys())
                    missing_deps = True
                    break
                else:
                    deps.append(
                        {
                            "task": job_to_task[parent_job_name],
                            "inputname": job_to_pseudo_filename[parent_job_name],
                            "available": False,
                        }
                    )
            if not missing_deps:
                work.dependency_map.append({"name": pseudo_filename, "dependencies": deps})
            else:
                jobs_with_dependency_issues[gwjob.name] = work

    # If there were any issues figuring out dependencies in the earlier loop,
    # try to resolve them now that all jobs have been seen.
    if jobs_with_dependency_issues:
        _LOG.warning("Could not prepare workflow in single pass. Please notify developers.")
        _LOG.info("Trying to recover...")
        for job_name, work in jobs_with_dependency_issues.items():
            deps = []
            for parent_job_name in generic_workflow.predecessors(job_name):
                if parent_job_name not in job_to_task:
                    _LOG.debug("job_to_task.keys() = %s", job_to_task.keys())
                    raise RuntimeError(
                        f"Could not recover from dependency issues ({job_name} missing {parent_job_name})."
                    )
                deps.append(
                    {
                        "task": job_to_task[parent_job_name],
                        "inputname": job_to_pseudo_filename[parent_job_name],
                        "available": False,
                    }
                )
            pseudo_filename = job_to_pseudo_filename[job_name]
            work.dependency_map.append({"name": pseudo_filename, "dependencies": deps})
        _LOG.info("Successfully recovered.")

    return files_to_pre_stage, dag_sink_work, task_count