Coverage for python/lsst/ctrl/bps/panda/utils.py: 9%
192 statements
coverage.py v7.3.0, created at 2023-09-02 09:51 +0000
# This file is part of ctrl_bps_panda.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

"""Utilities for bps PanDA plugin."""
__all__ = [
    "copy_files_for_distribution",
    "get_idds_client",
    "get_idds_result",
    "convert_exec_string_to_hex",
    "add_decoder_prefix",
]

import binascii
import concurrent.futures
import logging
import os
import idds.common.utils as idds_utils
import pandaclient.idds_api
from idds.doma.workflowv2.domapandawork import DomaPanDAWork
from idds.workflowv2.workflow import AndCondition
from lsst.ctrl.bps import BpsConfig, GenericWorkflow, GenericWorkflowJob
from lsst.ctrl.bps.panda.cmd_line_embedder import CommandLineEmbedder
from lsst.ctrl.bps.panda.constants import (
    PANDA_DEFAULT_CLOUD,
    PANDA_DEFAULT_CORE_COUNT,
    PANDA_DEFAULT_MAX_ATTEMPTS,
    PANDA_DEFAULT_MAX_JOBS_PER_TASK,
    PANDA_DEFAULT_MAX_WALLTIME,
    PANDA_DEFAULT_PRIORITY,
    PANDA_DEFAULT_PROCESSING_TYPE,
    PANDA_DEFAULT_PROD_SOURCE_LABEL,
    PANDA_DEFAULT_RSS,
    PANDA_DEFAULT_TASK_TYPE,
    PANDA_DEFAULT_VO,
)
from lsst.resources import ResourcePath

_LOG = logging.getLogger(__name__)


def copy_files_for_distribution(files_to_stage, file_distribution_uri, max_copy_workers):
    """Bring locally generated files to the Cloud for further use
    on the edge nodes.

    Parameters
    ----------
    files_to_stage : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.
    file_distribution_uri : `str`
        Path on the storage accessible from the edge nodes,
        including the access protocol and bucket name where files are placed.
    max_copy_workers : `int`
        Maximum number of workers for copying files.

    Raises
    ------
    RuntimeError
        Raised when there is an error copying files to the distribution point.
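
    Examples
    --------
    A minimal sketch of the expected call shape; the file key, local path,
    and bucket URI below are hypothetical::

        copy_files_for_distribution(
            {"butlerConfig": "/tmp/submit/u/user/run/butler.yaml"},
            "s3://my-bucket/staging",
            max_copy_workers=10,
        )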
79 """
80 files_to_copy = {}
82 # In case there are folders we iterate over its content
83 for local_pfn in files_to_stage.values():
84 folder_name = os.path.basename(os.path.normpath(local_pfn))
85 if os.path.isdir(local_pfn):
86 files_in_folder = ResourcePath.findFileResources([local_pfn])
87 for file in files_in_folder:
88 file_name = file.basename()
89 files_to_copy[file] = ResourcePath(
90 os.path.join(file_distribution_uri, folder_name, file_name)
91 )
92 else:
93 files_to_copy[ResourcePath(local_pfn)] = ResourcePath(
94 os.path.join(file_distribution_uri, folder_name)
95 )

    copy_executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_copy_workers)
    future_file_copy = []
    for src, trgt in files_to_copy.items():
        _LOG.debug("Staging %s to %s", src, trgt)
        # Instantiate the S3 client for the target here, in the main thread,
        # to work around the boto3 client thread-safety issue described in
        # https://stackoverflow.com/questions/52820971/is-boto3-client-thread-safe
        trgt.exists()
        future_file_copy.append(copy_executor.submit(trgt.transfer_from, src, transfer="copy"))

    for future in concurrent.futures.as_completed(future_file_copy):
        if future.result() is not None:
            raise RuntimeError("Error placing files at the distribution point")


def get_idds_client(config):
    """Get the iDDS client.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.

    Returns
    -------
    idds_client : `idds.client.clientmanager.ClientManager`
        iDDS ClientManager object.
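
    Examples
    --------
    A hedged sketch; the server URL below is a placeholder, and when no
    server is given the PanDA relay service default is used::

        idds_client = get_idds_client({"iddsServer": "https://idds.example.org:443/idds"})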
123 """
124 idds_server = None
125 if isinstance(config, BpsConfig):
126 _, idds_server = config.search("iddsServer", opt={"default": None})
127 elif isinstance(config, dict) and "iddsServer" in config:
128 idds_server = config["iddsServer"]
129 # if idds_server is None, a default value on the panda relay service
130 # will be used
131 idds_client = pandaclient.idds_api.get_api(
132 idds_utils.json_dumps, idds_host=idds_server, compress=True, manager=True
133 )
134 return idds_client


def get_idds_result(ret):
    """Parse the results returned from iDDS.

    Parameters
    ----------
    ret : `tuple` of (`int`, (`bool`, payload))
        ret[0] is the status of the PanDA relay service.
        ret[1][0] is the status of the iDDS service.
        ret[1][1] is the returned payload.
        If ret[1][0] is False, ret[1][1] contains the error messages.

    Returns
    -------
    status : `bool`
        The status of the iDDS calls.
    result : `int` or `list` or `dict`
        The result returned from iDDS.
    error : `str`
        Error messages.
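
    Examples
    --------
    A minimal illustration of how the relay-service tuple is unpacked; the
    payload shown is a made-up example::

        >>> get_idds_result((0, (True, {"workload_id": 1234})))
        (True, {'workload_id': 1234}, None)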
156 """
157 # https://panda-wms.readthedocs.io/en/latest/client/rest_idds.html
158 if not isinstance(ret, list | tuple) or ret[0] != 0:
159 # Something wrong with the PanDA relay service.
160 # The call may not be delivered to iDDS.
161 status = False
162 result = None
163 error = f"PanDA relay service returns errors: {str(ret)}"
164 else:
165 if ret[1][0]:
166 status = True
167 result = ret[1][1]
168 error = None
169 if isinstance(result, str) and "Authentication no permission" in result:
170 status = False
171 result = None
172 error = result
173 else:
174 # iDDS returns errors
175 status = False
176 result = None
177 error = f"iDDS returns errors: {str(ret[1][1])}"
178 return status, result, error


def _make_pseudo_filename(config, gwjob):
    """Make the job pseudo filename.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job for which to create the pseudo filename.

    Returns
    -------
    pseudo_filename : `str`
        The pseudo filename for the given job.
    """
    cmd_line_embedder = CommandLineEmbedder(config)
    _, pseudo_filename = cmd_line_embedder.substitute_command_line(
        gwjob.executable.src_uri + " " + gwjob.arguments, gwjob.cmdvals, gwjob.name, []
    )
    return pseudo_filename


def _make_doma_work(config, generic_workflow, gwjob, task_count, task_chunk):
    """Make the DOMA Work object for a PanDA task.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow containing the job.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job representing the jobs for the PanDA task.
    task_count : `int`
        Count of PanDA tasks used when making unique names.
    task_chunk : `int`
        Chunk number of a PanDA task used when making unique names.

    Returns
    -------
    work : `idds.doma.workflowv2.domapandawork.DomaPanDAWork`
        The client representation of a PanDA task.
    local_pfns : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.
    """
    _LOG.debug("Using gwjob %s to create new PanDA task (gwjob=%s)", gwjob.name, gwjob)
    cvals = {"curr_cluster": gwjob.label}
    _, site = config.search("computeSite", opt={"curvals": cvals, "required": True})
    cvals["curr_site"] = site
    _, processing_type = config.search(
        "processing_type", opt={"curvals": cvals, "default": PANDA_DEFAULT_PROCESSING_TYPE}
    )
    _, task_type = config.search("taskType", opt={"curvals": cvals, "default": PANDA_DEFAULT_TASK_TYPE})
    _, prod_source_label = config.search(
        "prodSourceLabel", opt={"curvals": cvals, "default": PANDA_DEFAULT_PROD_SOURCE_LABEL}
    )
    _, vo = config.search("vo", opt={"curvals": cvals, "default": PANDA_DEFAULT_VO})

    _, file_distribution_end_point = config.search(
        "fileDistributionEndPoint", opt={"curvals": cvals, "default": None}
    )

    _, file_distribution_end_point_default = config.search(
        "fileDistributionEndPointDefault", opt={"curvals": cvals, "default": None}
    )

    # Assume input files are the same across the task.
    local_pfns = {}
    direct_io_files = set()

    if gwjob.executable.transfer_executable:
        local_pfns["job_executable"] = gwjob.executable.src_uri
        job_executable = f"./{os.path.basename(gwjob.executable.src_uri)}"
    else:
        job_executable = gwjob.executable.src_uri
    cmd_line_embedder = CommandLineEmbedder(config)
    _LOG.debug(
        "job %s inputs = %s, outputs = %s",
        gwjob.name,
        generic_workflow.get_job_inputs(gwjob.name),
        generic_workflow.get_job_outputs(gwjob.name),
    )

    cmd_line, _ = cmd_line_embedder.substitute_command_line(
        job_executable + " " + gwjob.arguments,
        gwjob.cmdvals,
        gwjob.name,
        generic_workflow.get_job_inputs(gwjob.name) + generic_workflow.get_job_outputs(gwjob.name),
    )

    for gwfile in generic_workflow.get_job_inputs(gwjob.name, transfer_only=True):
        local_pfns[gwfile.name] = gwfile.src_uri
        if os.path.isdir(gwfile.src_uri):
            # A trailing slash is needed for isdir to work properly
            # on the ButlerURL instance on the edge node.
            local_pfns[gwfile.name] += "/"

        if gwfile.job_access_remote:
            direct_io_files.add(gwfile.name)
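
    # If no job files are accessed remotely, add a placeholder name so the
    # direct-access list passed to add_decoder_prefix is never empty
    # (presumed intent; not documented in the original code).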
    if not direct_io_files:
        direct_io_files.add("cmdlineplaceholder")

    lsst_temp = "LSST_RUN_TEMP_SPACE"
    if lsst_temp in file_distribution_end_point and lsst_temp not in os.environ:
        file_distribution_end_point = file_distribution_end_point_default

    executable = add_decoder_prefix(
        config, cmd_line, file_distribution_end_point, (local_pfns, direct_io_files)
    )
    work = DomaPanDAWork(
        executable=executable,
        primary_input_collection={
            "scope": "pseudo_dataset",
            "name": f"pseudo_input_collection#{str(task_count)}",
        },
        output_collections=[
            {"scope": "pseudo_dataset", "name": f"pseudo_output_collection#{str(task_count)}"}
        ],
        log_collections=[],
        dependency_map=[],
        task_name=f"{generic_workflow.name}_{task_count:02d}_{gwjob.label}_{task_chunk:02d}",
        task_queue=gwjob.queue,
        task_log={
            "destination": "local",
            "value": "log.tgz",
            "dataset": "PandaJob_#{pandaid}/",
            "token": "local",
            "param_type": "log",
            "type": "template",
        },
        encode_command_line=True,
        task_rss=gwjob.request_memory if gwjob.request_memory else PANDA_DEFAULT_RSS,
        task_cloud=gwjob.compute_cloud if gwjob.compute_cloud else PANDA_DEFAULT_CLOUD,
        task_site=site,
        task_priority=int(gwjob.priority) if gwjob.priority else PANDA_DEFAULT_PRIORITY,
        core_count=gwjob.request_cpus if gwjob.request_cpus else PANDA_DEFAULT_CORE_COUNT,
        working_group=gwjob.accounting_group,
        processing_type=processing_type,
        task_type=task_type,
        prodSourceLabel=prod_source_label,
        vo=vo,
        maxattempt=gwjob.number_of_retries if gwjob.number_of_retries else PANDA_DEFAULT_MAX_ATTEMPTS,
        maxwalltime=gwjob.request_walltime if gwjob.request_walltime else PANDA_DEFAULT_MAX_WALLTIME,
    )
    return work, local_pfns


def add_final_idds_work(
    config, generic_workflow, idds_client_workflow, dag_sink_work, task_count, task_chunk
):
    """Add the special final PanDA task to the client workflow.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow in which to find the final job.
    idds_client_workflow : `idds.workflowv2.workflow.Workflow`
        iDDS client representation of the workflow to which the final task
        is added.
    dag_sink_work : `list` [`idds.doma.workflowv2.domapandawork.DomaPanDAWork`]
        The work nodes in the client workflow which have no successors.
    task_count : `int`
        Count of PanDA tasks used when making unique names.
    task_chunk : `int`
        Chunk number of a PanDA task used when making unique names.

    Returns
    -------
    files : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.

    Raises
    ------
    NotImplementedError
        Raised if the final job in the GenericWorkflow is itself a workflow.
    TypeError
        Raised if the final job in the GenericWorkflow is of an invalid type.
    """
    files = {}

    # If final job exists in generic workflow, create DAG final job.
    final = generic_workflow.get_final()
    if final:
        if isinstance(final, GenericWorkflow):
            raise NotImplementedError("PanDA plugin does not support a workflow as the final job")

        if not isinstance(final, GenericWorkflowJob):
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

        dag_final_work, files = _make_doma_work(
            config,
            generic_workflow,
            final,
            task_count,
            task_chunk,
        )
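        # The final task has no real upstream quantum-graph node, so give it a
        # single artificial pseudo input (an inference from the placeholder
        # name below, not documented behavior).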
        pseudo_filename = "pure_pseudoinput+qgraphNodeId:+qgraphId:"
        dag_final_work.dependency_map.append(
            {"name": pseudo_filename, "submitted": False, "dependencies": []}
        )
        idds_client_workflow.add_work(dag_final_work)
        conditions = []
        for work in dag_sink_work:
            conditions.append(work.is_terminated)
        and_cond = AndCondition(conditions=conditions, true_works=[dag_final_work])
        idds_client_workflow.add_condition(and_cond)
    else:
        _LOG.debug("No final job in GenericWorkflow")
    return files


def convert_exec_string_to_hex(cmdline):
    """Convert the command line into its hex representation.

    This step is needed because large blocks of command lines, including
    special symbols, are passed to the pilot/container. Hex encoding
    guarantees a one-to-one mapping and bypasses the special-symbol
    stripping performed by the Pilot.

    Parameters
    ----------
    cmdline : `str`
        UTF-8 command line string.

    Returns
    -------
    hex : `str`
        Hex representation of the string.
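
    Examples
    --------
    The encoding is a plain hexlify of the UTF-8 bytes::

        >>> convert_exec_string_to_hex("echo hi")
        '6563686f206869'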
410 """
411 return binascii.hexlify(cmdline.encode()).decode("utf-8")


def add_decoder_prefix(config, cmd_line, distribution_path, files):
    """Compose the command line sent to the pilot from the functional part
    (the actual software being run) and the middleware part (container
    invocation).

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration information.
    cmd_line : `str`
        UTF-8 based functional part of the command line.
    distribution_path : `str`
        URI of the path where all files are located for distribution.
    files : `tuple` [`dict` [`str`, `str`], `list` [`str`]]
        File names needed for a task (copied locally, direct access).

    Returns
    -------
    decoder_prefix : `str`
        Full command line to be executed on the edge node.
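
    Notes
    -----
    A rough sketch of the shape of the result, assuming a hypothetical
    ``runnerCommand`` of ``run_decoder.sh _cmd_line_``::

        run_decoder.sh <hex cmd line> ${IN/L} <distribution_path> \
            key1:file1+key2:file2 direct1+direct2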
433 """
434 # Manipulate file paths for placement on cmdline
435 files_plc_hldr = {}
436 for key, pfn in files[0].items():
437 if pfn.endswith("/"):
438 files_plc_hldr[key] = os.path.basename(pfn[:-1])
439 isdir = True
440 else:
441 files_plc_hldr[key] = os.path.basename(pfn)
442 _, extension = os.path.splitext(pfn)
443 isdir = os.path.isdir(pfn) or (key == "butlerConfig" and extension != "yaml")
444 if isdir:
445 # this is needed to make isdir function working
446 # properly in ButlerURL instance on the egde node
447 files_plc_hldr[key] += "/"
448 _LOG.debug("files_plc_hldr[%s] = %s", key, files_plc_hldr[key])
450 cmdline_hex = convert_exec_string_to_hex(cmd_line)
451 _, runner_command = config.search("runnerCommand", opt={"replaceEnvVars": False, "expandEnvVars": False})
452 runner_command = runner_command.replace("\n", " ")
453 decoder_prefix = runner_command.replace(
454 "_cmd_line_",
455 str(cmdline_hex)
456 + " ${IN/L} "
457 + distribution_path
458 + " "
459 + "+".join(f"{k}:{v}" for k, v in files_plc_hldr.items())
460 + " "
461 + "+".join(files[1]),
462 )
463 return decoder_prefix


def add_idds_work(config, generic_workflow, idds_workflow):
    """Convert GenericWorkflowJobs to iDDS work and add them to the iDDS
    workflow.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow containing jobs to convert.
    idds_workflow : `idds.workflowv2.workflow.Workflow`
        iDDS workflow to which the converted jobs should be added.

    Returns
    -------
    files_to_pre_stage : `dict` [`str`, `str`]
        Files that need to be copied to the staging area before submission.
    dag_sink_work : `list` [`idds.doma.workflowv2.domapandawork.DomaPanDAWork`]
        The work nodes in the client workflow which have no successors.
    task_count : `int`
        Number of tasks in the iDDS workflow used for unique task names.

    Raises
    ------
    RuntimeError
        Raised if dependency issues cannot be resolved after a second pass
        through the workflow.
    """
    # Limit number of jobs in single PanDA task.
    _, max_jobs_per_task = config.search("maxJobsPerTask", opt={"default": PANDA_DEFAULT_MAX_JOBS_PER_TASK})

    files_to_pre_stage = {}
    dag_sink_work = []  # Workflow sink nodes that need to be connected to final task
    job_to_task = {}
    job_to_pseudo_filename = {}
    task_count = 0  # Task number/ID in idds workflow used for unique name

    # To avoid failing while keeping a single optimized pass through the
    # workflow, record dependency issues and resolve them in a later pass.
    jobs_with_dependency_issues = {}

    # Assume jobs with same label share config values.
    for job_label in generic_workflow.labels:
        _LOG.debug("job_label = %s", job_label)
        # Add each job with a particular label to a corresponding PanDA task.
        # A PanDA task has a limit on number of jobs, so break into multiple
        # PanDA tasks if needed.
        job_count = 0  # Number of jobs in idds task used for task chunking
        task_chunk = 1  # Task chunk number within job label used for unique name
        work = None

        # Instead of changing code to make chunks up front and round-robin
        # assign jobs to chunks, for now keeping chunk creation in loop
        # but using knowledge of how many chunks there will be to set better
        # maximum number of jobs in a chunk for more even distribution.
        jobs_by_label = generic_workflow.get_jobs_by_label(job_label)
        num_chunks = -(-len(jobs_by_label) // max_jobs_per_task)  # ceil
        max_jobs_per_task_this_label = -(-len(jobs_by_label) // num_chunks)
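        # For example (hypothetical numbers): 301 jobs with maxJobsPerTask=100
        # gives num_chunks = 4 and max_jobs_per_task_this_label = 76, so chunks
        # of 76+76+76+73 jobs instead of 100+100+100+1.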
        _LOG.debug(
            "For job_label = %s, num jobs = %s, num_chunks = %s, max_jobs = %s",
            job_label,
            len(jobs_by_label),
            num_chunks,
            max_jobs_per_task_this_label,
        )
        for gwjob in jobs_by_label:
            job_count += 1
            if job_count > max_jobs_per_task_this_label:
                job_count = 1
                task_chunk += 1

            if job_count == 1:
                # Create new PanDA task object.
                task_count += 1
                work, files = _make_doma_work(config, generic_workflow, gwjob, task_count, task_chunk)
                files_to_pre_stage.update(files)
                idds_workflow.add_work(work)
                if generic_workflow.out_degree(gwjob.name) == 0:
                    dag_sink_work.append(work)

            pseudo_filename = _make_pseudo_filename(config, gwjob)
            job_to_pseudo_filename[gwjob.name] = pseudo_filename
            job_to_task[gwjob.name] = work.get_work_name()
            deps = []
            missing_deps = False
            for parent_job_name in generic_workflow.predecessors(gwjob.name):
                if parent_job_name not in job_to_task:
                    _LOG.debug("job_to_task.keys() = %s", job_to_task.keys())
                    missing_deps = True
                    break
                else:
                    deps.append(
                        {
                            "task": job_to_task[parent_job_name],
                            "inputname": job_to_pseudo_filename[parent_job_name],
                            "available": False,
                        }
                    )
            if not missing_deps:
                work.dependency_map.append({"name": pseudo_filename, "dependencies": deps})
            else:
                jobs_with_dependency_issues[gwjob.name] = work

    # If there were any issues figuring out dependencies in the earlier loop,
    # try to resolve them now that all tasks exist.
    if jobs_with_dependency_issues:
        _LOG.warning("Could not prepare workflow in single pass. Please notify developers.")
        _LOG.info("Trying to recover...")
        for job_name, work in jobs_with_dependency_issues.items():
            deps = []
            for parent_job_name in generic_workflow.predecessors(job_name):
                if parent_job_name not in job_to_task:
                    _LOG.debug("job_to_task.keys() = %s", job_to_task.keys())
                    raise RuntimeError(
                        f"Could not recover from dependency issues ({job_name} missing {parent_job_name})."
                    )
                deps.append(
                    {
                        "task": job_to_task[parent_job_name],
                        "inputname": job_to_pseudo_filename[parent_job_name],
                        "available": False,
                    }
                )
            pseudo_filename = job_to_pseudo_filename[job_name]
            work.dependency_map.append({"name": pseudo_filename, "dependencies": deps})
        _LOG.info("Successfully recovered.")

    return files_to_pre_stage, dag_sink_work, task_count