Coverage for python/lsst/ctrl/bps/panda/utils.py: 9% (192 statements)
# This file is part of ctrl_bps_panda.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

"""Utilities for bps PanDA plugin."""

__all__ = [
    "copy_files_for_distribution",
    "get_idds_client",
    "get_idds_result",
    "convert_exec_string_to_hex",
    "add_decoder_prefix",
]

import binascii
import concurrent.futures
import logging
import os

import idds.common.utils as idds_utils
import pandaclient.idds_api
from idds.doma.workflowv2.domapandawork import DomaPanDAWork
from idds.workflowv2.workflow import AndCondition
from lsst.ctrl.bps import BpsConfig, GenericWorkflow, GenericWorkflowJob
from lsst.ctrl.bps.panda.cmd_line_embedder import CommandLineEmbedder
from lsst.ctrl.bps.panda.constants import (
    PANDA_DEFAULT_CLOUD,
    PANDA_DEFAULT_CORE_COUNT,
    PANDA_DEFAULT_MAX_ATTEMPTS,
    PANDA_DEFAULT_MAX_JOBS_PER_TASK,
    PANDA_DEFAULT_MAX_WALLTIME,
    PANDA_DEFAULT_PRIORITY,
    PANDA_DEFAULT_PROCESSING_TYPE,
    PANDA_DEFAULT_PROD_SOURCE_LABEL,
    PANDA_DEFAULT_RSS,
    PANDA_DEFAULT_TASK_TYPE,
    PANDA_DEFAULT_VO,
)
from lsst.resources import ResourcePath

_LOG = logging.getLogger(__name__)


def copy_files_for_distribution(files_to_stage, file_distribution_uri, max_copy_workers):
    """Bring locally generated files into cloud storage for further use
    on the edge nodes.

    Parameters
    ----------
    files_to_stage : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.
    file_distribution_uri : `str`
        Path on the storage accessible from the edge nodes, including the
        access protocol and bucket name, where the files are placed.
    max_copy_workers : `int`
        Maximum number of workers for copying files.

    Raises
    ------
    RuntimeError
        Raised when there is an error copying files to the distribution
        point.
    """
    files_to_copy = {}

    # In case there are folders, we iterate over their contents.
    for local_pfn in files_to_stage.values():
        folder_name = os.path.basename(os.path.normpath(local_pfn))
        if os.path.isdir(local_pfn):
            files_in_folder = ResourcePath.findFileResources([local_pfn])
            for file in files_in_folder:
                file_name = file.basename()
                files_to_copy[file] = ResourcePath(
                    os.path.join(file_distribution_uri, folder_name, file_name)
                )
        else:
            files_to_copy[ResourcePath(local_pfn)] = ResourcePath(
                os.path.join(file_distribution_uri, folder_name)
            )

    copy_executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_copy_workers)
    future_file_copy = []
    for src, trgt in files_to_copy.items():
        _LOG.debug("Staging %s to %s", src, trgt)
        # Instantiate the S3 client explicitly here (via exists()) to work around
        # https://stackoverflow.com/questions/52820971/is-boto3-client-thread-safe
        trgt.exists()
        future_file_copy.append(copy_executor.submit(trgt.transfer_from, src, transfer="copy"))

    for future in concurrent.futures.as_completed(future_file_copy):
        if future.result() is not None:
            raise RuntimeError("Error placing files at the distribution point")


def get_idds_client(config):
    """Get the iDDS client.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.

    Returns
    -------
    idds_client : `idds.client.clientmanager.ClientManager`
        iDDS ClientManager object.
    """
    idds_server = None
    if isinstance(config, BpsConfig):
        _, idds_server = config.search("iddsServer", opt={"default": None})
    elif isinstance(config, dict) and "iddsServer" in config:
        idds_server = config["iddsServer"]
    # If idds_server is None, a default value on the PanDA relay service
    # will be used.
    idds_client = pandaclient.idds_api.get_api(
        idds_utils.json_dumps, idds_host=idds_server, compress=True, manager=True
    )
    return idds_client


def get_idds_result(ret):
    """Parse the results returned from iDDS.

    Parameters
    ----------
    ret : `tuple` of (`int`, (`bool`, payload))
        ret[0] is the status of the PanDA relay service.
        ret[1][0] is the status of the iDDS service.
        ret[1][1] is the returned payload.
        If ret[1][0] is False, ret[1][1] can be error messages.

    Returns
    -------
    status : `bool`
        The status of the iDDS calls.
    result : `int` or `list` or `dict`
        The result returned from iDDS.
    error : `str`
        Error messages.
    """
    # https://panda-wms.readthedocs.io/en/latest/client/rest_idds.html
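    # Illustrative shapes: a successful call looks like (0, (True, payload));
    # a relay-service failure has a nonzero ret[0]; an iDDS-level failure
    # looks like (0, (False, "error message")).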
    if not isinstance(ret, list | tuple) or ret[0] != 0:
        # Something wrong with the PanDA relay service.
        # The call may not be delivered to iDDS.
        status = False
        result = None
        error = f"PanDA relay service returns errors: {str(ret)}"
    else:
        if ret[1][0]:
            status = True
            result = ret[1][1]
            error = None
            if isinstance(result, str) and "Authentication no permission" in result:
                status = False
                error = result
                result = None
        else:
            # iDDS returns errors
            status = False
            result = None
            error = f"iDDS returns errors: {str(ret[1][1])}"
    return status, result, error


def _make_pseudo_filename(config, gwjob):
    """Make the job pseudo filename.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job for which to create the pseudo filename.

    Returns
    -------
    pseudo_filename : `str`
        The pseudo filename for the given job.
    """
    cmd_line_embedder = CommandLineEmbedder(config)
    _, pseudo_filename = cmd_line_embedder.substitute_command_line(
        gwjob.executable.src_uri + " " + gwjob.arguments, gwjob.cmdvals, gwjob.name, []
    )
    return pseudo_filename


def _make_doma_work(config, generic_workflow, gwjob, task_count, task_chunk):
    """Make the DOMA Work object for a PanDA task.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow containing the job.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job representing the jobs for the PanDA task.
    task_count : `int`
        Count of PanDA tasks used when making unique names.
    task_chunk : `int`
        Count of chunks of a PanDA task used when making unique names.

    Returns
    -------
    work : `idds.doma.workflowv2.domapandawork.DomaPanDAWork`
        The client representation of a PanDA task.
    local_pfns : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.
    """
    _LOG.debug("Using gwjob %s to create new PanDA task (gwjob=%s)", gwjob.name, gwjob)
    cvals = {"curr_cluster": gwjob.label}
    _, site = config.search("computeSite", opt={"curvals": cvals, "required": True})
    cvals["curr_site"] = site
    _, processing_type = config.search(
        "processing_type", opt={"curvals": cvals, "default": PANDA_DEFAULT_PROCESSING_TYPE}
    )
    _, task_type = config.search("taskType", opt={"curvals": cvals, "default": PANDA_DEFAULT_TASK_TYPE})
    _, prod_source_label = config.search(
        "prodSourceLabel", opt={"curvals": cvals, "default": PANDA_DEFAULT_PROD_SOURCE_LABEL}
    )
    _, vo = config.search("vo", opt={"curvals": cvals, "default": PANDA_DEFAULT_VO})

    _, file_distribution_end_point = config.search(
        "fileDistributionEndPoint", opt={"curvals": cvals, "default": None}
    )

    _, file_distribution_end_point_default = config.search(
        "fileDistributionEndPointDefault", opt={"curvals": cvals, "default": None}
    )

    # Assume input files are the same across the task.
    local_pfns = {}
    direct_io_files = set()
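
    # Stage the executable itself when it must be transferred; otherwise run
    # it from its original location.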
    if gwjob.executable.transfer_executable:
        local_pfns["job_executable"] = gwjob.executable.src_uri
        job_executable = f"./{os.path.basename(gwjob.executable.src_uri)}"
    else:
        job_executable = gwjob.executable.src_uri
    cmd_line_embedder = CommandLineEmbedder(config)
    _LOG.debug(
        "job %s inputs = %s, outputs = %s",
        gwjob.name,
        generic_workflow.get_job_inputs(gwjob.name),
        generic_workflow.get_job_outputs(gwjob.name),
    )

    cmd_line, _ = cmd_line_embedder.substitute_command_line(
        job_executable + " " + gwjob.arguments,
        gwjob.cmdvals,
        gwjob.name,
        generic_workflow.get_job_inputs(gwjob.name) + generic_workflow.get_job_outputs(gwjob.name),
    )

    for gwfile in generic_workflow.get_job_inputs(gwjob.name, transfer_only=True):
        local_pfns[gwfile.name] = gwfile.src_uri
        if os.path.isdir(gwfile.src_uri):
            # The trailing slash is needed to make the isdir function work
            # properly in the ButlerURL instance on the edge node.
            local_pfns[gwfile.name] += "/"

        if gwfile.job_access_remote:
            direct_io_files.add(gwfile.name)

    if not direct_io_files:
        direct_io_files.add("cmdlineplaceholder")
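
    # Fall back to the default distribution endpoint when the configured one
    # references LSST_RUN_TEMP_SPACE but that environment variable is not set.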
    lsst_temp = "LSST_RUN_TEMP_SPACE"
    if lsst_temp in file_distribution_end_point and lsst_temp not in os.environ:
        file_distribution_end_point = file_distribution_end_point_default

    executable = add_decoder_prefix(
        config, cmd_line, file_distribution_end_point, (local_pfns, direct_io_files)
    )
    work = DomaPanDAWork(
        executable=executable,
        primary_input_collection={
            "scope": "pseudo_dataset",
            "name": f"pseudo_input_collection#{str(task_count)}",
        },
        output_collections=[
            {"scope": "pseudo_dataset", "name": f"pseudo_output_collection#{str(task_count)}"}
        ],
        log_collections=[],
        dependency_map=[],
        task_name=f"{generic_workflow.name}_{task_count:02d}_{gwjob.label}_{task_chunk:02d}",
        task_queue=gwjob.queue,
        task_log={
            "destination": "local",
            "value": "log.tgz",
            "dataset": "PandaJob_#{pandaid}/",
            "token": "local",
            "param_type": "log",
            "type": "template",
        },
        encode_command_line=True,
        task_rss=gwjob.request_memory if gwjob.request_memory else PANDA_DEFAULT_RSS,
        task_cloud=gwjob.compute_cloud if gwjob.compute_cloud else PANDA_DEFAULT_CLOUD,
        task_site=site,
        task_priority=int(gwjob.priority) if gwjob.priority else PANDA_DEFAULT_PRIORITY,
        core_count=gwjob.request_cpus if gwjob.request_cpus else PANDA_DEFAULT_CORE_COUNT,
        working_group=gwjob.accounting_group,
        processing_type=processing_type,
        task_type=task_type,
        prodSourceLabel=prod_source_label,
        vo=vo,
        maxattempt=gwjob.number_of_retries if gwjob.number_of_retries else PANDA_DEFAULT_MAX_ATTEMPTS,
        maxwalltime=gwjob.request_walltime if gwjob.request_walltime else PANDA_DEFAULT_MAX_WALLTIME,
    )
    return work, local_pfns


def add_final_idds_work(
    config, generic_workflow, idds_client_workflow, dag_sink_work, task_count, task_chunk
):
    """Add the special final PanDA task to the client workflow.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow in which to find the final job.
    idds_client_workflow : `idds.workflowv2.workflow.Workflow`
        iDDS client representation of the workflow to which the final task
        is added.
    dag_sink_work : `list` [`idds.doma.workflowv2.domapandawork.DomaPanDAWork`]
        The work nodes in the client workflow which have no successors.
    task_count : `int`
        Count of PanDA tasks used when making unique names.
    task_chunk : `int`
        Count of chunks of a PanDA task used when making unique names.

    Returns
    -------
    files : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.

    Raises
    ------
    NotImplementedError
        Raised if the final job in the GenericWorkflow is itself a workflow.
    TypeError
        Raised if the final job in the GenericWorkflow is of an invalid type.
    """
    files = {}

    # If a final job exists in the generic workflow, create the DAG final job.
    final = generic_workflow.get_final()
    if final:
        if isinstance(final, GenericWorkflow):
            raise NotImplementedError("PanDA plugin does not support a workflow as the final job")

        if not isinstance(final, GenericWorkflowJob):
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

        dag_final_work, files = _make_doma_work(
            config,
            generic_workflow,
            final,
            task_count,
            task_chunk,
        )
        pseudo_filename = "pure_pseudoinput+qgraphNodeId:+qgraphId:"
        dag_final_work.dependency_map.append(
            {"name": pseudo_filename, "submitted": False, "dependencies": []}
        )
        idds_client_workflow.add_work(dag_final_work)
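        # Run the final task only after every workflow sink task has
        # terminated.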
        conditions = []
        for work in dag_sink_work:
            conditions.append(work.is_terminated)
        and_cond = AndCondition(conditions=conditions, true_works=[dag_final_work])
        idds_client_workflow.add_condition(and_cond)
    else:
        _LOG.debug("No final job in GenericWorkflow")
    return files


def convert_exec_string_to_hex(cmdline):
    """Convert the command line into a hex representation.

    This step is currently needed because large blocks of command lines,
    including special symbols, are passed to the pilot/container. Hexing
    ensures a one-to-one match and bypasses the special-symbol stripping
    performed by the pilot.

    Parameters
    ----------
    cmdline : `str`
        UTF-8 command line string.

    Returns
    -------
    hex : `str`
        Hex representation of the string.
    """
    return binascii.hexlify(cmdline.encode()).decode("utf-8")


def add_decoder_prefix(config, cmd_line, distribution_path, files):
    """Compose the command line sent to the pilot from the functional part
    (the actual software being run) and the middleware part (container
    invocation).

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration information.
    cmd_line : `str`
        UTF-8 based functional part of the command line.
    distribution_path : `str`
        URI of the path where all files are located for distribution.
    files : `tuple` [`dict` [`str`, `str`], `list` [`str`]]
        File names needed for a task (copied local, direct access).

    Returns
    -------
    decoder_prefix : `str`
        Full command line to be executed on the edge node.
    """
    # Manipulate file paths for placement on the command line.
    files_plc_hldr = {}
    for key, pfn in files[0].items():
        if pfn.endswith("/"):
            files_plc_hldr[key] = os.path.basename(pfn[:-1])
            isdir = True
        else:
            files_plc_hldr[key] = os.path.basename(pfn)
            _, extension = os.path.splitext(pfn)
            isdir = os.path.isdir(pfn) or (key == "butlerConfig" and extension != "yaml")
        if isdir:
            # The trailing slash is needed to make the isdir function work
            # properly in the ButlerURL instance on the edge node.
            files_plc_hldr[key] += "/"
        _LOG.debug("files_plc_hldr[%s] = %s", key, files_plc_hldr[key])

    cmdline_hex = convert_exec_string_to_hex(cmd_line)
    _, runner_command = config.search("runnerCommand", opt={"replaceEnvVars": False, "expandEnvVars": False})
    runner_command = runner_command.replace("\n", " ")
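    # Substitute the functional payload into the runnerCommand template; the
    # payload has the form:
    #   <hex cmdline> ${IN/L} <distribution path> <key:file>+... <direct-access files joined by "+">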
    decoder_prefix = runner_command.replace(
        "_cmd_line_",
        str(cmdline_hex)
        + " ${IN/L} "
        + distribution_path
        + " "
        + "+".join(f"{k}:{v}" for k, v in files_plc_hldr.items())
        + " "
        + "+".join(files[1]),
    )
    return decoder_prefix


def add_idds_work(config, generic_workflow, idds_workflow):
    """Convert GenericWorkflowJobs to iDDS work and add them to the iDDS
    workflow.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow containing jobs to convert.
    idds_workflow : `idds.workflowv2.workflow.Workflow`
        iDDS workflow to which the converted jobs should be added.

    Returns
    -------
    files_to_pre_stage : `dict` [`str`, `str`]
        Files that need to be copied to the staging area before submission.
    dag_sink_work : `list` [`idds.doma.workflowv2.domapandawork.DomaPanDAWork`]
        The work nodes in the client workflow which have no successors.
    task_count : `int`
        Number of tasks in the iDDS workflow used for unique task names.

    Raises
    ------
    RuntimeError
        Raised if dependency issues cannot be resolved after the recovery
        pass through the workflow.
    """
    # Limit the number of jobs in a single PanDA task.
    _, max_jobs_per_task = config.search("maxJobsPerTask", opt={"default": PANDA_DEFAULT_MAX_JOBS_PER_TASK})

    files_to_pre_stage = {}
    dag_sink_work = []  # Workflow sink nodes that need to be connected to the final task
    job_to_task = {}
    job_to_pseudo_filename = {}
    task_count = 0  # Task number/ID in the iDDS workflow used for unique names

    # To avoid dying while optimizing the number of passes through the
    # workflow, catch dependency issues and loop through them again later.
    jobs_with_dependency_issues = {}

    # Assume jobs with the same label share config values.
    for job_label in generic_workflow.labels:
        _LOG.debug("job_label = %s", job_label)
        # Add each job with a particular label to a corresponding PanDA task.
        # A PanDA task has a limit on the number of jobs, so break into
        # multiple PanDA tasks if needed.
        job_count = 0  # Number of jobs in the iDDS task used for task chunking
        task_chunk = 1  # Task chunk number within the job label used for unique names
        work = None

        # Instead of changing the code to make chunks up front and round-robin
        # assign jobs to chunks, for now keep chunk creation in the loop but
        # use knowledge of how many chunks there will be to set a better
        # maximum number of jobs in a chunk for a more even distribution.
        jobs_by_label = generic_workflow.get_jobs_by_label(job_label)
        num_chunks = -(-len(jobs_by_label) // max_jobs_per_task)  # ceil
        max_jobs_per_task_this_label = -(-len(jobs_by_label) // num_chunks)
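        # For example, 2500 jobs with maxJobsPerTask = 1000 yield 3 chunks of
        # at most 834 jobs each.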
        _LOG.debug(
            "For job_label = %s, num jobs = %s, num_chunks = %s, max_jobs = %s",
            job_label,
            len(jobs_by_label),
            num_chunks,
            max_jobs_per_task_this_label,
        )
        for gwjob in jobs_by_label:
            job_count += 1
            if job_count > max_jobs_per_task_this_label:
                job_count = 1
                task_chunk += 1

            if job_count == 1:
                # Create new PanDA task object
                task_count += 1
                work, files = _make_doma_work(config, generic_workflow, gwjob, task_count, task_chunk)
                files_to_pre_stage.update(files)
                idds_workflow.add_work(work)
                if generic_workflow.out_degree(gwjob.name) == 0:
                    dag_sink_work.append(work)

            pseudo_filename = _make_pseudo_filename(config, gwjob)
            job_to_pseudo_filename[gwjob.name] = pseudo_filename
            job_to_task[gwjob.name] = work.get_work_name()
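            # Collect this job's dependencies; each entry names the parent
            # task and the parent job's pseudo filename.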
            deps = []
            missing_deps = False
            for parent_job_name in generic_workflow.predecessors(gwjob.name):
                if parent_job_name not in job_to_task:
                    _LOG.debug("job_to_task.keys() = %s", job_to_task.keys())
                    missing_deps = True
                    break
                else:
                    deps.append(
                        {
                            "task": job_to_task[parent_job_name],
                            "inputname": job_to_pseudo_filename[parent_job_name],
                            "available": False,
                        }
                    )
            if not missing_deps:
                work.dependency_map.append({"name": pseudo_filename, "dependencies": deps})
            else:
                jobs_with_dependency_issues[gwjob.name] = work

    # If there were any issues figuring out dependencies in the earlier loop,
    # try again now that all tasks have been created.
    if jobs_with_dependency_issues:
        _LOG.warning("Could not prepare workflow in single pass. Please notify developers.")
        _LOG.info("Trying to recover...")
        for job_name, work in jobs_with_dependency_issues.items():
            deps = []
            for parent_job_name in generic_workflow.predecessors(job_name):
                if parent_job_name not in job_to_task:
                    _LOG.debug("job_to_task.keys() = %s", job_to_task.keys())
                    raise RuntimeError(
                        f"Could not recover from dependency issues ({job_name} missing {parent_job_name})."
                    )
                deps.append(
                    {
                        "task": job_to_task[parent_job_name],
                        "inputname": job_to_pseudo_filename[parent_job_name],
                        "available": False,
                    }
                )
            pseudo_filename = job_to_pseudo_filename[job_name]
            work.dependency_map.append({"name": pseudo_filename, "dependencies": deps})
        _LOG.info("Successfully recovered.")

    return files_to_pre_stage, dag_sink_work, task_count