Coverage for python/lsst/ctrl/bps/panda/utils.py: 11%
162 statements
coverage.py v6.5.0, created at 2023-02-15 02:53 -0800

# This file is part of ctrl_bps_panda.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

"""Utilities for bps PanDA plugin."""

__all__ = [
    "copy_files_for_distribution",
    "get_idds_client",
    "get_idds_result",
    "convert_exec_string_to_hex",
    "add_decoder_prefix",
]

import binascii
import concurrent.futures
import logging
import os

import idds.common.utils as idds_utils
import pandaclient.idds_api
from idds.doma.workflowv2.domapandawork import DomaPanDAWork
from idds.workflowv2.workflow import AndCondition
from lsst.ctrl.bps import BpsConfig, GenericWorkflow, GenericWorkflowJob
from lsst.ctrl.bps.panda.cmd_line_embedder import CommandLineEmbedder
from lsst.ctrl.bps.panda.constants import (
    PANDA_DEFAULT_CLOUD,
    PANDA_DEFAULT_CORE_COUNT,
    PANDA_DEFAULT_MAX_ATTEMPTS,
    PANDA_DEFAULT_MAX_JOBS_PER_TASK,
    PANDA_DEFAULT_MAX_WALLTIME,
    PANDA_DEFAULT_PRIORITY,
    PANDA_DEFAULT_PROCESSING_TYPE,
    PANDA_DEFAULT_PROD_SOURCE_LABEL,
    PANDA_DEFAULT_RSS,
    PANDA_DEFAULT_TASK_TYPE,
    PANDA_DEFAULT_VO,
)
from lsst.resources import ResourcePath

_LOG = logging.getLogger(__name__)


def copy_files_for_distribution(files_to_stage, file_distribution_uri, max_copy_workers):
    """Bring locally generated files into the cloud for further use
    on the edge nodes.

    Parameters
    ----------
    files_to_stage : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.
    file_distribution_uri : `str`
        Path to the storage accessible from the edge nodes, including the
        access protocol and bucket name, where the files should be placed.
    max_copy_workers : `int`
        Maximum number of workers for copying files.

    Raises
    ------
    RuntimeError
        Raised when there is an error copying files to the distribution
        point.
    """
    files_to_copy = {}

    # In case there are folders, iterate over their contents.
    for local_pfn in files_to_stage.values():
        folder_name = os.path.basename(os.path.normpath(local_pfn))
        if os.path.isdir(local_pfn):
            files_in_folder = ResourcePath.findFileResources([local_pfn])
            for file in files_in_folder:
                file_name = file.basename()
                files_to_copy[file] = ResourcePath(
                    os.path.join(file_distribution_uri, folder_name, file_name)
                )
        else:
            files_to_copy[ResourcePath(local_pfn)] = ResourcePath(
                os.path.join(file_distribution_uri, folder_name)
            )

    copy_executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_copy_workers)
    future_file_copy = []
    for src, trgt in files_to_copy.items():
        # The S3 client is instantiated explicitly here to work around
        # https://stackoverflow.com/questions/52820971/is-boto3-client-thread-safe
        trgt.exists()
        future_file_copy.append(copy_executor.submit(trgt.transfer_from, src, transfer="copy"))

    for future in concurrent.futures.as_completed(future_file_copy):
        if future.result() is not None:
            raise RuntimeError("Error placing files at the distribution point")
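

# Illustrative sketch only (not part of the plugin API): how a submit-side
# caller might stage files with copy_files_for_distribution.  The file names
# and S3 bucket URI below are hypothetical.
def _example_stage_files():
    files_to_stage = {
        "butlerConfig": "/tmp/submit/butler.yaml",
        "jobScript": "/tmp/submit/run_quantum.sh",
    }
    copy_files_for_distribution(files_to_stage, "s3://example-bucket/staging", max_copy_workers=10)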


def get_idds_client(config):
    """Get the iDDS client.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.

    Returns
    -------
    idds_client : `idds.client.clientmanager.ClientManager`
        iDDS ClientManager object.
    """
    idds_server = None
    if isinstance(config, BpsConfig):
        _, idds_server = config.search("iddsServer", opt={"default": None})
    elif isinstance(config, dict) and "iddsServer" in config:
        idds_server = config["iddsServer"]
    # If idds_server is None, a default value on the PanDA relay service
    # will be used.
    idds_client = pandaclient.idds_api.get_api(
        idds_utils.json_dumps, idds_host=idds_server, compress=True, manager=True
    )
    return idds_client
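

# Illustrative sketch only: get_idds_client accepts either a BpsConfig or a
# plain dict; when no "iddsServer" value is present, the default configured
# on the PanDA relay service is used.  The server URL below is hypothetical.
def _example_get_client():
    client = get_idds_client({"iddsServer": "https://idds.example.org:443/idds"})
    return client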


def get_idds_result(ret):
    """Parse the results returned from iDDS.

    Parameters
    ----------
    ret : `tuple` of (`int`, (`bool`, payload))
        ret[0] is the status of the PanDA relay service.
        ret[1][0] is the status of the iDDS service.
        ret[1][1] is the returned payload.
        If ret[1][0] is False, ret[1][1] can be error messages.

    Returns
    -------
    status : `bool`
        The status of the iDDS call.
    result : `int` or `list` or `dict`
        The result returned from iDDS.
    error : `str`
        Error messages.
    """
    # https://panda-wms.readthedocs.io/en/latest/client/rest_idds.html
    if not isinstance(ret, (list, tuple)) or ret[0] != 0:
        # Something wrong with the PanDA relay service.
        # The call may not be delivered to iDDS.
        status = False
        result = None
        error = f"PanDA relay service returns errors: {str(ret)}"
    else:
        if ret[1][0]:
            status = True
            result = ret[1][1]
            error = None
            if isinstance(result, str) and "Authentication no permission" in result:
                status = False
                error = result
                result = None
        else:
            # iDDS returns errors
            status = False
            result = None
            error = f"iDDS returns errors: {str(ret[1][1])}"
    return status, result, error
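

# Illustrative sketch only: the nested tuple returned through the PanDA relay
# service and how it maps onto (status, result, error).  The payloads are
# made-up examples.
def _example_parse_idds_result():
    ok = (0, (True, {"request_id": 1234}))
    assert get_idds_result(ok) == (True, {"request_id": 1234}, None)

    failed = (0, (False, "Missing proxy"))
    assert get_idds_result(failed) == (False, None, "iDDS returns errors: Missing proxy")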


def _make_pseudo_filename(config, gwjob):
    """Make the job pseudo filename.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job for which to create the pseudo filename.

    Returns
    -------
    pseudo_filename : `str`
        The pseudo filename for the given job.
    """
    cmd_line_embedder = CommandLineEmbedder(config)
    _, pseudo_filename = cmd_line_embedder.substitute_command_line(
        gwjob.executable.src_uri + " " + gwjob.arguments, gwjob.cmdvals, gwjob.name
    )
    return pseudo_filename


def _make_doma_work(config, generic_workflow, gwjob, task_count, task_chunk):
    """Make the DOMA Work object for a PanDA task.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow containing the job.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job representing the jobs for the PanDA task.
    task_count : `int`
        Count of PanDA tasks, used when making unique names.
    task_chunk : `int`
        Chunk number within the PanDA task, used when making unique names.

    Returns
    -------
    work : `idds.doma.workflowv2.domapandawork.DomaPanDAWork`
        The client representation of a PanDA task.
    local_pfns : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.
    """
    _LOG.debug("Using gwjob %s to create new PanDA task (gwjob=%s)", gwjob.name, gwjob)
    cvals = {"curr_cluster": gwjob.label}
    _, site = config.search("computeSite", opt={"curvals": cvals, "required": True})
    cvals["curr_site"] = site
    _, processing_type = config.search(
        "processing_type", opt={"curvals": cvals, "default": PANDA_DEFAULT_PROCESSING_TYPE}
    )
    _, task_type = config.search("taskType", opt={"curvals": cvals, "default": PANDA_DEFAULT_TASK_TYPE})
    _, prod_source_label = config.search(
        "prodSourceLabel", opt={"curvals": cvals, "default": PANDA_DEFAULT_PROD_SOURCE_LABEL}
    )
    _, vo = config.search("vo", opt={"curvals": cvals, "default": PANDA_DEFAULT_VO})

    _, file_distribution_end_point = config.search(
        "fileDistributionEndPoint", opt={"curvals": cvals, "default": None}
    )

    # Assume input files are the same across the task.
    local_pfns = {}
    direct_io_files = set()

    if gwjob.executable.transfer_executable:
        local_pfns["job_executable"] = gwjob.executable.src_uri
        job_executable = f"./{os.path.basename(gwjob.executable.src_uri)}"
    else:
        job_executable = gwjob.executable.src_uri
    cmd_line_embedder = CommandLineEmbedder(config)
    cmd_line, _ = cmd_line_embedder.substitute_command_line(
        job_executable + " " + gwjob.arguments, gwjob.cmdvals, gwjob.name
    )

    for gwfile in generic_workflow.get_job_inputs(gwjob.name, transfer_only=True):
        local_pfns[gwfile.name] = gwfile.src_uri
        if os.path.isdir(gwfile.src_uri):
            # A trailing slash is needed for the isdir function to work
            # properly in the ButlerURL instance on the edge node.
            local_pfns[gwfile.name] += "/"

        if gwfile.job_access_remote:
            direct_io_files.add(gwfile.name)

    if not direct_io_files:
        direct_io_files.add("cmdlineplaceholder")

    executable = add_decoder_prefix(
        config, cmd_line, file_distribution_end_point, (local_pfns, direct_io_files)
    )
    work = DomaPanDAWork(
        executable=executable,
        primary_input_collection={
            "scope": "pseudo_dataset",
            "name": f"pseudo_input_collection#{str(task_count)}",
        },
        output_collections=[
            {"scope": "pseudo_dataset", "name": f"pseudo_output_collection#{str(task_count)}"}
        ],
        log_collections=[],
        dependency_map=[],
        task_name=f"{generic_workflow.name}_{task_count:02d}_{gwjob.label}_{task_chunk:02d}",
        task_queue=gwjob.queue,
        task_log={
            "destination": "local",
            "value": "log.tgz",
            "dataset": "PandaJob_#{pandaid}/",
            "token": "local",
            "param_type": "log",
            "type": "template",
        },
        encode_command_line=True,
        task_rss=gwjob.request_memory if gwjob.request_memory else PANDA_DEFAULT_RSS,
        task_cloud=gwjob.compute_cloud if gwjob.compute_cloud else PANDA_DEFAULT_CLOUD,
        task_site=site,
        task_priority=int(gwjob.priority) if gwjob.priority else PANDA_DEFAULT_PRIORITY,
        core_count=gwjob.request_cpus if gwjob.request_cpus else PANDA_DEFAULT_CORE_COUNT,
        working_group=gwjob.accounting_group,
        processing_type=processing_type,
        task_type=task_type,
        prodSourceLabel=prod_source_label,
        vo=vo,
        maxattempt=gwjob.number_of_retries if gwjob.number_of_retries else PANDA_DEFAULT_MAX_ATTEMPTS,
        maxwalltime=gwjob.request_walltime if gwjob.request_walltime else PANDA_DEFAULT_MAX_WALLTIME,
    )
    return work, local_pfns


def add_final_idds_work(
    config, generic_workflow, idds_client_workflow, dag_sink_work, task_count, task_chunk
):
    """Add the special final PanDA task to the client workflow.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow in which to find the final job.
    idds_client_workflow : `idds.workflowv2.workflow.Workflow`
        iDDS client representation of the workflow to which the final task
        is added.
    dag_sink_work : `list` [`idds.doma.workflowv2.domapandawork.DomaPanDAWork`]
        The work nodes in the client workflow which have no successors.
    task_count : `int`
        Count of PanDA tasks, used when making unique names.
    task_chunk : `int`
        Chunk number within the PanDA task, used when making unique names.

    Returns
    -------
    files : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.

    Raises
    ------
    NotImplementedError
        Raised if the final job in the GenericWorkflow is itself a workflow.
    TypeError
        Raised if the final job in the GenericWorkflow has an invalid type.
    """
    files = {}

    # If a final job exists in the generic workflow, create a DAG final job.
    final = generic_workflow.get_final()
    if final:
        if isinstance(final, GenericWorkflow):
            raise NotImplementedError("PanDA plugin does not support a workflow as the final job")

        if not isinstance(final, GenericWorkflowJob):
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

        dag_final_work, files = _make_doma_work(
            config,
            generic_workflow,
            final,
            task_count,
            task_chunk,
        )
        pseudo_filename = "pure_pseudoinput+qgraphNodeId:+qgraphId:"
        dag_final_work.dependency_map.append(
            {"name": pseudo_filename, "submitted": False, "dependencies": []}
        )
        idds_client_workflow.add_work(dag_final_work)
        conditions = []
        for work in dag_sink_work:
            conditions.append(work.is_terminated)
        and_cond = AndCondition(conditions=conditions, true_works=[dag_final_work])
        idds_client_workflow.add_condition(and_cond)
    else:
        _LOG.debug("No final job in GenericWorkflow")
    return files


def convert_exec_string_to_hex(cmdline):
    """Convert the command line into a hex representation.

    This step is currently needed because large blocks of command line text,
    including special symbols, are passed to the pilot/container.  Hex
    encoding guarantees a one-to-one match and bypasses the special-symbol
    stripping performed by the Pilot.

    Parameters
    ----------
    cmdline : `str`
        UTF-8 command line string.

    Returns
    -------
    hex : `str`
        Hex representation of the string.
    """
    return binascii.hexlify(cmdline.encode()).decode("utf-8")
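

# Illustrative sketch only: hex encoding is reversible, so the decoder on the
# edge node can recover the exact command line.  The command below is a
# made-up example.
def _example_hex_round_trip():
    cmd = "pipetask run -b repo --output-run run/1 'quoted; chars & symbols'"
    encoded = convert_exec_string_to_hex(cmd)
    assert binascii.unhexlify(encoded).decode("utf-8") == cmd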


def add_decoder_prefix(config, cmd_line, distribution_path, files):
    """Compose the command line sent to the pilot from the functional part
    (the actual software being run) and the middleware part (container
    invocation).

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration information.
    cmd_line : `str`
        UTF-8 based functional part of the command line.
    distribution_path : `str`
        URI of the path where all files are located for distribution.
    files : `tuple` [`dict` [`str`, `str`], `list` [`str`]]
        File names needed for a task (copied locally, direct access).

    Returns
    -------
    decoder_prefix : `str`
        Full command line to be executed on the edge node.
    """
    # Manipulate file paths for placement on the command line.
    files_plc_hldr = {}
    for key, pfn in files[0].items():
        if pfn.endswith("/"):
            files_plc_hldr[key] = os.path.basename(pfn[:-1])
            isdir = True
        else:
            files_plc_hldr[key] = os.path.basename(pfn)
            _, extension = os.path.splitext(pfn)
            isdir = os.path.isdir(pfn) or (key == "butlerConfig" and extension != ".yaml")
        if isdir:
            # A trailing slash is needed for the isdir function to work
            # properly in the ButlerURL instance on the edge node.
            files_plc_hldr[key] += "/"
        _LOG.debug("files_plc_hldr[%s] = %s", key, files_plc_hldr[key])

    cmdline_hex = convert_exec_string_to_hex(cmd_line)
    _, decoder_prefix = config.search("runnerCommand", opt={"replaceEnvVars": False, "expandEnvVars": False})
    decoder_prefix = decoder_prefix.replace(
        "_cmd_line_",
        str(cmdline_hex)
        + " ${IN/L} "
        + distribution_path
        + " "
        + "+".join(f"{k}:{v}" for k, v in files_plc_hldr.items())
        + " "
        + "+".join(files[1]),
    )
    return decoder_prefix
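

# Illustrative sketch only: how the `_cmd_line_` placeholder in a hypothetical
# runnerCommand setting is expanded.  The config value, file names, and
# staging URI are made up; in practice runnerCommand comes from the BPS
# submit configuration.
def _example_decoder_prefix():
    config = BpsConfig({"runnerCommand": "bash ./decoder.sh _cmd_line_"})
    files = ({"butlerConfig": "/tmp/submit/butler.yaml"}, ["calexp"])
    prefix = add_decoder_prefix(config, "pipetask run ...", "s3://example-bucket/staging", files)
    # prefix is roughly:
    #   bash ./decoder.sh <hex cmd line> ${IN/L} s3://example-bucket/staging butlerConfig:butler.yaml calexp
    return prefix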


def add_idds_work(config, generic_workflow, idds_workflow):
    """Convert GenericWorkflowJobs to iDDS work and add them to the iDDS
    workflow.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow containing jobs to convert.
    idds_workflow : `idds.workflowv2.workflow.Workflow`
        iDDS workflow to which the converted jobs should be added.

    Returns
    -------
    files_to_pre_stage : `dict` [`str`, `str`]
        Files that need to be copied to the staging area before submission.
    dag_sink_work : `list` [`idds.doma.workflowv2.domapandawork.DomaPanDAWork`]
        The work nodes in the client workflow which have no successors.
    task_count : `int`
        Number of tasks in the iDDS workflow, used for unique task names.
    """
    # Limit number of jobs in a single PanDA task.
    _, max_jobs_per_task = config.search("maxJobsPerTask", opt={"default": PANDA_DEFAULT_MAX_JOBS_PER_TASK})

    files_to_pre_stage = {}
    dag_sink_work = []  # Workflow sink nodes that need to be connected to final task
    job_to_task = {}
    job_to_pseudo_filename = {}
    task_count = 0  # Task number/ID in idds workflow used for unique name

    # Assume jobs with same label share config values.
    for job_label in generic_workflow.labels:
        _LOG.debug("job_label = %s", job_label)
        # Add each job with a particular label to a corresponding PanDA task.
        # A PanDA task has a limit on number of jobs, so break into multiple
        # PanDA tasks if needed.
        job_count = 0  # Number of jobs in idds task used for task chunking
        task_chunk = 1  # Task chunk number within job label used for unique name
        work = None
        for gwjob in generic_workflow.get_jobs_by_label(job_label):
            job_count += 1
            if job_count >= max_jobs_per_task:
                job_count = 1
                task_chunk += 1

            if job_count == 1:
                # Create new PanDA task object.
                task_count += 1
                work, files = _make_doma_work(config, generic_workflow, gwjob, task_count, task_chunk)
                files_to_pre_stage.update(files)
                if generic_workflow.out_degree(gwjob.name) == 0:
                    dag_sink_work.append(work)

            pseudo_filename = _make_pseudo_filename(config, gwjob)
            job_to_pseudo_filename[gwjob.name] = pseudo_filename
            job_to_task[gwjob.name] = work.get_work_name()
            deps = []
            for parent_job_name in generic_workflow.predecessors(gwjob.name):
                if parent_job_name not in job_to_task:
                    _LOG.debug("job_to_task.keys() = %s", job_to_task.keys())
                deps.append(
                    {
                        "task": job_to_task[parent_job_name],
                        "inputname": job_to_pseudo_filename[parent_job_name],
                        "available": False,
                    }
                )
            work.dependency_map.append({"name": pseudo_filename, "dependencies": deps})
            idds_workflow.add_work(work)
    return files_to_pre_stage, dag_sink_work, task_count
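

# Illustrative sketch only: how the helpers in this module fit together at
# submission time.  It assumes `config` and `generic_workflow` have already
# been populated by the BPS framework; the staging URI below is hypothetical.
def _example_build_idds_workflow(config, generic_workflow):
    from idds.workflowv2.workflow import Workflow

    idds_workflow = Workflow()
    files, dag_sink_work, task_count = add_idds_work(config, generic_workflow, idds_workflow)
    files.update(
        add_final_idds_work(config, generic_workflow, idds_workflow, dag_sink_work, task_count + 1, 1)
    )
    copy_files_for_distribution(files, "s3://example-bucket/staging", max_copy_workers=10)
    return idds_workflow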