Coverage for python/lsst/ctrl/bps/wms/pegasus/pegasus_service.py: 3%
Shortcuts on this page:
r m x p   toggle line displays
j k       next/prev highlighted chunk
0 (zero)  top of page
1 (one)   first highlighted chunk
1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
22"""Support for using Pegasus WMS.
23"""
25__all__ = ["PegasusService", "PegasusWorkflow"]
28import copy
29import logging
30import os
31import re
32import shlex
33import shutil
34import subprocess
36from Pegasus.catalogs import replica_catalog, sites_catalog, transformation_catalog
37from Pegasus.DAX3 import ADAG, PFN, Executable, File, Job, Link, Namespace, Profile
39from ... import BaseWmsService, BaseWmsWorkflow
40from ...bps_utils import chdir
41from ..htcondor import HTCondorService, htc_write_attribs
43_LOG = logging.getLogger(__name__)
class PegasusService(BaseWmsService):
    """Pegasus version of workflow engine.

    Listing, reporting, and cancellation of HTCondor-visible jobs are
    delegated to the HTCondor plugin, since Pegasus plans workflows into
    HTCondor DAGs.
    """

    def prepare(self, config, generic_workflow, out_prefix=None):
        """Create submission for a generic workflow in a specific WMS.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The generic workflow (e.g., has executable name and arguments).
        out_prefix : `str`, optional
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        peg_workflow : `lsst.ctrl.bps.wms.pegasus.PegasusWorkflow`
            A workflow ready for Pegasus to run.
        """
        service_class = f"{self.__class__.__module__}.{self.__class__.__name__}"
        peg_workflow = PegasusWorkflow.from_generic_workflow(
            config, generic_workflow, out_prefix, service_class
        )
        peg_workflow.write(out_prefix)
        peg_workflow.run_pegasus_plan(out_prefix, generic_workflow.run_attrs)
        return peg_workflow

    def submit(self, workflow):
        """Submit a single WMS workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
            A single, already-planned Pegasus workflow to submit (must have
            a valid ``run_id`` from pegasus-plan).

        Raises
        ------
        RuntimeError
            If pegasus-run exits with a non-zero exit code.
        """
        with chdir(workflow.submit_path):
            _LOG.info("Submitting from directory: %s", os.getcwd())
            command = f"pegasus-run {workflow.run_id}"
            # Capture all pegasus-run output in a file for later debugging.
            with open(f"{workflow.name}_pegasus-run.out", "w") as outfh:
                process = subprocess.Popen(
                    shlex.split(command), shell=False, stdout=outfh, stderr=subprocess.STDOUT
                )
                process.wait()

        if process.returncode != 0:
            raise RuntimeError("pegasus-run exited with non-zero exit code (%s)" % process.returncode)

        # Note:
        #
        # No need to save run id as the same as the run id generated when
        # running pegasus-plan earlier.

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=True):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.
        is_global : `bool`, optional
            If set (the default), all job queues (and their histories) will
            be queried for job information; otherwise only the local job
            queue will be queried.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions. Typically
            this means top-level jobs (i.e., not children jobs).
        """
        # Pegasus runs on HTCondor, so query via the HTCondor plugin.
        htc_service = HTCondorService(self.config)
        return htc_service.list_submitted_jobs(wms_id, user, require_bps, pass_thru, is_global)

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=True):
        """Query WMS for status of submitted WMS workflows.

        Parameters
        ----------
        wms_workflow_id : `int` or `str`, optional
            Id that can be used by WMS service to look up status.
        user : `str`, optional
            Limit report to submissions by this particular user.
        hist : `int`, optional
            Number of days to expand report to include finished WMS workflows.
        pass_thru : `str`, optional
            Additional arguments to pass through to the specific WMS service.
        is_global : `bool`, optional
            If set (the default), all job queues (and their histories) will
            be queried for job information; otherwise only the local job
            queue will be queried.

        Returns
        -------
        run_reports : `list` [`lsst.ctrl.bps.BaseWmsReport`]
            Status information for submitted WMS workflows.
        message : `str`
            Message to user on how to find more status information specific
            to WMS.
        """
        # Pegasus runs on HTCondor, so query via the HTCondor plugin.
        htc_service = HTCondorService(self.config)
        return htc_service.report(wms_workflow_id, user, hist, pass_thru, is_global)

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            ID or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether successful deletion or not. Currently, if any doubt or
            any individual jobs not deleted, return False.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        _LOG.debug("Canceling wms_id = %s", wms_id)

        # If wms_id is a numeric HTCondor id, use HTCondor plugin to delete;
        # otherwise treat it as a Pegasus run directory/handle.
        try:
            float(wms_id)
            htc_service = HTCondorService(self.config)
            deleted, message = htc_service.cancel(wms_id, pass_thru)
        except ValueError:
            command = f"pegasus-remove {wms_id}"
            _LOG.debug(command)
            completed_process = subprocess.run(
                shlex.split(command),
                shell=False,
                check=False,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
            )
            _LOG.debug(completed_process.stdout)
            _LOG.debug("Return code = %s", completed_process.returncode)

            if completed_process.returncode != 0:
                deleted = False
                # Exit status 443 from pegasus-remove indicates the job is
                # not in the batch queue; search anywhere in the output
                # (re.match would only find it at the very beginning).
                if re.search(b"443", completed_process.stdout):
                    message = "no such bps job in batch queue"
                else:
                    message = f"pegasus-remove exited with non-zero exit code {completed_process.returncode}"
                    _LOG.debug("pegasus-remove stdout: %s", completed_process.stdout.decode())
            else:
                deleted = True
                # Bug fix: message must be defined on the success path too,
                # since both values are returned below.
                message = ""

        return deleted, message
class PegasusWorkflow(BaseWmsWorkflow):
    """Single Pegasus Workflow.

    Parameters
    ----------
    name : `str`
        Name of workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """

    def __init__(self, name, config):
        # Base class sets config, run_id, submit_path.
        super().__init__(name, config)
        self.dax = ADAG(name)
        self.run_attrs = None

        self.replica_catalog = None
        self.sites_catalog = None
        self.transformation_catalog = None
        self._init_catalogs()
        self.properties_filename = None
        self.dax_filename = None

    def _init_catalogs(self):
        """Create replica and transformation catalogs unless user-supplied
        files were given in the config (``rcFile``/``tcFile``).

        Workdir in catalogs is set at write time, so pass None as value here.
        """
        # Replica Catalog keeps mappings of logical file ids/names (LFN's) to
        # physical file ids/names (PFN's).
        if "rcFile" not in self.config:
            fname = "rc.txt"
            self.replica_catalog = replica_catalog.ReplicaCatalog(None, fname)

        # Transformation Catalog describes all of the executables
        # (called "transformations") used by the workflow.
        if "tcFile" not in self.config:
            fname = "tc.txt"
            self.transformation_catalog = transformation_catalog.TransformationCatalog(None, fname)

        # Note:
        #
        # SitesCatalog needs workdir at initialization to create local site
        # for submit side directory where the output data from the workflow
        # will be stored.  So delaying creation of SitesCatalog until the
        # write function is called with a given output directory.

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited.
        peg_workflow = cls(generic_workflow.name, config)
        peg_workflow.run_attrs = copy.deepcopy(generic_workflow.run_attrs)
        peg_workflow.run_attrs["bps_wms_service"] = service_class
        peg_workflow.run_attrs["bps_wms_workflow"] = f"{cls.__module__}.{cls.__name__}"

        # Create initial Pegasus File objects for all files that WMS must
        # handle.
        peg_files = {}
        for gwf_file in generic_workflow.get_files(data=True, transfer_only=True):
            if gwf_file.wms_transfer:
                peg_file = File(gwf_file.name)
                peg_file.addPFN(PFN(f"file://{gwf_file.src_uri}", "local"))
                peg_files[gwf_file.name] = peg_file

        # Add jobs to the DAX.
        for job_name in generic_workflow:
            gwf_job = generic_workflow.get_job(job_name)
            job = peg_workflow.create_job(generic_workflow, gwf_job, peg_files)
            peg_workflow.dax.addJob(job)

        # Add job dependencies to the DAX.
        for job_name in generic_workflow:
            for child_name in generic_workflow.successors(job_name):
                peg_workflow.dax.depends(
                    parent=peg_workflow.dax.getJob(job_name), child=peg_workflow.dax.getJob(child_name)
                )

        return peg_workflow

    def create_job(self, generic_workflow, gwf_job, peg_files):
        """Create a Pegasus job corresponding to the given GenericWorkflow job.

        Parameters
        ----------
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow that is being converted.
        gwf_job : `lsst.ctrl.bps.GenericWorkflowJob`
            The generic job to convert to a Pegasus job.
        peg_files : `dict` [`str`, `Pegasus.DAX3.File`]
            Pegasus Files needed when creating Pegasus Job.

        Returns
        -------
        job : `Pegasus.DAX3.Job`
            Pegasus job created from the generic workflow job.

        Notes
        -----
        https://pegasus.isi.edu/documentation/reference-guide/variable
        -expansion.html Says that ${VAR} gets expanded with submit side
        values during pegasus-plan. If try $VAR (which isn't supposed to get
        expanded by pegasus-plan), the environment variable (e.g.,
        ${CTRL_MPEXEC_DIR} gets completely dropped from the executable path
        and job dies because cannot find executable (/bin/pipetask).

        So, currently Pegasus plugin only works if environment variables used
        in commands are same on submit machine and compute machine.
        """
        _LOG.debug("GenericWorkflowJob=%s", gwf_job)

        # Save transformation.
        executable = Executable(gwf_job.executable.name, installed=not gwf_job.executable.transfer_executable)
        # Convert bps <ENV:VAR> placeholders to Pegasus ${VAR} syntax.
        newexec = re.sub(r"<ENV:([^>]+)>", r"${\1}", gwf_job.executable.src_uri)
        _LOG.debug("Executable after replacing any environment variables = %s", newexec)
        executable.addPFN(PFN(f"file://{newexec}", gwf_job.compute_site))
        # The catalog is None when a user-supplied tcFile is used; the file
        # is copied verbatim at write time in that case.
        if self.transformation_catalog is not None:
            self.transformation_catalog.add(executable)

        # Create Pegasus Job.
        job = Job(gwf_job.executable.name, id=gwf_job.name, node_label=gwf_job.label)

        if gwf_job.arguments:
            arguments = gwf_job.arguments
            # Replace command variables
            arguments = arguments.format(**gwf_job.cmdvals)

            # Replace env vars
            arguments = re.sub(r"<ENV:([^>]+)>", r"${\1}", arguments)
            _LOG.debug("Arguments after replacing any environment variables = %s", arguments)

            # Replace file placeholders
            arguments = re.sub(r"<FILE:([^>]+)>", r"\1", arguments)
            _LOG.debug("Command line arguments: %s", arguments)

            # Break up command string into separate args for Pegasus Job
            # object replacing file names with Pegasus File objects.
            args = arguments.split()
            logical_file_names = list(set(peg_files) & set(args))
            if logical_file_names:
                indices = [args.index(lfn) for lfn in logical_file_names]
                for idx, lfn in zip(indices, logical_file_names):
                    args[idx] = peg_files[lfn]
            job.addArguments(*args)
        else:
            _LOG.warning("Job %s does not have any arguments", gwf_job.name)

        if gwf_job.request_memory:  # MB
            job.addProfile(Profile(Namespace.CONDOR, "request_memory", gwf_job.request_memory))
        if gwf_job.request_cpus:  # cores
            job.addProfile(Profile(Namespace.CONDOR, "request_cpus", gwf_job.request_cpus))
        if gwf_job.request_disk:  # MB
            job.addProfile(Profile(Namespace.CONDOR, "request_disk", gwf_job.request_disk))
        if gwf_job.priority:
            job.addProfile(Profile(Namespace.CONDOR, "priority", gwf_job.priority))

        # Add extra job attributes
        for key, value in gwf_job.profile.items():
            job.addProfile(Profile(Namespace.CONDOR, key, value))

        for key, value in gwf_job.environment.items():
            job.addProfile(Profile(Namespace.ENV, key, value))

        # Add run attributes
        for key, value in self.run_attrs.items():
            job.addProfile(Profile(Namespace.CONDOR, key=f"+{key}", value=f'"{value}"'))

        for key, value in gwf_job.attrs.items():
            _LOG.debug("create_job: attrs = %s", gwf_job.attrs)
            job.addProfile(Profile(Namespace.CONDOR, key=f"+{key}", value=f'"{value}"'))

        job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_name", value=f'"{gwf_job.name}"'))
        job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_label", value=f'"{gwf_job.label}"'))
        if "quanta_summary" in gwf_job.tags:
            job.addProfile(
                Profile(
                    Namespace.CONDOR, key="+bps_job_quanta", value=f"\"{gwf_job.tags['quanta_summary']}\""
                )
            )

        # Specify job's inputs.
        for gwf_file in generic_workflow.get_job_inputs(gwf_job.name, data=True, transfer_only=True):
            peg_file = peg_files[gwf_file.name]
            job.uses(peg_file, link=Link.INPUT)
            if self.replica_catalog is not None:
                for pfn in peg_file.pfns:
                    self.replica_catalog.add(peg_file.name, pfn.url, pfn.site)

        # Specify job's outputs
        for gwf_file in generic_workflow.get_job_outputs(gwf_job.name, data=True, transfer_only=True):
            peg_file = peg_files[gwf_file.name]
            job.uses(peg_file, link=Link.OUTPUT)
            if self.replica_catalog is not None:
                for pfn in peg_file.pfns:
                    self.replica_catalog.add(peg_file.name, pfn.url, pfn.site)

        return job

    def _define_sites(self, out_prefix):
        """Create Pegasus Site Catalog.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for the site catalog file.

        Notes
        -----
        SitesCatalog needs workdir at initialization to create local site for
        submit side directory where the output data from the workflow will be
        stored.
        """
        self.sites_catalog = sites_catalog.SitesCatalog(out_prefix, f"{self.name}_sites.xml")

        # Adding information for all sites defined in config instead of
        # limiting to those actually used by the workflow.
        for site, site_data in self.config["site"].items():
            self.sites_catalog.add_site(site, arch=site_data["arch"], os=site_data["os"])
            if "directory" in site_data:
                # Workaround because no Python API
                dir_dict = {}
                for site_dir in site_data["directory"]:
                    dir_dict[site_dir] = {"path": site_data["directory"][site_dir]["path"]}
                self.sites_catalog._sites[site]["directories"] = dir_dict

            # Add config provided site attributes.
            if "profile" in site_data:
                for pname, pdata in site_data["profile"].items():
                    for key, val in pdata.items():
                        self.sites_catalog.add_site_profile(site, namespace=pname, key=key, value=val)
            self.sites_catalog.add_site_profile(
                site, namespace=Namespace.DAGMAN, key="NODE_STATUS_FILE", value=f"{self.name}.node_status"
            )

    def write(self, out_prefix):
        """Write Pegasus Catalogs and DAX to files.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for all the Pegasus workflow files.
        """
        self.submit_path = out_prefix

        # Filenames needed for properties file.
        filenames = {}

        # Write down the workflow in DAX format.
        self.dax_filename = f"{self.dax.name}.dax"
        if out_prefix is not None:
            os.makedirs(out_prefix, exist_ok=True)
            self.dax_filename = os.path.join(out_prefix, self.dax_filename)
        with open(self.dax_filename, "w") as outfh:
            self.dax.writeXML(outfh)

        # Output site catalog.
        filename = f"{self.name}_sites.xml"
        if "scFile" not in self.config:
            self._define_sites(out_prefix)
            self.sites_catalog.workflow_dir = out_prefix
            self.sites_catalog.filename = filename
            self.sites_catalog.write()
        else:
            # Bug fix: copy the user-supplied file from the same config key
            # the guard checks ("scFile", not "sitesFile").
            shutil.copy(self.config["scFile"], os.path.join(self.submit_path, filename))
        filenames["sites"] = filename

        # Output transformation catalog.
        filename = f"{self.name}_tc.txt"
        if self.transformation_catalog is not None:
            self.transformation_catalog.workflow_dir = out_prefix
            self.transformation_catalog.filename = filename
            self.transformation_catalog.write()
        else:
            shutil.copy(self.config["tcFile"], os.path.join(self.submit_path, filename))
        filenames["transformation"] = filename

        # Output replica catalog.
        filename = f"{self.name}_rc.txt"
        if self.replica_catalog is not None:
            self.replica_catalog.workflow_dir = out_prefix
            self.replica_catalog.filename = filename
            self.replica_catalog.write()
        else:
            # Bug fix: the catalog is None exactly when a user-supplied
            # rcFile exists; copy rcFile, not tcFile.
            shutil.copy(self.config["rcFile"], os.path.join(self.submit_path, filename))
        filenames["replica"] = filename

        self.properties_filename = self._write_properties_file(out_prefix, filenames)

    def run_pegasus_plan(self, out_prefix, run_attr):
        """Execute pegasus-plan to convert DAX to HTCondor DAG for submission.

        Parameters
        ----------
        out_prefix : `str`
            Root directory in which to output all files.
        run_attr : `dict`
            Attributes to add to main DAG.

        Raises
        ------
        RuntimeError
            If pegasus-plan exits with a non-zero exit code.
        """
        cmd = (
            f"pegasus-plan --verbose --conf {self.properties_filename} --dax {self.dax_filename} --dir "
            f"{out_prefix}/peg --cleanup none --sites {self.config['computeSite']} "
            f"--input-dir {out_prefix}/input --output-dir {out_prefix}/output"
        )
        _LOG.debug("Plan command: %s", cmd)
        pegout = f"{self.submit_path}/{self.name}_pegasus-plan.out"
        with chdir(self.submit_path):
            _LOG.debug("pegasus-plan in directory: %s", os.getcwd())
            _LOG.debug("pegasus-plan output in %s", pegout)
            with open(pegout, "w") as pegfh:
                print(f"Command: {cmd}\n", file=pegfh)  # Note: want blank line
                process = subprocess.run(
                    shlex.split(cmd), shell=False, stdout=pegfh, stderr=subprocess.STDOUT, check=False
                )
                if process.returncode != 0:
                    print(f"Error trying to generate Pegasus files. See {pegout}.")
                    raise RuntimeError(f"pegasus-plan exited with non-zero exit code ({process.returncode})")

            # Grab run id from pegasus-plan output and save.
            # NOTE(review): if no "pegasus-run <id>" line appears in the
            # output, self.run_id keeps its prior value (presumably None).
            with open(pegout, "r") as pegfh:
                for line in pegfh:
                    match = re.search(r"pegasus-run\s+(\S+)", line)
                    if match:
                        self.run_id = match.group(1)
                        break

        # Hack - Using profile in sites.xml doesn't add run attributes to DAG
        # submission file.  So adding them here by rewriting the .condor.sub
        # file with the attributes inserted just before the "queue" line.
        if run_attr is not None:
            subname = f"{self.run_id}/{self.name}-0.dag.condor.sub"
            shutil.copyfile(subname, subname + ".orig")
            with open(subname + ".orig", "r") as infh:
                with open(subname, "w") as outfh:
                    for line in infh:
                        line = line.strip()
                        if line == "queue":
                            htc_write_attribs(outfh, run_attr)
                            htc_write_attribs(outfh, {"bps_job_label": "DAG"})
                        print(line, file=outfh)

    def _write_properties_file(self, out_prefix, filenames):
        """Write Pegasus Properties File.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for properties file.
        filenames : `dict` [`str`, `str`]
            Mapping of Pegasus file keys to filenames.

        Returns
        -------
        properties : `str`
            Filename of the pegasus properties file.
        """
        properties = f"{self.name}_pegasus.properties"
        if out_prefix is not None:
            properties = os.path.join(out_prefix, properties)
        with open(properties, "w") as outfh:
            print("# This tells Pegasus where to find the Site Catalog.", file=outfh)
            print(f"pegasus.catalog.site.file={filenames['sites']}", file=outfh)

            print("# This tells Pegasus where to find the Replica Catalog.", file=outfh)
            print(f"pegasus.catalog.replica.file={filenames['replica']}", file=outfh)

            print("# This tells Pegasus where to find the Transformation Catalog.", file=outfh)
            print("pegasus.catalog.transformation=Text", file=outfh)
            print(f"pegasus.catalog.transformation.file={filenames['transformation']}", file=outfh)

            print("# Run Pegasus in shared file system mode.", file=outfh)
            print("pegasus.data.configuration=sharedfs", file=outfh)

            print("# Make Pegasus use links instead of transferring files.", file=outfh)
            print("pegasus.transfer.*.impl=Transfer", file=outfh)
            print("pegasus.transfer.links=true", file=outfh)

        return properties